Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |   18
-rw-r--r--  mm/Makefile         |    3
-rw-r--r--  mm/bootmem.c        |    2
-rw-r--r--  mm/bounce.c         |    2
-rw-r--r--  mm/fadvise.c        |    2
-rw-r--r--  mm/filemap.c        |  297
-rw-r--r--  mm/fremap.c         |   27
-rw-r--r--  mm/highmem.c        |    5
-rw-r--r--  mm/hugetlb.c        |  124
-rw-r--r--  mm/internal.h       |  160
-rw-r--r--  mm/memcontrol.c     |  481
-rw-r--r--  mm/memory.c         |  140
-rw-r--r--  mm/memory_hotplug.c |   22
-rw-r--r--  mm/mempolicy.c      |   29
-rw-r--r--  mm/migrate.c        |  281
-rw-r--r--  mm/mlock.c          |  439
-rw-r--r--  mm/mmap.c           |   82
-rw-r--r--  mm/mmzone.c         |    2
-rw-r--r--  mm/mremap.c         |    8
-rw-r--r--  mm/nommu.c          |   47
-rw-r--r--  mm/oom_kill.c       |    3
-rw-r--r--  mm/page-writeback.c |   22
-rw-r--r--  mm/page_alloc.c     |  144
-rw-r--r--  mm/page_cgroup.c    |  256
-rw-r--r--  mm/page_isolation.c |   15
-rw-r--r--  mm/pdflush.c        |    2
-rw-r--r--  mm/readahead.c      |    4
-rw-r--r--  mm/rmap.c           |  319
-rw-r--r--  mm/shmem.c          |   20
-rw-r--r--  mm/slab.c           |   52
-rw-r--r--  mm/slob.c           |    8
-rw-r--r--  mm/slub.c           |   30
-rw-r--r--  mm/sparse-vmemmap.c |    2
-rw-r--r--  mm/swap.c           |  172
-rw-r--r--  mm/swap_state.c     |   11
-rw-r--r--  mm/swapfile.c       |   27
-rw-r--r--  mm/tiny-shmem.c     |   27
-rw-r--r--  mm/truncate.c       |    6
-rw-r--r--  mm/vmalloc.c        | 1049
-rw-r--r--  mm/vmscan.c         |  994
-rw-r--r--  mm/vmstat.c         |  102
41 files changed, 4010 insertions, 1426 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0bd9c2dbb2a0..5b5790f8a816 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT
 # with gcc 3.4 and later.
 #
 config SPARSEMEM_STATIC
-	def_bool n
+	bool
 
 #
 # Architecture platforms which require a two level mem_section in SPARSEMEM
@@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
 
 config SPARSEMEM_VMEMMAP_ENABLE
-	def_bool n
+	bool
 
 config SPARSEMEM_VMEMMAP
 	bool "Sparse Memory virtual memmap"
@@ -187,6 +187,9 @@ config RESOURCES_64BIT
 	help
 	  This option allows memory and IO resources to be 64 bit.
 
+config PHYS_ADDR_T_64BIT
+	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
+
 config ZONE_DMA_FLAG
 	int
 	default "0" if !ZONE_DMA
@@ -206,5 +209,16 @@ config VIRT_TO_BUS
 	def_bool y
 	depends on !ARCH_NO_VIRT_TO_BUS
 
+config UNEVICTABLE_LRU
+	bool "Add LRU list to track non-evictable pages"
+	default y
+	depends on MMU
+	help
+	  Keeps unevictable pages off of the active and inactive pageout
+	  lists, so kswapd will not waste CPU time or have its balancing
+	  algorithms thrown off by scanning these pages.  Selecting this
+	  will use one page flag and increase the code size a little,
+	  say Y unless you know what you are doing.
+
 config MMU_NOTIFIER
 	bool
diff --git a/mm/Makefile b/mm/Makefile
index da4ccf015aea..c06b45a1ff5f 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
-obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o
+obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
-
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ad8eec6e44a8..ac5a891f142a 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup);
 	if (unlikely(bootmem_debug))		\
 		printk(KERN_INFO		\
 			"bootmem::%s " fmt,	\
-			__FUNCTION__, ## args);	\
+			__func__, ## args);	\
 })
 
 static unsigned long __init bootmap_bytes(unsigned long pages)
diff --git a/mm/bounce.c b/mm/bounce.c
index b6d2d0f1019b..06722c403058 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -267,7 +267,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	/*
 	 * Data-less bio, nothing to bounce
 	 */
-	if (bio_empty_barrier(*bio_orig))
+	if (!bio_has_data(*bio_orig))
 		return;
 
 	/*
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 343cfdfebd9e..a1da969bd980 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2002, Linus Torvalds
  *
- * 11Jan2003	akpm@digeo.com
+ * 11Jan2003	Andrew Morton
  *		Initial version.
  */
 
diff --git a/mm/filemap.c b/mm/filemap.c
index 876bc595d0f8..f3e5f8944d17 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
 #include <linux/memcontrol.h>
+#include <linux/mm_inline.h> /* for page_is_file_cache() */
 #include "internal.h"
 
 /*
@@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
-	mem_cgroup_uncharge_cache_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
 	__dec_zone_page_state(page, NR_FILE_PAGES);
 	BUG_ON(page_mapped(page));
+	mem_cgroup_uncharge_cache_page(page);
 
 	/*
 	 * Some filesystems seem to re-dirty the page even after
@@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked);
 int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
 				pgoff_t offset, gfp_t gfp_mask)
 {
-	int ret = add_to_page_cache(page, mapping, offset, gfp_mask);
-	if (ret == 0)
-		lru_cache_add(page);
+	int ret;
+
+	/*
+	 * Splice_read and readahead add shmem/tmpfs pages into the page cache
+	 * before shmem_readpage has a chance to mark them as SwapBacked: they
+	 * need to go on the active_anon lru below, and mem_cgroup_cache_charge
+	 * (called in add_to_page_cache) needs to know where they're going too.
+	 */
+	if (mapping_cap_swap_backed(mapping))
+		SetPageSwapBacked(page);
+
+	ret = add_to_page_cache(page, mapping, offset, gfp_mask);
+	if (ret == 0) {
+		if (page_is_file_cache(page))
+			lru_cache_add_file(page);
+		else
+			lru_cache_add_active_anon(page);
+	}
 	return ret;
 }
 
@@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit);
  * mechananism between PageLocked pages and PageWriteback pages is shared.
  * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep.
  *
- * The first mb is necessary to safely close the critical section opened by the
- * test_and_set_bit() to lock the page; the second mb is necessary to enforce
- * ordering between the clear_bit and the read of the waitqueue (to avoid SMP
- * races with a parallel wait_on_page_locked()).
+ * The mb is necessary to enforce ordering between the clear_bit and the read
+ * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
  */
 void unlock_page(struct page *page)
 {
-	smp_mb__before_clear_bit();
-	if (!test_and_clear_bit(PG_locked, &page->flags))
-		BUG();
+	VM_BUG_ON(!PageLocked(page));
+	clear_bit_unlock(PG_locked, &page->flags);
 	smp_mb__after_clear_bit();
 	wake_up_page(page, PG_locked);
 }
 EXPORT_SYMBOL(unlock_page);
@@ -1100,8 +1113,9 @@ page_ok:
 
 page_not_up_to_date:
 	/* Get exclusive access to the page ... */
-	if (lock_page_killable(page))
-		goto readpage_eio;
+	error = lock_page_killable(page);
+	if (unlikely(error))
+		goto readpage_error;
 
 page_not_up_to_date_locked:
 	/* Did it get truncated before we got the lock? */
@@ -1130,8 +1144,9 @@ readpage:
 	}
 
 	if (!PageUptodate(page)) {
-		if (lock_page_killable(page))
-			goto readpage_eio;
+		error = lock_page_killable(page);
+		if (unlikely(error))
+			goto readpage_error;
 		if (!PageUptodate(page)) {
 			if (page->mapping == NULL) {
 				/*
@@ -1143,15 +1158,14 @@ readpage:
 			}
 			unlock_page(page);
 			shrink_readahead_size_eio(filp, ra);
-			goto readpage_eio;
+			error = -EIO;
+			goto readpage_error;
 		}
 		unlock_page(page);
 	}
 
 	goto page_ok;
 
-readpage_eio:
-	error = -EIO;
 readpage_error:
 	/* UHHUH! A synchronous read error occurred. Report it */
 	desc->error = error;
@@ -1186,8 +1200,7 @@ out:
 	ra->prev_pos |= prev_offset;
 
 	*ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset;
-	if (filp)
-		file_accessed(filp);
+	file_accessed(filp);
 }
 
 int file_read_actor(read_descriptor_t *desc, struct page *page,
@@ -2016,48 +2029,8 @@ int pagecache_write_begin(struct file *file, struct address_space *mapping,
 {
 	const struct address_space_operations *aops = mapping->a_ops;
 
-	if (aops->write_begin) {
-		return aops->write_begin(file, mapping, pos, len, flags,
+	return aops->write_begin(file, mapping, pos, len, flags,
 							pagep, fsdata);
-	} else {
-		int ret;
-		pgoff_t index = pos >> PAGE_CACHE_SHIFT;
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-		struct page *page;
-again:
-		page = __grab_cache_page(mapping, index);
-		*pagep = page;
-		if (!page)
-			return -ENOMEM;
-
-		if (flags & AOP_FLAG_UNINTERRUPTIBLE && !PageUptodate(page)) {
-			/*
-			 * There is no way to resolve a short write situation
-			 * for a !Uptodate page (except by double copying in
-			 * the caller done by generic_perform_write_2copy).
-			 *
-			 * Instead, we have to bring it uptodate here.
-			 */
-			ret = aops->readpage(file, page);
-			page_cache_release(page);
-			if (ret) {
-				if (ret == AOP_TRUNCATED_PAGE)
-					goto again;
-				return ret;
-			}
-			goto again;
-		}
-
-		ret = aops->prepare_write(file, page, offset, offset+len);
-		if (ret) {
-			unlock_page(page);
-			page_cache_release(page);
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		}
-		return ret;
-	}
 }
 EXPORT_SYMBOL(pagecache_write_begin);
 
@@ -2066,32 +2039,9 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 		struct page *page, void *fsdata)
 {
 	const struct address_space_operations *aops = mapping->a_ops;
-	int ret;
-
-	if (aops->write_end) {
-		mark_page_accessed(page);
-		ret = aops->write_end(file, mapping, pos, len, copied,
-							page, fsdata);
-	} else {
-		unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
-		struct inode *inode = mapping->host;
-
-		flush_dcache_page(page);
-		ret = aops->commit_write(file, page, offset, offset+len);
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
 
-		if (ret < 0) {
-			if (pos + len > inode->i_size)
-				vmtruncate(inode, inode->i_size);
-		} else if (ret > 0)
-			ret = min_t(size_t, copied, ret);
-		else
-			ret = copied;
-	}
-
-	return ret;
+	mark_page_accessed(page);
+	return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
 }
 EXPORT_SYMBOL(pagecache_write_end);
 
@@ -2213,174 +2163,6 @@ repeat:
 }
 EXPORT_SYMBOL(__grab_cache_page);
 
-static ssize_t generic_perform_write_2copy(struct file *file,
-				struct iov_iter *i, loff_t pos)
-{
-	struct address_space *mapping = file->f_mapping;
-	const struct address_space_operations *a_ops = mapping->a_ops;
-	struct inode *inode = mapping->host;
-	long status = 0;
-	ssize_t written = 0;
-
-	do {
-		struct page *src_page;
-		struct page *page;
-		pgoff_t index;		/* Pagecache index for current page */
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
-		size_t copied;		/* Bytes copied from user */
-
-		offset = (pos & (PAGE_CACHE_SIZE - 1));
-		index = pos >> PAGE_CACHE_SHIFT;
-		bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset,
-						iov_iter_count(i));
-
-		/*
-		 * a non-NULL src_page indicates that we're doing the
-		 * copy via get_user_pages and kmap.
-		 */
-		src_page = NULL;
-
-		/*
-		 * Bring in the user page that we will copy from _first_.
-		 * Otherwise there's a nasty deadlock on copying from the
-		 * same page as we're writing to, without it being marked
-		 * up-to-date.
-		 *
-		 * Not only is this an optimisation, but it is also required
-		 * to check that the address is actually valid, when atomic
-		 * usercopies are used, below.
-		 */
-		if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
-			status = -EFAULT;
-			break;
-		}
-
-		page = __grab_cache_page(mapping, index);
-		if (!page) {
-			status = -ENOMEM;
-			break;
-		}
-
-		/*
-		 * non-uptodate pages cannot cope with short copies, and we
-		 * cannot take a pagefault with the destination page locked.
-		 * So pin the source page to copy it.
-		 */
-		if (!PageUptodate(page) && !segment_eq(get_fs(), KERNEL_DS)) {
-			unlock_page(page);
-
-			src_page = alloc_page(GFP_KERNEL);
-			if (!src_page) {
-				page_cache_release(page);
-				status = -ENOMEM;
-				break;
-			}
-
-			/*
-			 * Cannot get_user_pages with a page locked for the
-			 * same reason as we can't take a page fault with a
-			 * page locked (as explained below).
-			 */
-			copied = iov_iter_copy_from_user(src_page, i,
-						offset, bytes);
-			if (unlikely(copied == 0)) {
-				status = -EFAULT;
-				page_cache_release(page);
-				page_cache_release(src_page);
-				break;
-			}
-			bytes = copied;
-
-			lock_page(page);
-			/*
-			 * Can't handle the page going uptodate here, because
-			 * that means we would use non-atomic usercopies, which
-			 * zero out the tail of the page, which can cause
-			 * zeroes to become transiently visible. We could just
-			 * use a non-zeroing copy, but the APIs aren't too
-			 * consistent.
-			 */
-			if (unlikely(!page->mapping || PageUptodate(page))) {
-				unlock_page(page);
-				page_cache_release(page);
-				page_cache_release(src_page);
-				continue;
-			}
-		}
-
-		status = a_ops->prepare_write(file, page, offset, offset+bytes);
-		if (unlikely(status))
-			goto fs_write_aop_error;
-
-		if (!src_page) {
-			/*
-			 * Must not enter the pagefault handler here, because
-			 * we hold the page lock, so we might recursively
-			 * deadlock on the same lock, or get an ABBA deadlock
-			 * against a different lock, or against the mmap_sem
-			 * (which nests outside the page lock).  So increment
-			 * preempt count, and use _atomic usercopies.
-			 *
-			 * The page is uptodate so we are OK to encounter a
-			 * short copy: if unmodified parts of the page are
-			 * marked dirty and written out to disk, it doesn't
-			 * really matter.
-			 */
-			pagefault_disable();
-			copied = iov_iter_copy_from_user_atomic(page, i,
-						offset, bytes);
-			pagefault_enable();
-		} else {
-			void *src, *dst;
-			src = kmap_atomic(src_page, KM_USER0);
-			dst = kmap_atomic(page, KM_USER1);
-			memcpy(dst + offset, src + offset, bytes);
-			kunmap_atomic(dst, KM_USER1);
-			kunmap_atomic(src, KM_USER0);
-			copied = bytes;
-		}
-		flush_dcache_page(page);
-
-		status = a_ops->commit_write(file, page, offset, offset+bytes);
-		if (unlikely(status < 0))
-			goto fs_write_aop_error;
-		if (unlikely(status > 0)) /* filesystem did partial write */
-			copied = min_t(size_t, copied, status);
-
-		unlock_page(page);
-		mark_page_accessed(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		iov_iter_advance(i, copied);
-		pos += copied;
-		written += copied;
-
-		balance_dirty_pages_ratelimited(mapping);
-		cond_resched();
-		continue;
-
-fs_write_aop_error:
-		unlock_page(page);
-		page_cache_release(page);
-		if (src_page)
-			page_cache_release(src_page);
-
-		/*
-		 * prepare_write() may have instantiated a few blocks
-		 * outside i_size.  Trim these off again. Don't need
-		 * i_size_read because we hold i_mutex.
-		 */
-		if (pos + bytes > inode->i_size)
-			vmtruncate(inode, inode->i_size);
-		break;
-	} while (iov_iter_count(i));
-
-	return written ? written : status;
-}
-
 static ssize_t generic_perform_write(struct file *file,
 				struct iov_iter *i, loff_t pos)
 {
@@ -2481,10 +2263,7 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 
 	iov_iter_init(&i, iov, nr_segs, count, written);
-	if (a_ops->write_begin)
-		status = generic_perform_write(file, &i, pos);
-	else
-		status = generic_perform_write_2copy(file, &i, pos);
+	status = generic_perform_write(file, &i, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
diff --git a/mm/fremap.c b/mm/fremap.c
index 7881638e4a12..7d12ca70ef7b 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -21,6 +21,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include "internal.h"
+
 static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long addr, pte_t *ptep)
 {
@@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
 		spin_unlock(&mapping->i_mmap_lock);
 	}
 
+	if (vma->vm_flags & VM_LOCKED) {
+		/*
+		 * drop PG_Mlocked flag for over-mapped range
+		 */
+		unsigned int saved_flags = vma->vm_flags;
+		munlock_vma_pages_range(vma, start, start + size);
+		vma->vm_flags = saved_flags;
+	}
+
 	mmu_notifier_invalidate_range_start(mm, start, start + size);
 	err = populate_range(mm, vma, start, size, pgoff);
 	mmu_notifier_invalidate_range_end(mm, start, start + size);
 	if (!err && !(flags & MAP_NONBLOCK)) {
-		if (unlikely(has_write_lock)) {
-			downgrade_write(&mm->mmap_sem);
-			has_write_lock = 0;
+		if (vma->vm_flags & VM_LOCKED) {
+			/*
+			 * might be mapping previously unmapped range of file
+			 */
+			mlock_vma_pages_range(vma, start, start + size);
+		} else {
+			if (unlikely(has_write_lock)) {
+				downgrade_write(&mm->mmap_sem);
+				has_write_lock = 0;
+			}
+			make_pages_present(start, start+size);
 		}
-		make_pages_present(start, start+size);
 	}
 
 	/*
@@ -240,4 +258,3 @@ out:
 
 	return err;
 }
-
diff --git a/mm/highmem.c b/mm/highmem.c
index e16e1523b688..b36b83b920ff 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -70,6 +70,7 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait);
 static void flush_all_zero_pkmaps(void)
 {
 	int i;
+	int need_flush = 0;
 
 	flush_cache_kmaps();
 
@@ -101,8 +102,10 @@ static void flush_all_zero_pkmaps(void)
 			  &pkmap_page_table[i]);
 
 		set_page_address(page, NULL);
+		need_flush = 1;
 	}
-	flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
+	if (need_flush)
+		flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
 }
 
 /**
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 67a71191136e..6058b53dcb89 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/mm.h>
+#include <linux/seq_file.h>
 #include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
@@ -262,7 +263,7 @@ struct resv_map {
 	struct list_head regions;
 };
 
-struct resv_map *resv_map_alloc(void)
+static struct resv_map *resv_map_alloc(void)
 {
 	struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
 	if (!resv_map)
@@ -274,7 +275,7 @@ struct resv_map *resv_map_alloc(void)
 	return resv_map;
 }
 
-void resv_map_release(struct kref *ref)
+static void resv_map_release(struct kref *ref)
 {
 	struct resv_map *resv_map = container_of(ref, struct resv_map, refs);
 
@@ -289,7 +290,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	if (!(vma->vm_flags & VM_SHARED))
 		return (struct resv_map *)(get_vma_private_data(vma) &
 							~HPAGE_RESV_MASK);
-	return 0;
+	return NULL;
 }
 
 static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
@@ -353,11 +354,26 @@ static int vma_has_reserves(struct vm_area_struct *vma)
 	return 0;
 }
 
+static void clear_gigantic_page(struct page *page,
+			unsigned long addr, unsigned long sz)
+{
+	int i;
+	struct page *p = page;
+
+	might_sleep();
+	for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) {
+		cond_resched();
+		clear_user_highpage(p, addr + i * PAGE_SIZE);
+	}
+}
 static void clear_huge_page(struct page *page,
 			unsigned long addr, unsigned long sz)
 {
 	int i;
 
+	if (unlikely(sz > MAX_ORDER_NR_PAGES))
+		return clear_gigantic_page(page, addr, sz);
+
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
 		cond_resched();
@@ -365,12 +381,32 @@ static void clear_huge_page(struct page *page,
 	}
 }
 
+static void copy_gigantic_page(struct page *dst, struct page *src,
+			unsigned long addr, struct vm_area_struct *vma)
+{
+	int i;
+	struct hstate *h = hstate_vma(vma);
+	struct page *dst_base = dst;
+	struct page *src_base = src;
+	might_sleep();
+	for (i = 0; i < pages_per_huge_page(h); ) {
+		cond_resched();
+		copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma);
+
+		i++;
+		dst = mem_map_next(dst, dst_base, i);
+		src = mem_map_next(src, src_base, i);
+	}
+}
 static void copy_huge_page(struct page *dst, struct page *src,
 			unsigned long addr, struct vm_area_struct *vma)
 {
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
+		return copy_gigantic_page(dst, src, addr, vma);
+
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		cond_resched();
@@ -455,6 +491,8 @@ static void update_and_free_page(struct hstate *h, struct page *page)
 {
 	int i;
 
+	VM_BUG_ON(h->order >= MAX_ORDER);
+
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[page_to_nid(page)]--;
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -969,6 +1007,14 @@ found:
 	return 1;
 }
 
+static void prep_compound_huge_page(struct page *page, int order)
+{
+	if (unlikely(order > (MAX_ORDER - 1)))
+		prep_compound_gigantic_page(page, order);
+	else
+		prep_compound_page(page, order);
+}
+
 /* Put bootmem huge pages into the standard lists after mem_map is up */
 static void __init gather_bootmem_prealloc(void)
 {
@@ -979,7 +1025,7 @@ static void __init gather_bootmem_prealloc(void)
 		struct hstate *h = m->hstate;
 		__ClearPageReserved(page);
 		WARN_ON(page_count(page) != 1);
-		prep_compound_page(page, h->order);
+		prep_compound_huge_page(page, h->order);
 		prep_new_huge_page(h, page, page_to_nid(page));
 	}
 }
@@ -1455,15 +1501,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
 
 #endif /* CONFIG_SYSCTL */
 
-int hugetlb_report_meminfo(char *buf)
+void hugetlb_report_meminfo(struct seq_file *m)
 {
 	struct hstate *h = &default_hstate;
-	return sprintf(buf,
+	seq_printf(m,
 			"HugePages_Total: %5lu\n"
 			"HugePages_Free: %5lu\n"
 			"HugePages_Rsvd: %5lu\n"
 			"HugePages_Surp: %5lu\n"
-			"Hugepagesize: %5lu kB\n",
+			"Hugepagesize: %8lu kB\n",
 			h->nr_huge_pages,
 			h->free_huge_pages,
 			h->resv_huge_pages,
@@ -1747,11 +1793,10 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
  * from other VMAs and let the children be SIGKILLed if they are faulting the
  * same region.
  */
-int unmap_ref_private(struct mm_struct *mm,
-			struct vm_area_struct *vma,
-			struct page *page,
-			unsigned long address)
+static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
+			struct page *page, unsigned long address)
 {
+	struct hstate *h = hstate_vma(vma);
 	struct vm_area_struct *iter_vma;
 	struct address_space *mapping;
 	struct prio_tree_iter iter;
@@ -1761,7 +1806,7 @@ int unmap_ref_private(struct mm_struct *mm,
 	 * vm_pgoff is in PAGE_SIZE units, hence the different calculation
 	 * from page cache lookup which is in HPAGE_SIZE units.
 	 */
-	address = address & huge_page_mask(hstate_vma(vma));
+	address = address & huge_page_mask(h);
 	pgoff = ((address - vma->vm_start) >> PAGE_SHIFT)
 		+ (vma->vm_pgoff >> PAGE_SHIFT);
 	mapping = (struct address_space *)page_private(page);
@@ -1780,7 +1825,7 @@ int unmap_ref_private(struct mm_struct *mm,
 		 */
 		if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER))
 			unmap_hugepage_range(iter_vma,
-				address, address + HPAGE_SIZE,
+				address, address + huge_page_size(h),
 				page);
 	}
 
@@ -2008,7 +2053,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	entry = huge_ptep_get(ptep);
 	if (huge_pte_none(entry)) {
 		ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-		goto out_unlock;
+		goto out_mutex;
 	}
 
 	ret = 0;
@@ -2024,7 +2069,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access && !pte_write(entry)) {
 		if (vma_needs_reservation(h, vma, address) < 0) {
 			ret = VM_FAULT_OOM;
-			goto out_unlock;
+			goto out_mutex;
 		}
 
 		if (!(vma->vm_flags & VM_SHARED))
@@ -2034,10 +2079,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	spin_lock(&mm->page_table_lock);
 	/* Check for a racing update before calling hugetlb_cow */
-	if (likely(pte_same(entry, huge_ptep_get(ptep))))
-		if (write_access && !pte_write(entry))
+	if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+		goto out_page_table_lock;
+
+
+	if (write_access) {
+		if (!pte_write(entry)) {
 			ret = hugetlb_cow(mm, vma, address, ptep, entry,
 							pagecache_page);
+			goto out_page_table_lock;
+		}
+		entry = pte_mkdirty(entry);
+	}
+	entry = pte_mkyoung(entry);
+	if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access))
+		update_mmu_cache(vma, address, entry);
+
+out_page_table_lock:
 	spin_unlock(&mm->page_table_lock);
 
 	if (pagecache_page) {
@@ -2045,7 +2103,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		put_page(pagecache_page);
 	}
 
-out_unlock:
+out_mutex:
 	mutex_unlock(&hugetlb_instantiation_mutex);
 
 	return ret;
@@ -2060,6 +2118,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
 	return NULL;
 }
 
+static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
+{
+	if (!ptep || write || shared)
+		return 0;
+	else
+		return huge_pte_none(huge_ptep_get(ptep));
+}
+
 int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 			struct page **pages, struct vm_area_struct **vmas,
 			unsigned long *position, int *length, int i,
@@ -2069,6 +2135,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	unsigned long vaddr = *position;
 	int remainder = *length;
 	struct hstate *h = hstate_vma(vma);
+	int zeropage_ok = 0;
+	int shared = vma->vm_flags & VM_SHARED;
 
 	spin_lock(&mm->page_table_lock);
 	while (vaddr < vma->vm_end && remainder) {
@@ -2081,8 +2149,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		 * first, for the page indexing below to work.
 		 */
 		pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
+		if (huge_zeropage_ok(pte, write, shared))
+			zeropage_ok = 1;
 
-		if (!pte || huge_pte_none(huge_ptep_get(pte)) ||
+		if (!pte ||
+		    (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
 		    (write && !pte_write(huge_ptep_get(pte)))) {
 			int ret;
 
@@ -2102,8 +2173,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page = pte_page(huge_ptep_get(pte));
 same_page:
 		if (pages) {
-			get_page(page);
-			pages[i] = page + pfn_offset;
+			if (zeropage_ok)
+				pages[i] = ZERO_PAGE(0);
+			else
+				pages[i] = mem_map_offset(page, pfn_offset);
+			get_page(pages[i]);
 		}
 
 		if (vmas)
diff --git a/mm/internal.h b/mm/internal.h
index 1f43f7416972..13333bc2eb68 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -17,6 +17,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
 		unsigned long floor, unsigned long ceiling);
 
 extern void prep_compound_page(struct page *page, unsigned long order);
+extern void prep_compound_gigantic_page(struct page *page, unsigned long order);
 
 static inline void set_page_count(struct page *page, int v)
 {
@@ -39,6 +40,15 @@ static inline void __put_page(struct page *page)
 	atomic_dec(&page->_count);
 }
 
+/*
+ * in mm/vmscan.c:
+ */
+extern int isolate_lru_page(struct page *page);
+extern void putback_lru_page(struct page *page);
+
+/*
+ * in mm/page_alloc.c
+ */
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -52,6 +62,148 @@ static inline unsigned long page_order(struct page *page)
 	return page_private(page);
 }
 
+extern long mlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+extern void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end);
+static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
+{
+	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
+}
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * unevictable_migrate_page() called only from migrate_page_copy() to
+ * migrate unevictable flag to new page.
+ * Note that the old page has been isolated from the LRU lists at this
+ * point so we don't need to worry about LRU statistics.
+ */
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+	if (TestClearPageUnevictable(old))
+		SetPageUnevictable(new);
+}
+#else
+static inline void unevictable_migrate_page(struct page *new, struct page *old)
+{
+}
+#endif
+
+#ifdef CONFIG_UNEVICTABLE_LRU
+/*
+ * Called only in fault path via page_evictable() for a new page
+ * to determine if it's being mapped into a LOCKED vma.
+ * If so, mark page as mlocked.
+ */
+static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page)
+{
+	VM_BUG_ON(PageLRU(page));
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
+		return 0;
+
+	if (!TestSetPageMlocked(page)) {
+		inc_zone_page_state(page, NR_MLOCK);
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	return 1;
+}
+
+/*
+ * must be called with vma's mmap_sem held for read, and page locked.
+ */
+extern void mlock_vma_page(struct page *page);
+
+/*
+ * Clear the page's PageMlocked().  This can be useful in a situation where
+ * we want to unconditionally remove a page from the pagecache -- e.g.,
+ * on truncation or freeing.
+ *
+ * It is legal to call this function for any page, mlocked or not.
+ * If called for a page that is still mapped by mlocked vmas, all we do
+ * is revert to lazy LRU behaviour -- semantics are not broken.
+ */
+extern void __clear_page_mlock(struct page *page);
+static inline void clear_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page)))
+		__clear_page_mlock(page);
+}
+
+/*
+ * mlock_migrate_page - called only from migrate_page_copy() to
+ * migrate the Mlocked page flag; update statistics.
+ */
+static inline void mlock_migrate_page(struct page *newpage, struct page *page)
+{
+	if (TestClearPageMlocked(page)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		SetPageMlocked(newpage);
+		__inc_zone_page_state(newpage, NR_MLOCK);
+		local_irq_restore(flags);
+	}
+}
+
+/*
+ * free_page_mlock() -- clean up attempts to free and mlocked() page.
+ * Page should not be on lru, so no need to fix that up.
+ * free_pages_check() will verify...
+ */
+static inline void free_page_mlock(struct page *page)
+{
+	if (unlikely(TestClearPageMlocked(page))) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		__dec_zone_page_state(page, NR_MLOCK);
+		__count_vm_event(UNEVICTABLE_MLOCKFREED);
+		local_irq_restore(flags);
+	}
+}
+
+#else /* CONFIG_UNEVICTABLE_LRU */
+static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
+{
+	return 0;
+}
+static inline void clear_page_mlock(struct page *page) { }
+static inline void mlock_vma_page(struct page *page) { }
+static inline void mlock_migrate_page(struct page *new, struct page *old) { }
+static inline void free_page_mlock(struct page *page) { }
+
+#endif /* CONFIG_UNEVICTABLE_LRU */
+
+/*
+ * Return the mem_map entry representing the 'offset' subpage within
+ * the maximally aligned gigantic page 'base'.  Handle any discontiguity
+ * in the mem_map at MAX_ORDER_NR_PAGES boundaries.
+ */
+static inline struct page *mem_map_offset(struct page *base, int offset)
+{
+	if (unlikely(offset >= MAX_ORDER_NR_PAGES))
+		return pfn_to_page(page_to_pfn(base) + offset);
+	return base + offset;
+}
+
+/*
+ * Iterator over all subpages withing the maximally aligned gigantic
+ * page 'base'.  Handle any discontiguity in the mem_map.
+ */
+static inline struct page *mem_map_next(struct page *iter,
+						struct page *base, int offset)
+{
+	if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) {
+		unsigned long pfn = page_to_pfn(base) + offset;
+		if (!pfn_valid(pfn))
+			return NULL;
+		return pfn_to_page(pfn);
+	}
+	return iter + 1;
+}
+
 /*
  * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node,
  * so all functions starting at paging_init should be marked __init
@@ -120,4 +272,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 }
 #endif /* CONFIG_SPARSEMEM */
 
+#define GUP_FLAGS_WRITE 0x1
+#define GUP_FLAGS_FORCE 0x2
+#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+
+int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+		     unsigned long start, int len, int flags,
+		     struct page **pages, struct vm_area_struct **vmas);
+
 #endif
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0f1f7a7374ba..866dcc7eeb0c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -32,11 +32,12 @@
32#include <linux/fs.h> 32#include <linux/fs.h>
33#include <linux/seq_file.h> 33#include <linux/seq_file.h>
34#include <linux/vmalloc.h> 34#include <linux/vmalloc.h>
35#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h>
35 37
36#include <asm/uaccess.h> 38#include <asm/uaccess.h>
37 39
38struct cgroup_subsys mem_cgroup_subsys __read_mostly; 40struct cgroup_subsys mem_cgroup_subsys __read_mostly;
39static struct kmem_cache *page_cgroup_cache __read_mostly;
40#define MEM_CGROUP_RECLAIM_RETRIES 5 41#define MEM_CGROUP_RECLAIM_RETRIES 5
41 42
42/* 43/*
@@ -65,11 +66,10 @@ struct mem_cgroup_stat {
65/* 66/*
66 * For accounting under irq disable, no need for increment preempt count. 67 * For accounting under irq disable, no need for increment preempt count.
67 */ 68 */
68static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, 69static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
69 enum mem_cgroup_stat_index idx, int val) 70 enum mem_cgroup_stat_index idx, int val)
70{ 71{
71 int cpu = smp_processor_id(); 72 stat->count[idx] += val;
72 stat->cpustat[cpu].count[idx] += val;
73} 73}
74 74
75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, 75static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
@@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
85/* 85/*
86 * per-zone information in memory controller. 86 * per-zone information in memory controller.
87 */ 87 */
88
89enum mem_cgroup_zstat_index {
90 MEM_CGROUP_ZSTAT_ACTIVE,
91 MEM_CGROUP_ZSTAT_INACTIVE,
92
93 NR_MEM_CGROUP_ZSTAT,
94};
95
96struct mem_cgroup_per_zone { 88struct mem_cgroup_per_zone {
97 /* 89 /*
98 * spin_lock to protect the per cgroup LRU 90 * spin_lock to protect the per cgroup LRU
99 */ 91 */
100 spinlock_t lru_lock; 92 spinlock_t lru_lock;
101 struct list_head active_list; 93 struct list_head lists[NR_LRU_LISTS];
102 struct list_head inactive_list; 94 unsigned long count[NR_LRU_LISTS];
103 unsigned long count[NR_MEM_CGROUP_ZSTAT];
104}; 95};
105/* Macro for accessing counter */ 96/* Macro for accessing counter */
106#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -144,69 +135,52 @@ struct mem_cgroup {
144}; 135};
145static struct mem_cgroup init_mem_cgroup; 136static struct mem_cgroup init_mem_cgroup;
146 137
147/*
148 * We use the lower bit of the page->page_cgroup pointer as a bit spin
149 * lock. We need to ensure that page->page_cgroup is at least two
150 * byte aligned (based on comments from Nick Piggin). But since
151 * bit_spin_lock doesn't actually set that lock bit in a non-debug
152 * uniprocessor kernel, we should avoid setting it here too.
153 */
154#define PAGE_CGROUP_LOCK_BIT 0x0
155#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
156#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
157#else
158#define PAGE_CGROUP_LOCK 0x0
159#endif
160
161/*
162 * A page_cgroup page is associated with every page descriptor. The
163 * page_cgroup helps us identify information about the cgroup
164 */
165struct page_cgroup {
166 struct list_head lru; /* per cgroup LRU list */
167 struct page *page;
168 struct mem_cgroup *mem_cgroup;
169 int flags;
170};
171#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
172#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
173
174static int page_cgroup_nid(struct page_cgroup *pc)
175{
176 return page_to_nid(pc->page);
177}
178
179static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
180{
181 return page_zonenum(pc->page);
182}
183
184enum charge_type { 138enum charge_type {
185 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
186 MEM_CGROUP_CHARGE_TYPE_MAPPED, 140 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
187 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
143 NR_CHARGE_TYPE,
144};
145
146/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */
188}; 158};
189 159
190/* 160/*
191 * Always modified under lru lock. Then, not necessary to preempt_disable() 161 * Always modified under lru lock. Then, not necessary to preempt_disable()
192 */ 162 */
193static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, 163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
194 bool charge) 164 struct page_cgroup *pc,
165 bool charge)
195{ 166{
196 int val = (charge)? 1 : -1; 167 int val = (charge)? 1 : -1;
197 struct mem_cgroup_stat *stat = &mem->stat; 168 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat;
198 170
199 VM_BUG_ON(!irqs_disabled()); 171 VM_BUG_ON(!irqs_disabled());
200 if (flags & PAGE_CGROUP_FLAG_CACHE) 172
201 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); 173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
202 else 176 else
203 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 177 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
204 178
205 if (charge) 179 if (charge)
206 __mem_cgroup_stat_add_safe(stat, 180 __mem_cgroup_stat_add_safe(cpustat,
207 MEM_CGROUP_STAT_PGPGIN_COUNT, 1); 181 MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
208 else 182 else
209 __mem_cgroup_stat_add_safe(stat, 183 __mem_cgroup_stat_add_safe(cpustat,
210 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
211} 185}
212 186
@@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
227} 201}
228 202
229static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 203static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
230 enum mem_cgroup_zstat_index idx) 204 enum lru_list idx)
231{ 205{
232 int nid, zid; 206 int nid, zid;
233 struct mem_cgroup_per_zone *mz; 207 struct mem_cgroup_per_zone *mz;
@@ -250,89 +224,89 @@ static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
250 224
251struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) 225struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
252{ 226{
227 /*
228 * mm_update_next_owner() may clear mm->owner to NULL
229 * if it races with swapoff, page migration, etc.
230 * So this can be called with p == NULL.
231 */
232 if (unlikely(!p))
233 return NULL;
234
253 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 235 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
254 struct mem_cgroup, css); 236 struct mem_cgroup, css);
255} 237}
256 238
257static inline int page_cgroup_locked(struct page *page)
258{
259 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
260}
261
262static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
263{
264 VM_BUG_ON(!page_cgroup_locked(page));
265 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
266}
267
268struct page_cgroup *page_get_page_cgroup(struct page *page)
269{
270 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
271}
272
273static void lock_page_cgroup(struct page *page)
274{
275 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
276}
277
278static int try_lock_page_cgroup(struct page *page)
279{
280 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
281}
282
283static void unlock_page_cgroup(struct page *page)
284{
285 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
286}
287
288static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz,
289 struct page_cgroup *pc) 240 struct page_cgroup *pc)
290{ 241{
291 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 242 int lru = LRU_BASE;
243
244 if (PageCgroupUnevictable(pc))
245 lru = LRU_UNEVICTABLE;
246 else {
247 if (PageCgroupActive(pc))
248 lru += LRU_ACTIVE;
249 if (PageCgroupFile(pc))
250 lru += LRU_FILE;
251 }
292 252
293 if (from) 253 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
294 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1;
295 else
296 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
297 254
298 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); 255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false);
299 list_del(&pc->lru); 256 list_del(&pc->lru);
300} 257}
301 258
302static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz,
303 struct page_cgroup *pc) 260 struct page_cgroup *pc)
304{ 261{
305 int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; 262 int lru = LRU_BASE;
306 263
307 if (!to) { 264 if (PageCgroupUnevictable(pc))
308 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 265 lru = LRU_UNEVICTABLE;
309 list_add(&pc->lru, &mz->inactive_list); 266 else {
310 } else { 267 if (PageCgroupActive(pc))
311 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 268 lru += LRU_ACTIVE;
312 list_add(&pc->lru, &mz->active_list); 269 if (PageCgroupFile(pc))
270 lru += LRU_FILE;
313 } 271 }
314 mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); 272
273 MEM_CGROUP_ZSTAT(mz, lru) += 1;
274 list_add(&pc->lru, &mz->lists[lru]);
275
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true);
315} 277}
316 278
317static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru)
318{ 280{
319 int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE;
320 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc);
282 int active = PageCgroupActive(pc);
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
321 287
322 if (from) 288 if (lru == from)
323 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; 289 return;
324 else
325 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1;
326 290
327 if (active) { 291 MEM_CGROUP_ZSTAT(mz, from) -= 1;
328 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; 292 /*
329 pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; 293 * However this is done under mz->lru_lock, another flags, which
330 list_move(&pc->lru, &mz->active_list); 294 * are not related to LRU, will be modified from out-of-lock.
295 * We have to use atomic set/clear flags.
296 */
297 if (is_unevictable_lru(lru)) {
298 ClearPageCgroupActive(pc);
299 SetPageCgroupUnevictable(pc);
331 } else { 300 } else {
332 MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; 301 if (is_active_lru(lru))
333 pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; 302 SetPageCgroupActive(pc);
334 list_move(&pc->lru, &mz->inactive_list); 303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
335 } 306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]);
336} 310}
337 311
338int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -348,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
348/* 322/*
349 * This routine assumes that the appropriate zone's lru lock is already held 323 * This routine assumes that the appropriate zone's lru lock is already held
350 */ 324 */
351void mem_cgroup_move_lists(struct page *page, bool active) 325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
352{ 326{
353 struct page_cgroup *pc; 327 struct page_cgroup *pc;
354 struct mem_cgroup_per_zone *mz; 328 struct mem_cgroup_per_zone *mz;
@@ -364,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active)
364 * safely get to page_cgroup without it, so just try_lock it: 338 * safely get to page_cgroup without it, so just try_lock it:
365 * mem_cgroup_isolate_pages allows for page left on wrong list. 339 * mem_cgroup_isolate_pages allows for page left on wrong list.
366 */ 340 */
367 if (!try_lock_page_cgroup(page)) 341 pc = lookup_page_cgroup(page);
342 if (!trylock_page_cgroup(pc))
368 return; 343 return;
369 344 if (pc && PageCgroupUsed(pc)) {
370 pc = page_get_page_cgroup(page);
371 if (pc) {
372 mz = page_cgroup_zoneinfo(pc); 345 mz = page_cgroup_zoneinfo(pc);
373 spin_lock_irqsave(&mz->lru_lock, flags); 346 spin_lock_irqsave(&mz->lru_lock, flags);
374 __mem_cgroup_move_lists(pc, active); 347 __mem_cgroup_move_lists(pc, lru);
375 spin_unlock_irqrestore(&mz->lru_lock, flags); 348 spin_unlock_irqrestore(&mz->lru_lock, flags);
376 } 349 }
377 unlock_page_cgroup(page); 350 unlock_page_cgroup(pc);
378} 351}
379 352
380/* 353/*
@@ -395,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
395} 368}
396 369
397/* 370/*
398 * This function is called from vmscan.c. In page reclaiming loop. balance
399 * between active and inactive list is calculated. For memory controller
400 * page reclaiming, we should use using mem_cgroup's imbalance rather than
401 * zone's global lru imbalance.
402 */
403long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem)
404{
405 unsigned long active, inactive;
406 /* active and inactive are the number of pages. 'long' is ok.*/
407 active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE);
408 inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE);
409 return (long) (active / (inactive + 1));
410}
411
412/*
413 * prev_priority control...this will be used in memory reclaim path. 371 * prev_priority control...this will be used in memory reclaim path.
414 */ 372 */
415int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -436,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
436 * (see include/linux/mmzone.h) 394 * (see include/linux/mmzone.h)
437 */ 395 */
438 396
439long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, 397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone,
440 struct zone *zone, int priority) 398 int priority, enum lru_list lru)
441{ 399{
442 long nr_active; 400 long nr_pages;
443 int nid = zone->zone_pgdat->node_id; 401 int nid = zone->zone_pgdat->node_id;
444 int zid = zone_idx(zone); 402 int zid = zone_idx(zone);
445 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
446 404
447 nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); 405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru);
448 return (nr_active >> priority);
449}
450
451long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
452 struct zone *zone, int priority)
453{
454 long nr_inactive;
455 int nid = zone->zone_pgdat->node_id;
456 int zid = zone_idx(zone);
457 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
458 406
459 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); 407 return (nr_pages >> priority);
460 return (nr_inactive >> priority);
461} 408}
462 409
463unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
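The consolidated mem_cgroup_calc_reclaim() above simply returns MEM_CGROUP_ZSTAT(mz, lru) >> priority for whichever list it is asked about. A quick standalone look at that scaling, assuming the usual reclaim priority range of DEF_PRIORITY (12) down to 0 (the 12 is an assumption about the reclaim code, and 16384 pages is an arbitrary example size):

#include <stdio.h>

/* Scan target produced by a ">> priority" scaling, as mem_cgroup_calc_reclaim()
 * now does for every LRU list.  16384 pages and a priority range of 12..0 are
 * example values only. */
int main(void)
{
	long nr_pages = 16384;
	int priority;

	for (priority = 12; priority >= 0; priority--)
		printf("priority %2d -> scan target %ld\n",
		       priority, nr_pages >> priority);
	return 0;
}

The target starts tiny (4 pages at priority 12) and doubles each time the priority drops, reaching the whole list at priority 0.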
@@ -465,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
465 unsigned long *scanned, int order, 412 unsigned long *scanned, int order,
466 int mode, struct zone *z, 413 int mode, struct zone *z,
467 struct mem_cgroup *mem_cont, 414 struct mem_cgroup *mem_cont,
468 int active) 415 int active, int file)
469{ 416{
470 unsigned long nr_taken = 0; 417 unsigned long nr_taken = 0;
471 struct page *page; 418 struct page *page;
@@ -476,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
476 int nid = z->zone_pgdat->node_id; 423 int nid = z->zone_pgdat->node_id;
477 int zid = zone_idx(z); 424 int zid = zone_idx(z);
478 struct mem_cgroup_per_zone *mz; 425 struct mem_cgroup_per_zone *mz;
426 int lru = LRU_FILE * !!file + !!active;
479 427
480 BUG_ON(!mem_cont); 428 BUG_ON(!mem_cont);
481 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
482 if (active) 430 src = &mz->lists[lru];
483 src = &mz->active_list;
484 else
485 src = &mz->inactive_list;
486
487 431
488 spin_lock(&mz->lru_lock); 432 spin_lock(&mz->lru_lock);
489 scan = 0; 433 scan = 0;
490 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
491 if (scan >= nr_to_scan) 435 if (scan >= nr_to_scan)
492 break; 436 break;
437 if (unlikely(!PageCgroupUsed(pc)))
438 continue;
493 page = pc->page; 439 page = pc->page;
494 440
495 if (unlikely(!PageLRU(page))) 441 if (unlikely(!PageLRU(page)))
496 continue; 442 continue;
497 443
498 if (PageActive(page) && !active) { 444 /*
499 __mem_cgroup_move_lists(pc, true); 445 * TODO: play better with lumpy reclaim, grabbing anything.
500 continue; 446 */
501 } 447 if (PageUnevictable(page) ||
502 if (!PageActive(page) && active) { 448 (PageActive(page) && !active) ||
503 __mem_cgroup_move_lists(pc, false); 449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
504 continue; 451 continue;
505 } 452 }
506 453
507 scan++; 454 scan++;
508 list_move(&pc->lru, &pc_list); 455 list_move(&pc->lru, &pc_list);
509 456
510 if (__isolate_lru_page(page, mode) == 0) { 457 if (__isolate_lru_page(page, mode, file) == 0) {
511 list_move(&page->lru, dst); 458 list_move(&page->lru, dst);
512 nr_taken++; 459 nr_taken++;
513 } 460 }
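The list selection introduced above folds the old active/inactive branching in mem_cgroup_isolate_pages() into a single index, LRU_FILE * !!file + !!active. A standalone sketch of that arithmetic, assuming the enum lru_list ordering this series relies on (the values below are written out for illustration, not quoted from the kernel headers):

#include <stdio.h>

/* Assumed layout of enum lru_list: the two anon lists, the two file lists,
 * then the unevictable list; LRU_FILE is the offset of the file lists. */
enum lru_list {
	LRU_INACTIVE_ANON,	/* 0 */
	LRU_ACTIVE_ANON,	/* 1 */
	LRU_INACTIVE_FILE,	/* 2 */
	LRU_ACTIVE_FILE,	/* 3 */
	LRU_UNEVICTABLE,	/* 4 */
	NR_LRU_LISTS
};
#define LRU_FILE	LRU_INACTIVE_FILE

int main(void)
{
	int file, active;

	for (file = 0; file <= 1; file++)
		for (active = 0; active <= 1; active++)
			printf("file=%d active=%d -> lru index %d\n",
			       file, active, LRU_FILE * !!file + !!active);
	return 0;
}

The four (file, active) combinations map onto indices 0..3, i.e. the four evictable lists; this expression never produces the unevictable index, which is handled separately.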
@@ -532,23 +479,29 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
532{ 479{
533 struct mem_cgroup *mem; 480 struct mem_cgroup *mem;
534 struct page_cgroup *pc; 481 struct page_cgroup *pc;
535 unsigned long flags;
536 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
537 struct mem_cgroup_per_zone *mz; 483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
538 485
539 pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); 486 pc = lookup_page_cgroup(page);
540 if (unlikely(pc == NULL)) 487 /* can happen at boot */
541 goto err; 488 if (unlikely(!pc))
542 489 return 0;
490 prefetchw(pc);
543 /* 491 /*
544 * We always charge the cgroup the mm_struct belongs to. 492 * We always charge the cgroup the mm_struct belongs to.
545 * The mm_struct's mem_cgroup changes on task migration if the 493 * The mm_struct's mem_cgroup changes on task migration if the
546 * thread group leader migrates. It's possible that mm is not 494 * thread group leader migrates. It's possible that mm is not
547 * set, if so charge the init_mm (happens for pagecache usage). 495 * set, if so charge the init_mm (happens for pagecache usage).
548 */ 496 */
497
549 if (likely(!memcg)) { 498 if (likely(!memcg)) {
550 rcu_read_lock(); 499 rcu_read_lock();
551 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
501 if (unlikely(!mem)) {
502 rcu_read_unlock();
503 return 0;
504 }
552 /* 505 /*
553 * For every charge from the cgroup, increment reference count 506 * For every charge from the cgroup, increment reference count
554 */ 507 */
@@ -559,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
559 css_get(&memcg->css); 512 css_get(&memcg->css);
560 } 513 }
561 514
562 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
563 if (!(gfp_mask & __GFP_WAIT)) 516 if (!(gfp_mask & __GFP_WAIT))
564 goto out; 517 goto out;
565 518
@@ -582,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
582 } 535 }
583 } 536 }
584 537
585 pc->mem_cgroup = mem;
586 pc->page = page;
587 /*
588 * If a page is accounted as a page cache, insert to inactive list.
589 * If anon, insert to active list.
590 */
591 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
592 pc->flags = PAGE_CGROUP_FLAG_CACHE;
593 else
594 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
595 538
596 lock_page_cgroup(page); 539 lock_page_cgroup(pc);
597 if (unlikely(page_get_page_cgroup(page))) { 540 if (unlikely(PageCgroupUsed(pc))) {
598 unlock_page_cgroup(page); 541 unlock_page_cgroup(pc);
599 res_counter_uncharge(&mem->res, PAGE_SIZE); 542 res_counter_uncharge(&mem->res, PAGE_SIZE);
600 css_put(&mem->css); 543 css_put(&mem->css);
601 kmem_cache_free(page_cgroup_cache, pc); 544
602 goto done; 545 goto done;
603 } 546 }
604 page_assign_page_cgroup(page, pc); 547 pc->mem_cgroup = mem;
548 /*
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype];
605 553
606 mz = page_cgroup_zoneinfo(pc); 554 mz = page_cgroup_zoneinfo(pc);
555
607 spin_lock_irqsave(&mz->lru_lock, flags); 556 spin_lock_irqsave(&mz->lru_lock, flags);
608 __mem_cgroup_add_list(mz, pc); 557 __mem_cgroup_add_list(mz, pc);
609 spin_unlock_irqrestore(&mz->lru_lock, flags); 558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc);
610 560
611 unlock_page_cgroup(page);
612done: 561done:
613 return 0; 562 return 0;
614out: 563out:
615 css_put(&mem->css); 564 css_put(&mem->css);
616 kmem_cache_free(page_cgroup_cache, pc);
617err:
618 return -ENOMEM; 565 return -ENOMEM;
619} 566}
620 567
@@ -622,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
622{ 569{
623 if (mem_cgroup_subsys.disabled) 570 if (mem_cgroup_subsys.disabled)
624 return 0; 571 return 0;
625 572 if (PageCompound(page))
573 return 0;
626 /* 574 /*
627 * If already mapped, we don't have to account. 575 * If already mapped, we don't have to account.
628 * If page cache, page->mapping has address_space. 576 * If page cache, page->mapping has address_space.
@@ -643,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
643{ 591{
644 if (mem_cgroup_subsys.disabled) 592 if (mem_cgroup_subsys.disabled)
645 return 0; 593 return 0;
646 594 if (PageCompound(page))
595 return 0;
647 /* 596 /*
648 * Corner case handling. This is called from add_to_page_cache() 597 * Corner case handling. This is called from add_to_page_cache()
649 * in usual. But some FS (shmem) precharges this page before calling it 598 * in usual. But some FS (shmem) precharges this page before calling it
@@ -656,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
656 if (!(gfp_mask & __GFP_WAIT)) { 605 if (!(gfp_mask & __GFP_WAIT)) {
657 struct page_cgroup *pc; 606 struct page_cgroup *pc;
658 607
659 lock_page_cgroup(page); 608
660 pc = page_get_page_cgroup(page); 609 pc = lookup_page_cgroup(page);
661 if (pc) { 610 if (!pc)
662 VM_BUG_ON(pc->page != page); 611 return 0;
663 VM_BUG_ON(!pc->mem_cgroup); 612 lock_page_cgroup(pc);
664 unlock_page_cgroup(page); 613 if (PageCgroupUsed(pc)) {
614 unlock_page_cgroup(pc);
665 return 0; 615 return 0;
666 } 616 }
667 unlock_page_cgroup(page); 617 unlock_page_cgroup(pc);
668 } 618 }
669 619
670 if (unlikely(!mm)) 620 if (unlikely(!mm))
671 mm = &init_mm; 621 mm = &init_mm;
672 622
673 return mem_cgroup_charge_common(page, mm, gfp_mask, 623 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask,
674 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else
627 return mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
675} 629}
676 630
677/* 631/*
@@ -691,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
691 /* 645 /*
692 * Check if our page_cgroup is valid 646 * Check if our page_cgroup is valid
693 */ 647 */
694 lock_page_cgroup(page); 648 pc = lookup_page_cgroup(page);
695 pc = page_get_page_cgroup(page); 649 if (unlikely(!pc || !PageCgroupUsed(pc)))
696 if (unlikely(!pc)) 650 return;
697 goto unlock;
698
699 VM_BUG_ON(pc->page != page);
700 651
701 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) 652 lock_page_cgroup(pc);
702 && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) 653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
703 || page_mapped(page))) 654 || !PageCgroupUsed(pc)) {
704 goto unlock; 655 /* This happens at race in zap_pte_range() and do_swap_page()*/
656 unlock_page_cgroup(pc);
657 return;
658 }
659 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup;
705 661
706 mz = page_cgroup_zoneinfo(pc); 662 mz = page_cgroup_zoneinfo(pc);
707 spin_lock_irqsave(&mz->lru_lock, flags); 663 spin_lock_irqsave(&mz->lru_lock, flags);
708 __mem_cgroup_remove_list(mz, pc); 664 __mem_cgroup_remove_list(mz, pc);
709 spin_unlock_irqrestore(&mz->lru_lock, flags); 665 spin_unlock_irqrestore(&mz->lru_lock, flags);
666 unlock_page_cgroup(pc);
710 667
711 page_assign_page_cgroup(page, NULL);
712 unlock_page_cgroup(page);
713
714 mem = pc->mem_cgroup;
715 res_counter_uncharge(&mem->res, PAGE_SIZE); 668 res_counter_uncharge(&mem->res, PAGE_SIZE);
716 css_put(&mem->css); 669 css_put(&mem->css);
717 670
718 kmem_cache_free(page_cgroup_cache, pc);
719 return; 671 return;
720unlock:
721 unlock_page_cgroup(page);
722} 672}
723 673
724void mem_cgroup_uncharge_page(struct page *page) 674void mem_cgroup_uncharge_page(struct page *page)
725{ 675{
676 /* early check. */
677 if (page_mapped(page))
678 return;
679 if (page->mapping && !PageAnon(page))
680 return;
726 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); 681 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED);
727} 682}
728 683
729void mem_cgroup_uncharge_cache_page(struct page *page) 684void mem_cgroup_uncharge_cache_page(struct page *page)
730{ 685{
731 VM_BUG_ON(page_mapped(page)); 686 VM_BUG_ON(page_mapped(page));
687 VM_BUG_ON(page->mapping);
732 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); 688 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
733} 689}
734 690
@@ -745,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
745 if (mem_cgroup_subsys.disabled) 701 if (mem_cgroup_subsys.disabled)
746 return 0; 702 return 0;
747 703
748 lock_page_cgroup(page); 704 pc = lookup_page_cgroup(page);
749 pc = page_get_page_cgroup(page); 705 lock_page_cgroup(pc);
750 if (pc) { 706 if (PageCgroupUsed(pc)) {
751 mem = pc->mem_cgroup; 707 mem = pc->mem_cgroup;
752 css_get(&mem->css); 708 css_get(&mem->css);
753 if (pc->flags & PAGE_CGROUP_FLAG_CACHE) 709 if (PageCgroupCache(pc)) {
754 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
755 } 715 }
756 unlock_page_cgroup(page); 716 unlock_page_cgroup(pc);
757 if (mem) { 717 if (mem) {
758 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, 718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL,
759 ctype, mem); 719 ctype, mem);
@@ -778,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage)
778 */ 738 */
779 if (!newpage->mapping) 739 if (!newpage->mapping)
780 __mem_cgroup_uncharge_common(newpage, 740 __mem_cgroup_uncharge_common(newpage,
781 MEM_CGROUP_CHARGE_TYPE_FORCE); 741 MEM_CGROUP_CHARGE_TYPE_FORCE);
782 else if (PageAnon(newpage)) 742 else if (PageAnon(newpage))
783 mem_cgroup_uncharge_page(newpage); 743 mem_cgroup_uncharge_page(newpage);
784} 744}
@@ -801,11 +761,16 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
801 761
802 rcu_read_lock(); 762 rcu_read_lock();
803 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
767 }
804 css_get(&mem->css); 768 css_get(&mem->css);
805 rcu_read_unlock(); 769 rcu_read_unlock();
806 770
807 do { 771 do {
808 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); 772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask);
773 progress += res_counter_check_under_limit(&mem->res);
809 } while (!progress && --retry); 774 } while (!progress && --retry);
810 775
811 css_put(&mem->css); 776 css_put(&mem->css);
@@ -845,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val)
845#define FORCE_UNCHARGE_BATCH (128) 810#define FORCE_UNCHARGE_BATCH (128)
846static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
847 struct mem_cgroup_per_zone *mz, 812 struct mem_cgroup_per_zone *mz,
848 int active) 813 enum lru_list lru)
849{ 814{
850 struct page_cgroup *pc; 815 struct page_cgroup *pc;
851 struct page *page; 816 struct page *page;
@@ -853,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
853 unsigned long flags; 818 unsigned long flags;
854 struct list_head *list; 819 struct list_head *list;
855 820
856 if (active) 821 list = &mz->lists[lru];
857 list = &mz->active_list;
858 else
859 list = &mz->inactive_list;
860 822
861 spin_lock_irqsave(&mz->lru_lock, flags); 823 spin_lock_irqsave(&mz->lru_lock, flags);
862 while (!list_empty(list)) { 824 while (!list_empty(list)) {
863 pc = list_entry(list->prev, struct page_cgroup, lru); 825 pc = list_entry(list->prev, struct page_cgroup, lru);
864 page = pc->page; 826 page = pc->page;
827 if (!PageCgroupUsed(pc))
828 break;
865 get_page(page); 829 get_page(page);
866 spin_unlock_irqrestore(&mz->lru_lock, flags); 830 spin_unlock_irqrestore(&mz->lru_lock, flags);
867 /* 831 /*
@@ -876,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
876 count = FORCE_UNCHARGE_BATCH; 840 count = FORCE_UNCHARGE_BATCH;
877 cond_resched(); 841 cond_resched();
878 } 842 }
879 } else 843 } else {
880 cond_resched(); 844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break;
846 }
881 spin_lock_irqsave(&mz->lru_lock, flags); 847 spin_lock_irqsave(&mz->lru_lock, flags);
882 } 848 }
883 spin_unlock_irqrestore(&mz->lru_lock, flags); 849 spin_unlock_irqrestore(&mz->lru_lock, flags);
@@ -901,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem)
901 while (mem->res.usage > 0) { 867 while (mem->res.usage > 0) {
902 if (atomic_read(&mem->css.cgroup->count) > 0) 868 if (atomic_read(&mem->css.cgroup->count) > 0)
903 goto out; 869 goto out;
870 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all();
904 for_each_node_state(node, N_POSSIBLE) 872 for_each_node_state(node, N_POSSIBLE)
905 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 873 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
906 struct mem_cgroup_per_zone *mz; 874 struct mem_cgroup_per_zone *mz;
875 enum lru_list l;
907 mz = mem_cgroup_zoneinfo(mem, node, zid); 876 mz = mem_cgroup_zoneinfo(mem, node, zid);
908 /* drop all page_cgroup in active_list */ 877 for_each_lru(l)
909 mem_cgroup_force_empty_list(mem, mz, 1); 878 mem_cgroup_force_empty_list(mem, mz, l);
910 /* drop all page_cgroup in inactive_list */
911 mem_cgroup_force_empty_list(mem, mz, 0);
912 } 879 }
880 cond_resched();
913 } 881 }
914 ret = 0; 882 ret = 0;
915out: 883out:
@@ -994,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
994 } 962 }
995 /* showing # of active pages */ 963 /* showing # of active pages */
996 { 964 {
997 unsigned long active, inactive; 965 unsigned long active_anon, inactive_anon;
998 966 unsigned long active_file, inactive_file;
999 inactive = mem_cgroup_get_all_zonestat(mem_cont, 967 unsigned long unevictable;
1000 MEM_CGROUP_ZSTAT_INACTIVE); 968
1001 active = mem_cgroup_get_all_zonestat(mem_cont, 969 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1002 MEM_CGROUP_ZSTAT_ACTIVE); 970 LRU_INACTIVE_ANON);
1003 cb->fill(cb, "active", (active) * PAGE_SIZE); 971 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1004 cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); 972 LRU_ACTIVE_ANON);
973 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
974 LRU_INACTIVE_FILE);
975 active_file = mem_cgroup_get_all_zonestat(mem_cont,
976 LRU_ACTIVE_FILE);
977 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
978 LRU_UNEVICTABLE);
979
980 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
981 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
982 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
983 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985
1005 } 986 }
1006 return 0; 987 return 0;
1007} 988}
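With the hunk above, the per-cgroup stat file reports the five LRU sizes separately (active_anon, inactive_anon, active_file, inactive_file, unevictable), each already scaled to bytes by PAGE_SIZE. A small userspace reader, assuming the memory controller is mounted under /cgroup (the path is an assumption; adjust it to the actual mount point):

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Mount point is an assumption; the key names match the cb->fill()
	 * calls above, and the values are reported in bytes. */
	const char *path = "/cgroup/memory.stat";
	char line[256];
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);
		return 1;
	}
	while (fgets(line, sizeof(line), f)) {
		if (!strncmp(line, "active_anon", 11) ||
		    !strncmp(line, "inactive_anon", 13) ||
		    !strncmp(line, "active_file", 11) ||
		    !strncmp(line, "inactive_file", 13) ||
		    !strncmp(line, "unevictable", 11))
			fputs(line, stdout);
	}
	fclose(f);
	return 0;
}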
@@ -1044,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1044{ 1025{
1045 struct mem_cgroup_per_node *pn; 1026 struct mem_cgroup_per_node *pn;
1046 struct mem_cgroup_per_zone *mz; 1027 struct mem_cgroup_per_zone *mz;
1028 enum lru_list l;
1047 int zone, tmp = node; 1029 int zone, tmp = node;
1048 /* 1030 /*
1049 * This routine is called against possible nodes. 1031 * This routine is called against possible nodes.
@@ -1064,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1064 1046
1065 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1066 mz = &pn->zoneinfo[zone]; 1048 mz = &pn->zoneinfo[zone];
1067 INIT_LIST_HEAD(&mz->active_list);
1068 INIT_LIST_HEAD(&mz->inactive_list);
1069 spin_lock_init(&mz->lru_lock); 1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]);
1070 } 1052 }
1071 return 0; 1053 return 0;
1072} 1054}
@@ -1107,7 +1089,6 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1107 1089
1108 if (unlikely((cont->parent) == NULL)) { 1090 if (unlikely((cont->parent) == NULL)) {
1109 mem = &init_mem_cgroup; 1091 mem = &init_mem_cgroup;
1110 page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC);
1111 } else { 1092 } else {
1112 mem = mem_cgroup_alloc(); 1093 mem = mem_cgroup_alloc();
1113 if (!mem) 1094 if (!mem)
diff --git a/mm/memory.c b/mm/memory.c
index 1002f473f497..fc031d68327e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1129,12 +1129,17 @@ static inline int use_zero_page(struct vm_area_struct *vma)
1129 return !vma->vm_ops || !vma->vm_ops->fault; 1129 return !vma->vm_ops || !vma->vm_ops->fault;
1130} 1130}
1131 1131
1132int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, 1132
1133 unsigned long start, int len, int write, int force, 1133
1134int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1135 unsigned long start, int len, int flags,
1134 struct page **pages, struct vm_area_struct **vmas) 1136 struct page **pages, struct vm_area_struct **vmas)
1135{ 1137{
1136 int i; 1138 int i;
1137 unsigned int vm_flags; 1139 unsigned int vm_flags = 0;
1140 int write = !!(flags & GUP_FLAGS_WRITE);
1141 int force = !!(flags & GUP_FLAGS_FORCE);
1142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1138 1143
1139 if (len <= 0) 1144 if (len <= 0)
1140 return 0; 1145 return 0;
@@ -1158,7 +1163,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1158 pud_t *pud; 1163 pud_t *pud;
1159 pmd_t *pmd; 1164 pmd_t *pmd;
1160 pte_t *pte; 1165 pte_t *pte;
1161 if (write) /* user gate pages are read-only */ 1166
1167 /* user gate pages are read-only */
1168 if (!ignore && write)
1162 return i ? : -EFAULT; 1169 return i ? : -EFAULT;
1163 if (pg > TASK_SIZE) 1170 if (pg > TASK_SIZE)
1164 pgd = pgd_offset_k(pg); 1171 pgd = pgd_offset_k(pg);
@@ -1190,8 +1197,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1190 continue; 1197 continue;
1191 } 1198 }
1192 1199
1193 if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) 1200 if (!vma ||
1194 || !(vm_flags & vma->vm_flags)) 1201 (vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
1202 (!ignore && !(vm_flags & vma->vm_flags)))
1195 return i ? : -EFAULT; 1203 return i ? : -EFAULT;
1196 1204
1197 if (is_vm_hugetlb_page(vma)) { 1205 if (is_vm_hugetlb_page(vma)) {
@@ -1266,6 +1274,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1266 } while (len); 1274 } while (len);
1267 return i; 1275 return i;
1268} 1276}
1277
1278int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1279 unsigned long start, int len, int write, int force,
1280 struct page **pages, struct vm_area_struct **vmas)
1281{
1282 int flags = 0;
1283
1284 if (write)
1285 flags |= GUP_FLAGS_WRITE;
1286 if (force)
1287 flags |= GUP_FLAGS_FORCE;
1288
1289 return __get_user_pages(tsk, mm,
1290 start, len, flags,
1291 pages, vmas);
1292}
1293
1269EXPORT_SYMBOL(get_user_pages); 1294EXPORT_SYMBOL(get_user_pages);
1270 1295
1271pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, 1296pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr,
@@ -1296,18 +1321,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1296 pte_t *pte; 1321 pte_t *pte;
1297 spinlock_t *ptl; 1322 spinlock_t *ptl;
1298 1323
1299 retval = mem_cgroup_charge(page, mm, GFP_KERNEL);
1300 if (retval)
1301 goto out;
1302
1303 retval = -EINVAL; 1324 retval = -EINVAL;
1304 if (PageAnon(page)) 1325 if (PageAnon(page))
1305 goto out_uncharge; 1326 goto out;
1306 retval = -ENOMEM; 1327 retval = -ENOMEM;
1307 flush_dcache_page(page); 1328 flush_dcache_page(page);
1308 pte = get_locked_pte(mm, addr, &ptl); 1329 pte = get_locked_pte(mm, addr, &ptl);
1309 if (!pte) 1330 if (!pte)
1310 goto out_uncharge; 1331 goto out;
1311 retval = -EBUSY; 1332 retval = -EBUSY;
1312 if (!pte_none(*pte)) 1333 if (!pte_none(*pte))
1313 goto out_unlock; 1334 goto out_unlock;
@@ -1323,8 +1344,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
1323 return retval; 1344 return retval;
1324out_unlock: 1345out_unlock:
1325 pte_unmap_unlock(pte, ptl); 1346 pte_unmap_unlock(pte, ptl);
1326out_uncharge:
1327 mem_cgroup_uncharge_page(page);
1328out: 1347out:
1329 return retval; 1348 return retval;
1330} 1349}
@@ -1858,6 +1877,15 @@ gotten:
1858 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); 1877 new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
1859 if (!new_page) 1878 if (!new_page)
1860 goto oom; 1879 goto oom;
1880 /*
1881 * Don't let another task, with possibly unlocked vma,
1882 * keep the mlocked page.
1883 */
1884 if (vma->vm_flags & VM_LOCKED) {
1885 lock_page(old_page); /* for LRU manipulation */
1886 clear_page_mlock(old_page);
1887 unlock_page(old_page);
1888 }
1861 cow_user_page(new_page, old_page, address, vma); 1889 cow_user_page(new_page, old_page, address, vma);
1862 __SetPageUptodate(new_page); 1890 __SetPageUptodate(new_page);
1863 1891
@@ -1886,11 +1914,13 @@ gotten:
1886 * thread doing COW. 1914 * thread doing COW.
1887 */ 1915 */
1888 ptep_clear_flush_notify(vma, address, page_table); 1916 ptep_clear_flush_notify(vma, address, page_table);
1889 set_pte_at(mm, address, page_table, entry); 1917 SetPageSwapBacked(new_page);
1890 update_mmu_cache(vma, address, entry); 1918 lru_cache_add_active_or_unevictable(new_page, vma);
1891 lru_cache_add_active(new_page);
1892 page_add_new_anon_rmap(new_page, vma, address); 1919 page_add_new_anon_rmap(new_page, vma, address);
1893 1920
1921//TODO: is this safe? do_anonymous_page() does it this way.
1922 set_pte_at(mm, address, page_table, entry);
1923 update_mmu_cache(vma, address, entry);
1894 if (old_page) { 1924 if (old_page) {
1895 /* 1925 /*
1896 * Only after switching the pte to the new page may 1926 * Only after switching the pte to the new page may
@@ -2288,16 +2318,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2288 count_vm_event(PGMAJFAULT); 2318 count_vm_event(PGMAJFAULT);
2289 } 2319 }
2290 2320
2321 mark_page_accessed(page);
2322
2323 lock_page(page);
2324 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2325
2291 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2326 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2292 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2293 ret = VM_FAULT_OOM; 2327 ret = VM_FAULT_OOM;
2328 unlock_page(page);
2294 goto out; 2329 goto out;
2295 } 2330 }
2296 2331
2297 mark_page_accessed(page);
2298 lock_page(page);
2299 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2300
2301 /* 2332 /*
2302 * Back out if somebody else already faulted in this pte. 2333 * Back out if somebody else already faulted in this pte.
2303 */ 2334 */
@@ -2324,7 +2355,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2324 page_add_anon_rmap(page, vma, address); 2355 page_add_anon_rmap(page, vma, address);
2325 2356
2326 swap_free(entry); 2357 swap_free(entry);
2327 if (vm_swap_full()) 2358 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2328 remove_exclusive_swap_page(page); 2359 remove_exclusive_swap_page(page);
2329 unlock_page(page); 2360 unlock_page(page);
2330 2361
@@ -2382,7 +2413,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2382 if (!pte_none(*page_table)) 2413 if (!pte_none(*page_table))
2383 goto release; 2414 goto release;
2384 inc_mm_counter(mm, anon_rss); 2415 inc_mm_counter(mm, anon_rss);
2385 lru_cache_add_active(page); 2416 SetPageSwapBacked(page);
2417 lru_cache_add_active_or_unevictable(page, vma);
2386 page_add_new_anon_rmap(page, vma, address); 2418 page_add_new_anon_rmap(page, vma, address);
2387 set_pte_at(mm, address, page_table, entry); 2419 set_pte_at(mm, address, page_table, entry);
2388 2420
@@ -2423,6 +2455,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2423 struct page *page; 2455 struct page *page;
2424 pte_t entry; 2456 pte_t entry;
2425 int anon = 0; 2457 int anon = 0;
2458 int charged = 0;
2426 struct page *dirty_page = NULL; 2459 struct page *dirty_page = NULL;
2427 struct vm_fault vmf; 2460 struct vm_fault vmf;
2428 int ret; 2461 int ret;
@@ -2463,6 +2496,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2463 ret = VM_FAULT_OOM; 2496 ret = VM_FAULT_OOM;
2464 goto out; 2497 goto out;
2465 } 2498 }
2499 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2500 ret = VM_FAULT_OOM;
2501 page_cache_release(page);
2502 goto out;
2503 }
2504 charged = 1;
2505 /*
2506 * Don't let another task, with possibly unlocked vma,
2507 * keep the mlocked page.
2508 */
2509 if (vma->vm_flags & VM_LOCKED)
2510 clear_page_mlock(vmf.page);
2466 copy_user_highpage(page, vmf.page, address, vma); 2511 copy_user_highpage(page, vmf.page, address, vma);
2467 __SetPageUptodate(page); 2512 __SetPageUptodate(page);
2468 } else { 2513 } else {
@@ -2497,11 +2542,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2497 2542
2498 } 2543 }
2499 2544
2500 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
2501 ret = VM_FAULT_OOM;
2502 goto out;
2503 }
2504
2505 page_table = pte_offset_map_lock(mm, pmd, address, &ptl); 2545 page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
2506 2546
2507 /* 2547 /*
@@ -2520,11 +2560,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2520 entry = mk_pte(page, vma->vm_page_prot); 2560 entry = mk_pte(page, vma->vm_page_prot);
2521 if (flags & FAULT_FLAG_WRITE) 2561 if (flags & FAULT_FLAG_WRITE)
2522 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2562 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2523 set_pte_at(mm, address, page_table, entry);
2524 if (anon) { 2563 if (anon) {
2525 inc_mm_counter(mm, anon_rss); 2564 inc_mm_counter(mm, anon_rss);
2526 lru_cache_add_active(page); 2565 SetPageSwapBacked(page);
2527 page_add_new_anon_rmap(page, vma, address); 2566 lru_cache_add_active_or_unevictable(page, vma);
2567 page_add_new_anon_rmap(page, vma, address);
2528 } else { 2568 } else {
2529 inc_mm_counter(mm, file_rss); 2569 inc_mm_counter(mm, file_rss);
2530 page_add_file_rmap(page); 2570 page_add_file_rmap(page);
@@ -2533,11 +2573,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2533 get_page(dirty_page); 2573 get_page(dirty_page);
2534 } 2574 }
2535 } 2575 }
2576//TODO: is this safe? do_anonymous_page() does it this way.
2577 set_pte_at(mm, address, page_table, entry);
2536 2578
2537 /* no need to invalidate: a not-present page won't be cached */ 2579 /* no need to invalidate: a not-present page won't be cached */
2538 update_mmu_cache(vma, address, entry); 2580 update_mmu_cache(vma, address, entry);
2539 } else { 2581 } else {
2540 mem_cgroup_uncharge_page(page); 2582 if (charged)
2583 mem_cgroup_uncharge_page(page);
2541 if (anon) 2584 if (anon)
2542 page_cache_release(page); 2585 page_cache_release(page);
2543 else 2586 else
@@ -2772,19 +2815,9 @@ int make_pages_present(unsigned long addr, unsigned long end)
2772 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; 2815 len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
2773 ret = get_user_pages(current, current->mm, addr, 2816 ret = get_user_pages(current, current->mm, addr,
2774 len, write, 0, NULL, NULL); 2817 len, write, 0, NULL, NULL);
2775 if (ret < 0) { 2818 if (ret < 0)
2776 /*
2777 SUS require strange return value to mlock
2778 - invalid addr generate to ENOMEM.
2779 - out of memory should generate EAGAIN.
2780 */
2781 if (ret == -EFAULT)
2782 ret = -ENOMEM;
2783 else if (ret == -ENOMEM)
2784 ret = -EAGAIN;
2785 return ret; 2819 return ret;
2786 } 2820 return ret == len ? 0 : -EFAULT;
2787 return ret == len ? 0 : -ENOMEM;
2788} 2821}
2789 2822
2790#if !defined(__HAVE_ARCH_GATE_AREA) 2823#if !defined(__HAVE_ARCH_GATE_AREA)
@@ -3016,3 +3049,18 @@ void print_vma_addr(char *prefix, unsigned long ip)
3016 } 3049 }
3017 up_read(&current->mm->mmap_sem); 3050 up_read(&current->mm->mmap_sem);
3018} 3051}
3052
3053#ifdef CONFIG_PROVE_LOCKING
3054void might_fault(void)
3055{
3056 might_sleep();
3057 /*
3058 * it would be nicer only to annotate paths which are not under
3059 * pagefault_disable, however that requires a larger audit and
3060 * providing helpers like get_user_atomic.
3061 */
3062 if (!in_atomic() && current->mm)
3063 might_lock_read(&current->mm->mmap_sem);
3064}
3065EXPORT_SYMBOL(might_fault);
3066#endif
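Among the memory.c changes above, do_wp_page() now clears the mlock state of the old page before copying when the faulting vma is VM_LOCKED, so the page being replaced does not linger as mlocked, and the new anonymous page is added via lru_cache_add_active_or_unevictable(). A minimal userspace way to drive that copy-on-write-under-mlock path (illustrative only; it triggers the fault sequence but cannot show the internal LRU bookkeeping, and mlock() may need a raised RLIMIT_MEMLOCK):

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	char tmpl[] = "/tmp/cow-mlock-XXXXXX";
	int fd = mkstemp(tmpl);
	char *buf, *map;

	if (fd < 0) { perror("mkstemp"); return 1; }
	buf = calloc(1, pagesz);
	if (!buf || write(fd, buf, pagesz) != pagesz) { perror("write"); return 1; }

	map = mmap(NULL, pagesz, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
	if (map == MAP_FAILED) { perror("mmap"); return 1; }
	if (mlock(map, pagesz))		/* may fail without CAP_IPC_LOCK / ulimit -l */
		perror("mlock");

	map[0] = 'x';	/* write fault: COW replaces the mlocked file page */

	munlock(map, pagesz);
	munmap(map, pagesz);
	close(fd);
	unlink(tmpl);
	free(buf);
	return 0;
}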
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 89fee2dcb039..b5b2b15085a8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -22,10 +22,10 @@
22#include <linux/highmem.h> 22#include <linux/highmem.h>
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/ioport.h> 24#include <linux/ioport.h>
25#include <linux/cpuset.h>
26#include <linux/delay.h> 25#include <linux/delay.h>
27#include <linux/migrate.h> 26#include <linux/migrate.h>
28#include <linux/page-isolation.h> 27#include <linux/page-isolation.h>
28#include <linux/pfn.h>
29 29
30#include <asm/tlbflush.h> 30#include <asm/tlbflush.h>
31 31
@@ -323,11 +323,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
323 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); 323 BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
324 BUG_ON(nr_pages % PAGES_PER_SECTION); 324 BUG_ON(nr_pages % PAGES_PER_SECTION);
325 325
326 release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
327
328 sections_to_remove = nr_pages / PAGES_PER_SECTION; 326 sections_to_remove = nr_pages / PAGES_PER_SECTION;
329 for (i = 0; i < sections_to_remove; i++) { 327 for (i = 0; i < sections_to_remove; i++) {
330 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; 328 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
329 release_mem_region(pfn << PAGE_SHIFT,
330 PAGES_PER_SECTION << PAGE_SHIFT);
331 ret = __remove_section(zone, __pfn_to_section(pfn)); 331 ret = __remove_section(zone, __pfn_to_section(pfn));
332 if (ret) 332 if (ret)
333 break; 333 break;
@@ -497,8 +497,6 @@ int add_memory(int nid, u64 start, u64 size)
497 /* we online node here. we can't roll back from here. */ 497 /* we online node here. we can't roll back from here. */
498 node_set_online(nid); 498 node_set_online(nid);
499 499
500 cpuset_track_online_nodes();
501
502 if (new_pgdat) { 500 if (new_pgdat) {
503 ret = register_one_node(nid); 501 ret = register_one_node(nid);
504 /* 502 /*
@@ -657,8 +655,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
657 * We can skip free pages. And we can only deal with pages on 655 * We can skip free pages. And we can only deal with pages on
658 * LRU. 656 * LRU.
659 */ 657 */
660 ret = isolate_lru_page(page, &source); 658 ret = isolate_lru_page(page);
661 if (!ret) { /* Success */ 659 if (!ret) { /* Success */
660 list_add_tail(&page->lru, &source);
662 move_pages--; 661 move_pages--;
663 } else { 662 } else {
 664 /* Because we don't have big zone->lock. we should 663
@@ -849,10 +848,19 @@ failed_removal:
849 848
850 return ret; 849 return ret;
851} 850}
851
852int remove_memory(u64 start, u64 size)
853{
854 unsigned long start_pfn, end_pfn;
855
856 start_pfn = PFN_DOWN(start);
857 end_pfn = start_pfn + PFN_DOWN(size);
858 return offline_pages(start_pfn, end_pfn, 120 * HZ);
859}
852#else 860#else
853int remove_memory(u64 start, u64 size) 861int remove_memory(u64 start, u64 size)
854{ 862{
855 return -EINVAL; 863 return -EINVAL;
856} 864}
857EXPORT_SYMBOL_GPL(remove_memory);
858#endif /* CONFIG_MEMORY_HOTREMOVE */ 865#endif /* CONFIG_MEMORY_HOTREMOVE */
866EXPORT_SYMBOL_GPL(remove_memory);
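remove_memory() above now derives the pfn range itself with PFN_DOWN() and hands it to offline_pages() with a 120*HZ timeout. PFN_DOWN(x) is simply x >> PAGE_SHIFT; a quick check of that conversion, with a PAGE_SHIFT of 12 (4 KiB pages) assumed for the example:

#include <stdio.h>

#define PAGE_SHIFT	12			/* assumed 4 KiB pages */
#define PFN_DOWN(x)	((x) >> PAGE_SHIFT)

int main(void)
{
	unsigned long long start = 0x40000000ULL;	/* 1 GiB */
	unsigned long long size  = 0x08000000ULL;	/* 128 MiB */
	unsigned long start_pfn = PFN_DOWN(start);
	unsigned long end_pfn   = start_pfn + PFN_DOWN(size);

	printf("offlining pfns [%#lx, %#lx)\n", start_pfn, end_pfn);
	return 0;
}

For these example values the range comes out as [0x40000, 0x48000), i.e. 32768 pages covering the 128 MiB being offlined.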
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 83369058ec13..e9493b1c1117 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -93,6 +93,8 @@
93#include <asm/tlbflush.h> 93#include <asm/tlbflush.h>
94#include <asm/uaccess.h> 94#include <asm/uaccess.h>
95 95
96#include "internal.h"
97
96/* Internal flags */ 98/* Internal flags */
97#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ 99#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
98#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ 100#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
@@ -487,12 +489,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
487 int err; 489 int err;
488 struct vm_area_struct *first, *vma, *prev; 490 struct vm_area_struct *first, *vma, *prev;
489 491
490 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
491
492 err = migrate_prep();
493 if (err)
494 return ERR_PTR(err);
495 }
496 492
497 first = find_vma(mm, start); 493 first = find_vma(mm, start);
498 if (!first) 494 if (!first)
@@ -762,8 +758,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist,
762 /* 758 /*
763 * Avoid migrating a page that is shared with others. 759 * Avoid migrating a page that is shared with others.
764 */ 760 */
765 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) 761 if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
766 isolate_lru_page(page, pagelist); 762 if (!isolate_lru_page(page)) {
763 list_add_tail(&page->lru, pagelist);
764 }
765 }
767} 766}
768 767
769static struct page *new_node_page(struct page *page, unsigned long node, int **x) 768static struct page *new_node_page(struct page *page, unsigned long node, int **x)
@@ -804,9 +803,13 @@ int do_migrate_pages(struct mm_struct *mm,
804 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) 803 const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
805{ 804{
806 int busy = 0; 805 int busy = 0;
807 int err = 0; 806 int err;
808 nodemask_t tmp; 807 nodemask_t tmp;
809 808
809 err = migrate_prep();
810 if (err)
811 return err;
812
810 down_read(&mm->mmap_sem); 813 down_read(&mm->mmap_sem);
811 814
812 err = migrate_vmas(mm, from_nodes, to_nodes, flags); 815 err = migrate_vmas(mm, from_nodes, to_nodes, flags);
@@ -969,6 +972,12 @@ static long do_mbind(unsigned long start, unsigned long len,
969 start, start + len, mode, mode_flags, 972 start, start + len, mode, mode_flags,
970 nmask ? nodes_addr(*nmask)[0] : -1); 973 nmask ? nodes_addr(*nmask)[0] : -1);
971 974
975 if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
976
977 err = migrate_prep();
978 if (err)
979 return err;
980 }
972 down_write(&mm->mmap_sem); 981 down_write(&mm->mmap_sem);
973 vma = check_range(mm, start, end, nmask, 982 vma = check_range(mm, start, end, nmask,
974 flags | MPOL_MF_INVERT, &pagelist); 983 flags | MPOL_MF_INVERT, &pagelist);
@@ -2197,7 +2206,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
2197 if (PageSwapCache(page)) 2206 if (PageSwapCache(page))
2198 md->swapcache++; 2207 md->swapcache++;
2199 2208
2200 if (PageActive(page)) 2209 if (PageActive(page) || PageUnevictable(page))
2201 md->active++; 2210 md->active++;
2202 2211
2203 if (PageWriteback(page)) 2212 if (PageWriteback(page))
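In the mempolicy.c changes above, do_mbind() now calls migrate_prep() before taking mmap_sem, and migrate_page_add() follows the new isolate_lru_page() convention: 0 means the page was isolated and the caller queues it on its own list. From user space this path is reached through mbind() with MPOL_MF_MOVE; a minimal caller, assuming node 0 exists and the numactl development headers are installed (compile with -lnuma):

#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <numaif.h>		/* mbind(), MPOL_* flags; link with -lnuma */

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	size_t len = 16 * pagesz;
	unsigned long nodemask = 1UL;	/* node 0 -- an assumption */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) { perror("mmap"); return 1; }
	memset(p, 0, len);		/* populate, so there are pages to move */

	if (mbind(p, len, MPOL_BIND, &nodemask, sizeof(nodemask) * 8,
		  MPOL_MF_MOVE))
		perror("mbind");

	munmap(p, len);
	return 0;
}

On a single-node machine the call simply has nothing to migrate; on a multi-node machine it exercises the migrate_prep()/isolate_lru_page() sequence shown in the diff.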
diff --git a/mm/migrate.c b/mm/migrate.c
index 2a80136b23bb..1e0d6b237f44 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,36 +37,6 @@
37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 37#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
38 38
39/* 39/*
40 * Isolate one page from the LRU lists. If successful put it onto
41 * the indicated list with elevated page count.
42 *
43 * Result:
44 * -EBUSY: page not on LRU list
45 * 0: page removed from LRU list and added to the specified list.
46 */
47int isolate_lru_page(struct page *page, struct list_head *pagelist)
48{
49 int ret = -EBUSY;
50
51 if (PageLRU(page)) {
52 struct zone *zone = page_zone(page);
53
54 spin_lock_irq(&zone->lru_lock);
55 if (PageLRU(page) && get_page_unless_zero(page)) {
56 ret = 0;
57 ClearPageLRU(page);
58 if (PageActive(page))
59 del_page_from_active_list(zone, page);
60 else
61 del_page_from_inactive_list(zone, page);
62 list_add_tail(&page->lru, pagelist);
63 }
64 spin_unlock_irq(&zone->lru_lock);
65 }
66 return ret;
67}
68
69/*
70 * migrate_prep() needs to be called before we start compiling a list of pages 40 * migrate_prep() needs to be called before we start compiling a list of pages
71 * to be migrated using isolate_lru_page(). 41 * to be migrated using isolate_lru_page().
72 */ 42 */
@@ -83,23 +53,9 @@ int migrate_prep(void)
83 return 0; 53 return 0;
84} 54}
85 55
86static inline void move_to_lru(struct page *page)
87{
88 if (PageActive(page)) {
89 /*
90 * lru_cache_add_active checks that
91 * the PG_active bit is off.
92 */
93 ClearPageActive(page);
94 lru_cache_add_active(page);
95 } else {
96 lru_cache_add(page);
97 }
98 put_page(page);
99}
100
101/* 56/*
102 * Add isolated pages on the list back to the LRU. 57 * Add isolated pages on the list back to the LRU under page lock
58 * to avoid leaking evictable pages back onto unevictable list.
103 * 59 *
104 * returns the number of pages put back. 60 * returns the number of pages put back.
105 */ 61 */
@@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l)
111 67
112 list_for_each_entry_safe(page, page2, l, lru) { 68 list_for_each_entry_safe(page, page2, l, lru) {
113 list_del(&page->lru); 69 list_del(&page->lru);
114 move_to_lru(page); 70 putback_lru_page(page);
115 count++; 71 count++;
116 } 72 }
117 return count; 73 return count;
@@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping,
374 __inc_zone_page_state(newpage, NR_FILE_PAGES); 330 __inc_zone_page_state(newpage, NR_FILE_PAGES);
375 331
376 spin_unlock_irq(&mapping->tree_lock); 332 spin_unlock_irq(&mapping->tree_lock);
377 if (!PageSwapCache(newpage))
378 mem_cgroup_uncharge_cache_page(page);
379 333
380 return 0; 334 return 0;
381} 335}
@@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping,
385 */ 339 */
386static void migrate_page_copy(struct page *newpage, struct page *page) 340static void migrate_page_copy(struct page *newpage, struct page *page)
387{ 341{
342 int anon;
343
388 copy_highpage(newpage, page); 344 copy_highpage(newpage, page);
389 345
390 if (PageError(page)) 346 if (PageError(page))
@@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
393 SetPageReferenced(newpage); 349 SetPageReferenced(newpage);
394 if (PageUptodate(page)) 350 if (PageUptodate(page))
395 SetPageUptodate(newpage); 351 SetPageUptodate(newpage);
396 if (PageActive(page)) 352 if (TestClearPageActive(page)) {
353 VM_BUG_ON(PageUnevictable(page));
397 SetPageActive(newpage); 354 SetPageActive(newpage);
355 } else
356 unevictable_migrate_page(newpage, page);
398 if (PageChecked(page)) 357 if (PageChecked(page))
399 SetPageChecked(newpage); 358 SetPageChecked(newpage);
400 if (PageMappedToDisk(page)) 359 if (PageMappedToDisk(page))
@@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
412 __set_page_dirty_nobuffers(newpage); 371 __set_page_dirty_nobuffers(newpage);
413 } 372 }
414 373
374 mlock_migrate_page(newpage, page);
375
415#ifdef CONFIG_SWAP 376#ifdef CONFIG_SWAP
416 ClearPageSwapCache(page); 377 ClearPageSwapCache(page);
417#endif 378#endif
418 ClearPageActive(page);
419 ClearPagePrivate(page); 379 ClearPagePrivate(page);
420 set_page_private(page, 0); 380 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page);
421 page->mapping = NULL; 383 page->mapping = NULL;
422 384
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
423 /* 388 /*
424 * If any waiters have accumulated on the new page then 389 * If any waiters have accumulated on the new page then
425 * wake them up. 390 * wake them up.
@@ -557,15 +522,12 @@ static int writeout(struct address_space *mapping, struct page *page)
557 remove_migration_ptes(page, page); 522 remove_migration_ptes(page, page);
558 523
559 rc = mapping->a_ops->writepage(page, &wbc); 524 rc = mapping->a_ops->writepage(page, &wbc);
560 if (rc < 0)
561 /* I/O Error writing */
562 return -EIO;
563 525
564 if (rc != AOP_WRITEPAGE_ACTIVATE) 526 if (rc != AOP_WRITEPAGE_ACTIVATE)
565 /* unlocked. Relock */ 527 /* unlocked. Relock */
566 lock_page(page); 528 lock_page(page);
567 529
568 return -EAGAIN; 530 return (rc < 0) ? -EIO : -EAGAIN;
569} 531}
570 532
571/* 533/*
@@ -594,6 +556,10 @@ static int fallback_migrate_page(struct address_space *mapping,
594 * 556 *
595 * The new page will have replaced the old page if this function 557 * The new page will have replaced the old page if this function
596 * is successful. 558 * is successful.
559 *
560 * Return value:
561 * < 0 - error code
562 * == 0 - success
597 */ 563 */
598static int move_to_new_page(struct page *newpage, struct page *page) 564static int move_to_new_page(struct page *newpage, struct page *page)
599{ 565{
@@ -611,6 +577,8 @@ static int move_to_new_page(struct page *newpage, struct page *page)
611 /* Prepare mapping for the new page.*/ 577 /* Prepare mapping for the new page.*/
612 newpage->index = page->index; 578 newpage->index = page->index;
613 newpage->mapping = page->mapping; 579 newpage->mapping = page->mapping;
580 if (PageSwapBacked(page))
581 SetPageSwapBacked(newpage);
614 582
615 mapping = page_mapping(page); 583 mapping = page_mapping(page);
616 if (!mapping) 584 if (!mapping)
@@ -654,9 +622,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
654 if (!newpage) 622 if (!newpage)
655 return -ENOMEM; 623 return -ENOMEM;
656 624
657 if (page_count(page) == 1) 625 if (page_count(page) == 1) {
658 /* page was freed from under us. So we are done. */ 626 /* page was freed from under us. So we are done. */
659 goto move_newpage; 627 goto move_newpage;
628 }
660 629
661 charge = mem_cgroup_prepare_migration(page, newpage); 630 charge = mem_cgroup_prepare_migration(page, newpage);
662 if (charge == -ENOMEM) { 631 if (charge == -ENOMEM) {
@@ -730,7 +699,6 @@ rcu_unlock:
730 rcu_read_unlock(); 699 rcu_read_unlock();
731 700
732unlock: 701unlock:
733
734 unlock_page(page); 702 unlock_page(page);
735 703
736 if (rc != -EAGAIN) { 704 if (rc != -EAGAIN) {
@@ -741,17 +709,19 @@ unlock:
741 * restored. 709 * restored.
742 */ 710 */
743 list_del(&page->lru); 711 list_del(&page->lru);
744 move_to_lru(page); 712 putback_lru_page(page);
745 } 713 }
746 714
747move_newpage: 715move_newpage:
748 if (!charge) 716 if (!charge)
749 mem_cgroup_end_migration(newpage); 717 mem_cgroup_end_migration(newpage);
718
750 /* 719 /*
751 * Move the new page to the LRU. If migration was not successful 720 * Move the new page to the LRU. If migration was not successful
752 * then this will free the page. 721 * then this will free the page.
753 */ 722 */
754 move_to_lru(newpage); 723 putback_lru_page(newpage);
724
755 if (result) { 725 if (result) {
756 if (rc) 726 if (rc)
757 *result = rc; 727 *result = rc;
@@ -858,20 +828,22 @@ static struct page *new_page_node(struct page *p, unsigned long private,
858 * Move a set of pages as indicated in the pm array. The addr 828 * Move a set of pages as indicated in the pm array. The addr
859 * field must be set to the virtual address of the page to be moved 829 * field must be set to the virtual address of the page to be moved
860 * and the node number must contain a valid target node. 830 * and the node number must contain a valid target node.
831 * The pm array ends with node = MAX_NUMNODES.
861 */ 832 */
862static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, 833static int do_move_page_to_node_array(struct mm_struct *mm,
863 int migrate_all) 834 struct page_to_node *pm,
835 int migrate_all)
864{ 836{
865 int err; 837 int err;
866 struct page_to_node *pp; 838 struct page_to_node *pp;
867 LIST_HEAD(pagelist); 839 LIST_HEAD(pagelist);
868 840
841 migrate_prep();
869 down_read(&mm->mmap_sem); 842 down_read(&mm->mmap_sem);
870 843
871 /* 844 /*
872 * Build a list of pages to migrate 845 * Build a list of pages to migrate
873 */ 846 */
874 migrate_prep();
875 for (pp = pm; pp->node != MAX_NUMNODES; pp++) { 847 for (pp = pm; pp->node != MAX_NUMNODES; pp++) {
876 struct vm_area_struct *vma; 848 struct vm_area_struct *vma;
877 struct page *page; 849 struct page *page;
@@ -914,7 +886,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm,
914 !migrate_all) 886 !migrate_all)
915 goto put_and_set; 887 goto put_and_set;
916 888
917 err = isolate_lru_page(page, &pagelist); 889 err = isolate_lru_page(page);
890 if (!err)
891 list_add_tail(&page->lru, &pagelist);
918put_and_set: 892put_and_set:
919 /* 893 /*
920 * Either remove the duplicate refcount from 894 * Either remove the duplicate refcount from
@@ -926,36 +900,118 @@ set_status:
926 pp->status = err; 900 pp->status = err;
927 } 901 }
928 902
903 err = 0;
929 if (!list_empty(&pagelist)) 904 if (!list_empty(&pagelist))
930 err = migrate_pages(&pagelist, new_page_node, 905 err = migrate_pages(&pagelist, new_page_node,
931 (unsigned long)pm); 906 (unsigned long)pm);
932 else
933 err = -ENOENT;
934 907
935 up_read(&mm->mmap_sem); 908 up_read(&mm->mmap_sem);
936 return err; 909 return err;
937} 910}
938 911
939/* 912/*
940 * Determine the nodes of a list of pages. The addr in the pm array 913 * Migrate an array of page address onto an array of nodes and fill
941 * must have been set to the virtual address of which we want to determine 914 * the corresponding array of status.
942 * the node number.
943 */ 915 */
944static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) 916static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
917 unsigned long nr_pages,
918 const void __user * __user *pages,
919 const int __user *nodes,
920 int __user *status, int flags)
945{ 921{
922 struct page_to_node *pm = NULL;
923 nodemask_t task_nodes;
924 int err = 0;
925 int i;
926
927 task_nodes = cpuset_mems_allowed(task);
928
929 /* Limit nr_pages so that the multiplication may not overflow */
930 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
931 err = -E2BIG;
932 goto out;
933 }
934
935 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
936 if (!pm) {
937 err = -ENOMEM;
938 goto out;
939 }
940
941 /*
942 * Get parameters from user space and initialize the pm
943 * array. Return various errors if the user did something wrong.
944 */
945 for (i = 0; i < nr_pages; i++) {
946 const void __user *p;
947
948 err = -EFAULT;
949 if (get_user(p, pages + i))
950 goto out_pm;
951
952 pm[i].addr = (unsigned long)p;
953 if (nodes) {
954 int node;
955
956 if (get_user(node, nodes + i))
957 goto out_pm;
958
959 err = -ENODEV;
960 if (!node_state(node, N_HIGH_MEMORY))
961 goto out_pm;
962
963 err = -EACCES;
964 if (!node_isset(node, task_nodes))
965 goto out_pm;
966
967 pm[i].node = node;
968 } else
969 pm[i].node = 0; /* anything to not match MAX_NUMNODES */
970 }
971 /* End marker */
972 pm[nr_pages].node = MAX_NUMNODES;
973
974 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
975 if (err >= 0)
976 /* Return status information */
977 for (i = 0; i < nr_pages; i++)
978 if (put_user(pm[i].status, status + i))
979 err = -EFAULT;
980
981out_pm:
982 vfree(pm);
983out:
984 return err;
985}
986
987/*
988 * Determine the nodes of an array of pages and store it in an array of status.
989 */
990static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages,
991 const void __user * __user *pages,
992 int __user *status)
993{
994 unsigned long i;
995 int err;
996
946 down_read(&mm->mmap_sem); 997 down_read(&mm->mmap_sem);
947 998
948 for ( ; pm->node != MAX_NUMNODES; pm++) { 999 for (i = 0; i < nr_pages; i++) {
1000 const void __user *p;
1001 unsigned long addr;
949 struct vm_area_struct *vma; 1002 struct vm_area_struct *vma;
950 struct page *page; 1003 struct page *page;
951 int err;
952 1004
953 err = -EFAULT; 1005 err = -EFAULT;
954 vma = find_vma(mm, pm->addr); 1006 if (get_user(p, pages+i))
1007 goto out;
1008 addr = (unsigned long) p;
1009
1010 vma = find_vma(mm, addr);
955 if (!vma) 1011 if (!vma)
956 goto set_status; 1012 goto set_status;
957 1013
958 page = follow_page(vma, pm->addr, 0); 1014 page = follow_page(vma, addr, 0);
959 1015
960 err = PTR_ERR(page); 1016 err = PTR_ERR(page);
961 if (IS_ERR(page)) 1017 if (IS_ERR(page))
@@ -968,11 +1024,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm)
968 1024
969 err = page_to_nid(page); 1025 err = page_to_nid(page);
970set_status: 1026set_status:
971 pm->status = err; 1027 put_user(err, status+i);
972 } 1028 }
1029 err = 0;
973 1030
1031out:
974 up_read(&mm->mmap_sem); 1032 up_read(&mm->mmap_sem);
975 return 0; 1033 return err;
976} 1034}
977 1035
978/* 1036/*
@@ -984,12 +1042,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
984 const int __user *nodes, 1042 const int __user *nodes,
985 int __user *status, int flags) 1043 int __user *status, int flags)
986{ 1044{
987 int err = 0;
988 int i;
989 struct task_struct *task; 1045 struct task_struct *task;
990 nodemask_t task_nodes;
991 struct mm_struct *mm; 1046 struct mm_struct *mm;
992 struct page_to_node *pm = NULL; 1047 int err;
993 1048
994 /* Check flags */ 1049 /* Check flags */
995 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) 1050 if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL))
@@ -1021,75 +1076,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
1021 (current->uid != task->suid) && (current->uid != task->uid) && 1076 (current->uid != task->suid) && (current->uid != task->uid) &&
1022 !capable(CAP_SYS_NICE)) { 1077 !capable(CAP_SYS_NICE)) {
1023 err = -EPERM; 1078 err = -EPERM;
1024 goto out2; 1079 goto out;
1025 } 1080 }
1026 1081
1027 err = security_task_movememory(task); 1082 err = security_task_movememory(task);
1028 if (err) 1083 if (err)
1029 goto out2; 1084 goto out;
1030
1031
1032 task_nodes = cpuset_mems_allowed(task);
1033
1034 /* Limit nr_pages so that the multiplication may not overflow */
1035 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) {
1036 err = -E2BIG;
1037 goto out2;
1038 }
1039
1040 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
1041 if (!pm) {
1042 err = -ENOMEM;
1043 goto out2;
1044 }
1045
1046 /*
1047 * Get parameters from user space and initialize the pm
1048 * array. Return various errors if the user did something wrong.
1049 */
1050 for (i = 0; i < nr_pages; i++) {
1051 const void __user *p;
1052
1053 err = -EFAULT;
1054 if (get_user(p, pages + i))
1055 goto out;
1056
1057 pm[i].addr = (unsigned long)p;
1058 if (nodes) {
1059 int node;
1060
1061 if (get_user(node, nodes + i))
1062 goto out;
1063
1064 err = -ENODEV;
1065 if (!node_state(node, N_HIGH_MEMORY))
1066 goto out;
1067
1068 err = -EACCES;
1069 if (!node_isset(node, task_nodes))
1070 goto out;
1071 1085
1072 pm[i].node = node; 1086 if (nodes) {
1073 } else 1087 err = do_pages_move(mm, task, nr_pages, pages, nodes, status,
1074 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 1088 flags);
1089 } else {
1090 err = do_pages_stat(mm, nr_pages, pages, status);
1075 } 1091 }
1076 /* End marker */
1077 pm[nr_pages].node = MAX_NUMNODES;
1078
1079 if (nodes)
1080 err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL);
1081 else
1082 err = do_pages_stat(mm, pm);
1083
1084 if (err >= 0)
1085 /* Return status information */
1086 for (i = 0; i < nr_pages; i++)
1087 if (put_user(pm[i].status, status + i))
1088 err = -EFAULT;
1089 1092
1090out: 1093out:
1091 vfree(pm);
1092out2:
1093 mmput(mm); 1094 mmput(mm);
1094 return err; 1095 return err;
1095} 1096}
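sys_move_pages() above is now split into do_pages_move(), used when an explicit node array is supplied, and do_pages_stat(), used when nodes is NULL to merely report which node each page currently resides on. The status-query side can be exercised directly through the move_pages() wrapper from libnuma (compile with -lnuma):

#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <numaif.h>		/* move_pages(); link with -lnuma */

int main(void)
{
	long pagesz = sysconf(_SC_PAGESIZE);
	enum { NPAGES = 4 };
	void *pages[NPAGES];
	int status[NPAGES];
	int i;
	char *p = mmap(NULL, NPAGES * pagesz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) { perror("mmap"); return 1; }
	for (i = 0; i < NPAGES; i++) {
		p[i * pagesz] = 1;		/* fault the page in */
		pages[i] = p + i * pagesz;
	}

	/* nodes == NULL, flags == 0: query only, nothing is migrated */
	if (move_pages(0 /* self */, NPAGES, pages, NULL, status, 0)) {
		perror("move_pages");
		return 1;
	}
	for (i = 0; i < NPAGES; i++)
		printf("page %d: node %d\n", i, status[i]);

	munmap(p, NPAGES * pagesz);
	return 0;
}

Each status entry holds either the node number or a negative errno for that page, matching the put_user(err, status + i) in do_pages_stat().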
diff --git a/mm/mlock.c b/mm/mlock.c
index 01fbe93eff5c..1ada366570cb 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -8,10 +8,18 @@
8#include <linux/capability.h> 8#include <linux/capability.h>
9#include <linux/mman.h> 9#include <linux/mman.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/swap.h>
12#include <linux/swapops.h>
13#include <linux/pagemap.h>
11#include <linux/mempolicy.h> 14#include <linux/mempolicy.h>
12#include <linux/syscalls.h> 15#include <linux/syscalls.h>
13#include <linux/sched.h> 16#include <linux/sched.h>
14#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/rmap.h>
19#include <linux/mmzone.h>
20#include <linux/hugetlb.h>
21
22#include "internal.h"
15 23
16int can_do_mlock(void) 24int can_do_mlock(void)
17{ 25{
@@ -23,17 +31,373 @@ int can_do_mlock(void)
23} 31}
24EXPORT_SYMBOL(can_do_mlock); 32EXPORT_SYMBOL(can_do_mlock);
25 33
34#ifdef CONFIG_UNEVICTABLE_LRU
35/*
36 * Mlocked pages are marked with PageMlocked() flag for efficient testing
37 * in vmscan and, possibly, the fault path; and to support semi-accurate
38 * statistics.
39 *
40 * An mlocked page [PageMlocked(page)] is unevictable. As such, it will
41 * be placed on the LRU "unevictable" list, rather than the [in]active lists.
42 * The unevictable list is an LRU sibling list to the [in]active lists.
43 * PageUnevictable is set to indicate the unevictable state.
44 *
45 * When lazy mlocking via vmscan, it is important to ensure that the
46 * vma's VM_LOCKED status is not concurrently being modified, otherwise we
47 * may have mlocked a page that is being munlocked. So lazy mlock must take
48 * the mmap_sem for read, and verify that the vma really is locked
49 * (see mm/rmap.c).
50 */
51
52/*
53 * LRU accounting for clear_page_mlock()
54 */
55void __clear_page_mlock(struct page *page)
56{
57 VM_BUG_ON(!PageLocked(page));
58
59 if (!page->mapping) { /* truncated ? */
60 return;
61 }
62
63 dec_zone_page_state(page, NR_MLOCK);
64 count_vm_event(UNEVICTABLE_PGCLEARED);
65 if (!isolate_lru_page(page)) {
66 putback_lru_page(page);
67 } else {
68 /*
 69 * We lost the race; the page has already been moved to an evictable list.
70 */
71 if (PageUnevictable(page))
72 count_vm_event(UNEVICTABLE_PGSTRANDED);
73 }
74}
75
76/*
77 * Mark page as mlocked if not already.
78 * If page on LRU, isolate and putback to move to unevictable list.
79 */
80void mlock_vma_page(struct page *page)
81{
82 BUG_ON(!PageLocked(page));
83
84 if (!TestSetPageMlocked(page)) {
85 inc_zone_page_state(page, NR_MLOCK);
86 count_vm_event(UNEVICTABLE_PGMLOCKED);
87 if (!isolate_lru_page(page))
88 putback_lru_page(page);
89 }
90}
91
92/*
 93 * Called from the munlock()/munmap() path with the page supposedly on the LRU.
94 *
95 * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked
96 * [in try_to_munlock()] and then attempt to isolate the page. We must
97 * isolate the page to keep others from messing with its unevictable
98 * and mlocked state while trying to munlock. However, we pre-clear the
99 * mlocked state anyway as we might lose the isolation race and we might
100 * not get another chance to clear PageMlocked. If we successfully
101 * isolate the page and try_to_munlock() detects other VM_LOCKED vmas
102 * mapping the page, it will restore the PageMlocked state, unless the page
103 * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(),
104 * perhaps redundantly.
105 * If we lose the isolation race, and the page is mapped by other VM_LOCKED
106 * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap()
107 * either of which will restore the PageMlocked state by calling
108 * mlock_vma_page() above, if it can grab the vma's mmap sem.
109 */
110static void munlock_vma_page(struct page *page)
111{
112 BUG_ON(!PageLocked(page));
113
114 if (TestClearPageMlocked(page)) {
115 dec_zone_page_state(page, NR_MLOCK);
116 if (!isolate_lru_page(page)) {
117 int ret = try_to_munlock(page);
118 /*
 119 * did try_to_munlock() succeed or punt?
120 */
121 if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN)
122 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
123
124 putback_lru_page(page);
125 } else {
126 /*
 127 * We lost the race. Let try_to_unmap() deal
 128 * with it. At least we get the page state and
 129 * mlock stats right. However, the page is still on
 130 * the unevictable list. We'll fix that up when
 131 * the page is eventually freed or when we scan the
 132 * unevictable list.
133 */
134 if (PageUnevictable(page))
135 count_vm_event(UNEVICTABLE_PGSTRANDED);
136 else
137 count_vm_event(UNEVICTABLE_PGMUNLOCKED);
138 }
139 }
140}
141
142/**
143 * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma.
144 * @vma: target vma
145 * @start: start address
146 * @end: end address
 147 * @mlock: 0 indicates munlock, otherwise mlock.
148 *
149 * If @mlock == 0, unlock an mlocked range;
 150 * else mlock the range of pages. This takes care of making the pages present,
151 * too.
152 *
153 * return 0 on success, negative error code on error.
154 *
155 * vma->vm_mm->mmap_sem must be held for at least read.
156 */
157static long __mlock_vma_pages_range(struct vm_area_struct *vma,
158 unsigned long start, unsigned long end,
159 int mlock)
160{
161 struct mm_struct *mm = vma->vm_mm;
162 unsigned long addr = start;
163 struct page *pages[16]; /* 16 gives a reasonable batch */
164 int nr_pages = (end - start) / PAGE_SIZE;
165 int ret = 0;
166 int gup_flags = 0;
167
168 VM_BUG_ON(start & ~PAGE_MASK);
169 VM_BUG_ON(end & ~PAGE_MASK);
170 VM_BUG_ON(start < vma->vm_start);
171 VM_BUG_ON(end > vma->vm_end);
172 VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) &&
173 (atomic_read(&mm->mm_users) != 0));
174
175 /*
 176 * mlock: don't populate pages if the vma has PROT_NONE permission.
 177 * munlock: always munlock the pages, even if the vma
 178 * has PROT_NONE permission.
179 */
180 if (!mlock)
181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS;
182
183 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE;
185
186 while (nr_pages > 0) {
187 int i;
188
189 cond_resched();
190
191 /*
192 * get_user_pages makes pages present if we are
193 * setting mlock. and this extra reference count will
194 * disable migration of this page. However, page may
195 * still be truncated out from under us.
196 */
197 ret = __get_user_pages(current, mm, addr,
198 min_t(int, nr_pages, ARRAY_SIZE(pages)),
199 gup_flags, pages, NULL);
200 /*
201 * This can happen for, e.g., VM_NONLINEAR regions before
202 * a page has been allocated and mapped at a given offset,
 203 * or for addresses that map beyond the end of a file.
 204 * We'll mlock the pages if/when they get faulted in.
205 */
206 if (ret < 0)
207 break;
208 if (ret == 0) {
209 /*
210 * We know the vma is there, so the only time
211 * we cannot get a single page should be an
212 * error (ret < 0) case.
213 */
214 WARN_ON(1);
215 break;
216 }
217
218 lru_add_drain(); /* push cached pages to LRU */
219
220 for (i = 0; i < ret; i++) {
221 struct page *page = pages[i];
222
223 lock_page(page);
224 /*
225 * Because we lock page here and migration is blocked
226 * by the elevated reference, we need only check for
227 * page truncation (file-cache only).
228 */
229 if (page->mapping) {
230 if (mlock)
231 mlock_vma_page(page);
232 else
233 munlock_vma_page(page);
234 }
235 unlock_page(page);
236 put_page(page); /* ref from get_user_pages() */
237
238 /*
239 * here we assume that get_user_pages() has given us
240 * a list of virtually contiguous pages.
241 */
242 addr += PAGE_SIZE; /* for next get_user_pages() */
243 nr_pages--;
244 }
245 ret = 0;
246 }
247
248 return ret; /* count entire vma as locked_vm */
249}
250
251/*
252 * convert get_user_pages() return value to posix mlock() error
253 */
254static int __mlock_posix_error_return(long retval)
255{
256 if (retval == -EFAULT)
257 retval = -ENOMEM;
258 else if (retval == -ENOMEM)
259 retval = -EAGAIN;
260 return retval;
261}
262
263#else /* CONFIG_UNEVICTABLE_LRU */
264
265/*
266 * Just make pages present if VM_LOCKED. No-op if unlocking.
267 */
268static long __mlock_vma_pages_range(struct vm_area_struct *vma,
269 unsigned long start, unsigned long end,
270 int mlock)
271{
272 if (mlock && (vma->vm_flags & VM_LOCKED))
273 return make_pages_present(start, end);
274 return 0;
275}
276
277static inline int __mlock_posix_error_return(long retval)
278{
279 return 0;
280}
281
282#endif /* CONFIG_UNEVICTABLE_LRU */
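The #ifdef/#else block above pairs the real mlock/munlock helpers with degenerate stubs so callers never need their own CONFIG_UNEVICTABLE_LRU checks. A minimal, generic sketch of that compile-time stub pattern (illustrative names, not kernel code):

/* Feature-stub pattern: callers always call frob_page(); only the
 * implementation changes with the build-time option.
 */
#include <stdio.h>

#define CONFIG_FANCY_FEATURE 1		/* flip to 0 to compile the stub */

#if CONFIG_FANCY_FEATURE
static int frob_page(int page) { return page * 2; }		/* real work */
#else
static int frob_page(int page) { (void)page; return 0; }	/* no-op stub */
#endif

int main(void)
{
	printf("frob_page(21) = %d\n", frob_page(21));
	return 0;
}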
283
284/**
285 * mlock_vma_pages_range() - mlock pages in specified vma range.
 286 * @vma - the vma containing the specified address range
287 * @start - starting address in @vma to mlock
288 * @end - end address [+1] in @vma to mlock
289 *
290 * For mmap()/mremap()/expansion of mlocked vma.
291 *
292 * return 0 on success for "normal" vmas.
293 *
294 * return number of pages [> 0] to be removed from locked_vm on success
295 * of "special" vmas.
296 *
 297 * return negative error if the vma spanning @start-@end disappears while
298 * mmap semaphore is dropped. Unlikely?
299 */
300long mlock_vma_pages_range(struct vm_area_struct *vma,
301 unsigned long start, unsigned long end)
302{
303 struct mm_struct *mm = vma->vm_mm;
304 int nr_pages = (end - start) / PAGE_SIZE;
305 BUG_ON(!(vma->vm_flags & VM_LOCKED));
306
307 /*
308 * filter unlockable vmas
309 */
310 if (vma->vm_flags & (VM_IO | VM_PFNMAP))
311 goto no_mlock;
312
313 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
314 is_vm_hugetlb_page(vma) ||
315 vma == get_gate_vma(current))) {
316 long error;
317 downgrade_write(&mm->mmap_sem);
318
319 error = __mlock_vma_pages_range(vma, start, end, 1);
320
321 up_read(&mm->mmap_sem);
322 /* vma can change or disappear */
323 down_write(&mm->mmap_sem);
324 vma = find_vma(mm, start);
325 /* non-NULL vma must contain @start, but need to check @end */
326 if (!vma || end > vma->vm_end)
327 return -ENOMEM;
328
329 return 0; /* hide other errors from mmap(), et al */
330 }
331
332 /*
333 * User mapped kernel pages or huge pages:
334 * make these pages present to populate the ptes, but
 335 * fall through to reset VM_LOCKED (no need to unlock), and
 336 * return nr_pages so these don't get counted against the task's
 337 * locked limit. Huge pages are already counted against the
 338 * locked vm limit.
339 */
340 make_pages_present(start, end);
341
342no_mlock:
343 vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */
344 return nr_pages; /* error or pages NOT mlocked */
345}
346
347
348/*
 349 * munlock_vma_pages_range() - munlock all pages in the vma range.
350 * @vma - vma containing range to be munlock()ed.
351 * @start - start address in @vma of the range
352 * @end - end of range in @vma.
353 *
354 * For mremap(), munmap() and exit().
355 *
356 * Called with @vma VM_LOCKED.
357 *
358 * Returns with VM_LOCKED cleared. Callers must be prepared to
359 * deal with this.
360 *
361 * We don't save and restore VM_LOCKED here because pages are
 362 * still on the LRU. In the unmap path, pages might be scanned by reclaim
363 * and re-mlocked by try_to_{munlock|unmap} before we unmap and
364 * free them. This will result in freeing mlocked pages.
365 */
366void munlock_vma_pages_range(struct vm_area_struct *vma,
367 unsigned long start, unsigned long end)
368{
369 vma->vm_flags &= ~VM_LOCKED;
370 __mlock_vma_pages_range(vma, start, end, 0);
371}
372
373/*
374 * mlock_fixup - handle mlock[all]/munlock[all] requests.
375 *
376 * Filters out "special" vmas -- VM_LOCKED never gets set for these, and
377 * munlock is a no-op. However, for some special vmas, we go ahead and
378 * populate the ptes via make_pages_present().
379 *
380 * For vmas that pass the filters, merge/split as appropriate.
381 */
26static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, 382static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
27 unsigned long start, unsigned long end, unsigned int newflags) 383 unsigned long start, unsigned long end, unsigned int newflags)
28{ 384{
29 struct mm_struct * mm = vma->vm_mm; 385 struct mm_struct *mm = vma->vm_mm;
30 pgoff_t pgoff; 386 pgoff_t pgoff;
31 int pages; 387 int nr_pages;
32 int ret = 0; 388 int ret = 0;
33 389 int lock = newflags & VM_LOCKED;
34 if (newflags == vma->vm_flags) { 390
35 *prev = vma; 391 if (newflags == vma->vm_flags ||
36 goto out; 392 (vma->vm_flags & (VM_IO | VM_PFNMAP)))
393 goto out; /* don't set VM_LOCKED, don't count */
394
395 if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
396 is_vm_hugetlb_page(vma) ||
397 vma == get_gate_vma(current)) {
398 if (lock)
399 make_pages_present(start, end);
400 goto out; /* don't set VM_LOCKED, don't count */
37 } 401 }
38 402
39 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); 403 pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
@@ -44,8 +408,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
44 goto success; 408 goto success;
45 } 409 }
46 410
47 *prev = vma;
48
49 if (start != vma->vm_start) { 411 if (start != vma->vm_start) {
50 ret = split_vma(mm, vma, start, 1); 412 ret = split_vma(mm, vma, start, 1);
51 if (ret) 413 if (ret)
@@ -60,24 +422,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
60 422
61success: 423success:
62 /* 424 /*
425 * Keep track of amount of locked VM.
426 */
427 nr_pages = (end - start) >> PAGE_SHIFT;
428 if (!lock)
429 nr_pages = -nr_pages;
430 mm->locked_vm += nr_pages;
431
432 /*
63 * vm_flags is protected by the mmap_sem held in write mode. 433 * vm_flags is protected by the mmap_sem held in write mode.
64 * It's okay if try_to_unmap_one unmaps a page just after we 434 * It's okay if try_to_unmap_one unmaps a page just after we
65 * set VM_LOCKED, make_pages_present below will bring it back. 435 * set VM_LOCKED, __mlock_vma_pages_range will bring it back.
66 */ 436 */
67 vma->vm_flags = newflags; 437 vma->vm_flags = newflags;
68 438
69 /* 439 if (lock) {
70 * Keep track of amount of locked VM. 440 /*
71 */ 441 * mmap_sem is currently held for write. Downgrade the write
72 pages = (end - start) >> PAGE_SHIFT; 442 * lock to a read lock so that other faults, mmap scans, ...
73 if (newflags & VM_LOCKED) { 443 * while we fault in all pages.
74 pages = -pages; 444 */
75 if (!(newflags & VM_IO)) 445 downgrade_write(&mm->mmap_sem);
76 ret = make_pages_present(start, end); 446
447 ret = __mlock_vma_pages_range(vma, start, end, 1);
448
449 /*
450 * Need to reacquire mmap sem in write mode, as our callers
451 * expect this. We have no support for atomically upgrading
452 * a sem to write, so we need to check for ranges while sem
453 * is unlocked.
454 */
455 up_read(&mm->mmap_sem);
456 /* vma can change or disappear */
457 down_write(&mm->mmap_sem);
458 *prev = find_vma(mm, start);
459 /* non-NULL *prev must contain @start, but need to check @end */
460 if (!(*prev) || end > (*prev)->vm_end)
461 ret = -ENOMEM;
462 else if (ret > 0) {
463 mm->locked_vm -= ret;
464 ret = 0;
465 } else
466 ret = __mlock_posix_error_return(ret); /* translate if needed */
467 } else {
468 /*
469 * TODO: for unlocking, pages will already be resident, so
470 * we don't need to wait for allocations/reclaim/pagein, ...
471 * However, unlocking a very large region can still take a
472 * while. Should we downgrade the semaphore for both lock
473 * AND unlock ?
474 */
475 __mlock_vma_pages_range(vma, start, end, 0);
77 } 476 }
78 477
79 mm->locked_vm -= pages;
80out: 478out:
479 *prev = vma;
81 return ret; 480 return ret;
82} 481}
83 482
@@ -139,6 +538,8 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
139 if (!can_do_mlock()) 538 if (!can_do_mlock())
140 return -EPERM; 539 return -EPERM;
141 540
541 lru_add_drain_all(); /* flush pagevec */
542
142 down_write(&current->mm->mmap_sem); 543 down_write(&current->mm->mmap_sem);
143 len = PAGE_ALIGN(len + (start & ~PAGE_MASK)); 544 len = PAGE_ALIGN(len + (start & ~PAGE_MASK));
144 start &= PAGE_MASK; 545 start &= PAGE_MASK;
@@ -205,6 +606,8 @@ asmlinkage long sys_mlockall(int flags)
205 if (!can_do_mlock()) 606 if (!can_do_mlock())
206 goto out; 607 goto out;
207 608
609 lru_add_drain_all(); /* flush pagevec */
610
208 down_write(&current->mm->mmap_sem); 611 down_write(&current->mm->mmap_sem);
209 612
210 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; 613 lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
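The sys_mlock()/sys_mlockall() paths above keep the POSIX-facing error contract via __mlock_posix_error_return() (-EFAULT is reported as -ENOMEM, -ENOMEM as -EAGAIN). A minimal userspace sketch, with illustrative buffer size and messages, of what an mlock(2) caller is expected to handle:

/* Lock a buffer into RAM and react to the documented mlock(2) failure
 * modes (ENOMEM: bad range or RLIMIT_MEMLOCK exceeded, EPERM: not
 * permitted, EAGAIN: some pages could not be locked).
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1 << 20;		/* 1 MiB, illustrative */
	void *buf = malloc(len);

	if (!buf)
		return 1;
	memset(buf, 0, len);		/* touch the pages */

	if (mlock(buf, len) != 0) {
		if (errno == ENOMEM)
			fprintf(stderr, "mlock: range/limit problem (RLIMIT_MEMLOCK?)\n");
		else if (errno == EPERM)
			fprintf(stderr, "mlock: not permitted\n");
		else if (errno == EAGAIN)
			fprintf(stderr, "mlock: could not lock all pages\n");
		free(buf);
		return 1;
	}

	/* ... use the locked memory ... */
	munlock(buf, len);
	free(buf);
	return 0;
}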
diff --git a/mm/mmap.c b/mm/mmap.c
index e7a5a68a9c2e..d4855a682ab6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -175,7 +175,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
175 175
176 /* Don't let a single process grow too big: 176 /* Don't let a single process grow too big:
177 leave 3% of the size of this process for other processes */ 177 leave 3% of the size of this process for other processes */
178 allowed -= mm->total_vm / 32; 178 if (mm)
179 allowed -= mm->total_vm / 32;
179 180
180 /* 181 /*
181 * cast `allowed' as a signed long because vm_committed_space 182 * cast `allowed' as a signed long because vm_committed_space
@@ -410,7 +411,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
410 rb_insert_color(&vma->vm_rb, &mm->mm_rb); 411 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
411} 412}
412 413
413static inline void __vma_link_file(struct vm_area_struct *vma) 414static void __vma_link_file(struct vm_area_struct *vma)
414{ 415{
415 struct file * file; 416 struct file * file;
416 417
@@ -662,8 +663,6 @@ again: remove_next = 1 + (end > next->vm_end);
662 * If the vma has a ->close operation then the driver probably needs to release 663 * If the vma has a ->close operation then the driver probably needs to release
663 * per-vma resources, so we don't attempt to merge those. 664 * per-vma resources, so we don't attempt to merge those.
664 */ 665 */
665#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP)
666
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 666static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 667 struct file *file, unsigned long vm_flags)
669{ 668{
@@ -972,6 +971,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
972 return -EPERM; 971 return -EPERM;
973 vm_flags |= VM_LOCKED; 972 vm_flags |= VM_LOCKED;
974 } 973 }
974
975 /* mlock MCL_FUTURE? */ 975 /* mlock MCL_FUTURE? */
976 if (vm_flags & VM_LOCKED) { 976 if (vm_flags & VM_LOCKED) {
977 unsigned long locked, lock_limit; 977 unsigned long locked, lock_limit;
@@ -1139,10 +1139,12 @@ munmap_back:
1139 * The VM_SHARED test is necessary because shmem_zero_setup 1139 * The VM_SHARED test is necessary because shmem_zero_setup
1140 * will create the file object for a shared anonymous map below. 1140 * will create the file object for a shared anonymous map below.
1141 */ 1141 */
1142 if (!file && !(vm_flags & VM_SHARED) && 1142 if (!file && !(vm_flags & VM_SHARED)) {
1143 vma_merge(mm, prev, addr, addr + len, vm_flags, 1143 vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
1144 NULL, NULL, pgoff, NULL)) 1144 NULL, NULL, pgoff, NULL);
1145 goto out; 1145 if (vma)
1146 goto out;
1147 }
1146 1148
1147 /* 1149 /*
1148 * Determine the object being mapped and call the appropriate 1150 * Determine the object being mapped and call the appropriate
@@ -1224,10 +1226,14 @@ out:
1224 mm->total_vm += len >> PAGE_SHIFT; 1226 mm->total_vm += len >> PAGE_SHIFT;
1225 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); 1227 vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);
1226 if (vm_flags & VM_LOCKED) { 1228 if (vm_flags & VM_LOCKED) {
1227 mm->locked_vm += len >> PAGE_SHIFT; 1229 /*
1228 make_pages_present(addr, addr + len); 1230 * makes pages present; downgrades, drops, reacquires mmap_sem
1229 } 1231 */
1230 if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) 1232 long nr_pages = mlock_vma_pages_range(vma, addr, addr + len);
1233 if (nr_pages < 0)
1234 return nr_pages; /* vma gone! */
1235 mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages;
1236 } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK))
1231 make_pages_present(addr, addr + len); 1237 make_pages_present(addr, addr + len);
1232 return addr; 1238 return addr;
1233 1239
@@ -1586,7 +1592,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un
1586 * vma is the last one with address > vma->vm_end. Have to extend vma. 1592 * vma is the last one with address > vma->vm_end. Have to extend vma.
1587 */ 1593 */
1588#ifndef CONFIG_IA64 1594#ifndef CONFIG_IA64
1589static inline 1595static
1590#endif 1596#endif
1591int expand_upwards(struct vm_area_struct *vma, unsigned long address) 1597int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1592{ 1598{
@@ -1636,7 +1642,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
1636/* 1642/*
1637 * vma is the first one with address < vma->vm_start. Have to extend vma. 1643 * vma is the first one with address < vma->vm_start. Have to extend vma.
1638 */ 1644 */
1639static inline int expand_downwards(struct vm_area_struct *vma, 1645static int expand_downwards(struct vm_area_struct *vma,
1640 unsigned long address) 1646 unsigned long address)
1641{ 1647{
1642 int error; 1648 int error;
@@ -1700,8 +1706,10 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
1700 return vma; 1706 return vma;
1701 if (!prev || expand_stack(prev, addr)) 1707 if (!prev || expand_stack(prev, addr))
1702 return NULL; 1708 return NULL;
1703 if (prev->vm_flags & VM_LOCKED) 1709 if (prev->vm_flags & VM_LOCKED) {
1704 make_pages_present(addr, prev->vm_end); 1710 if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0)
1711 return NULL; /* vma gone! */
1712 }
1705 return prev; 1713 return prev;
1706} 1714}
1707#else 1715#else
@@ -1727,8 +1735,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr)
1727 start = vma->vm_start; 1735 start = vma->vm_start;
1728 if (expand_stack(vma, addr)) 1736 if (expand_stack(vma, addr))
1729 return NULL; 1737 return NULL;
1730 if (vma->vm_flags & VM_LOCKED) 1738 if (vma->vm_flags & VM_LOCKED) {
1731 make_pages_present(addr, start); 1739 if (mlock_vma_pages_range(vma, addr, start) < 0)
1740 return NULL; /* vma gone! */
1741 }
1732 return vma; 1742 return vma;
1733} 1743}
1734#endif 1744#endif
@@ -1747,8 +1757,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma)
1747 long nrpages = vma_pages(vma); 1757 long nrpages = vma_pages(vma);
1748 1758
1749 mm->total_vm -= nrpages; 1759 mm->total_vm -= nrpages;
1750 if (vma->vm_flags & VM_LOCKED)
1751 mm->locked_vm -= nrpages;
1752 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); 1760 vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages);
1753 vma = remove_vma(vma); 1761 vma = remove_vma(vma);
1754 } while (vma); 1762 } while (vma);
@@ -1914,6 +1922,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1914 vma = prev? prev->vm_next: mm->mmap; 1922 vma = prev? prev->vm_next: mm->mmap;
1915 1923
1916 /* 1924 /*
1925 * unlock any mlock()ed ranges before detaching vmas
1926 */
1927 if (mm->locked_vm) {
1928 struct vm_area_struct *tmp = vma;
1929 while (tmp && tmp->vm_start < end) {
1930 if (tmp->vm_flags & VM_LOCKED) {
1931 mm->locked_vm -= vma_pages(tmp);
1932 munlock_vma_pages_all(tmp);
1933 }
1934 tmp = tmp->vm_next;
1935 }
1936 }
1937
1938 /*
1917 * Remove the vma's, and unmap the actual pages 1939 * Remove the vma's, and unmap the actual pages
1918 */ 1940 */
1919 detach_vmas_to_be_unmapped(mm, vma, prev, end); 1941 detach_vmas_to_be_unmapped(mm, vma, prev, end);
@@ -2025,8 +2047,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2025 return -ENOMEM; 2047 return -ENOMEM;
2026 2048
2027 /* Can we just expand an old private anonymous mapping? */ 2049 /* Can we just expand an old private anonymous mapping? */
2028 if (vma_merge(mm, prev, addr, addr + len, flags, 2050 vma = vma_merge(mm, prev, addr, addr + len, flags,
2029 NULL, NULL, pgoff, NULL)) 2051 NULL, NULL, pgoff, NULL);
2052 if (vma)
2030 goto out; 2053 goto out;
2031 2054
2032 /* 2055 /*
@@ -2048,8 +2071,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
2048out: 2071out:
2049 mm->total_vm += len >> PAGE_SHIFT; 2072 mm->total_vm += len >> PAGE_SHIFT;
2050 if (flags & VM_LOCKED) { 2073 if (flags & VM_LOCKED) {
2051 mm->locked_vm += len >> PAGE_SHIFT; 2074 if (!mlock_vma_pages_range(vma, addr, addr + len))
2052 make_pages_present(addr, addr + len); 2075 mm->locked_vm += (len >> PAGE_SHIFT);
2053 } 2076 }
2054 return addr; 2077 return addr;
2055} 2078}
@@ -2060,7 +2083,7 @@ EXPORT_SYMBOL(do_brk);
2060void exit_mmap(struct mm_struct *mm) 2083void exit_mmap(struct mm_struct *mm)
2061{ 2084{
2062 struct mmu_gather *tlb; 2085 struct mmu_gather *tlb;
2063 struct vm_area_struct *vma = mm->mmap; 2086 struct vm_area_struct *vma;
2064 unsigned long nr_accounted = 0; 2087 unsigned long nr_accounted = 0;
2065 unsigned long end; 2088 unsigned long end;
2066 2089
@@ -2068,6 +2091,15 @@ void exit_mmap(struct mm_struct *mm)
2068 arch_exit_mmap(mm); 2091 arch_exit_mmap(mm);
2069 mmu_notifier_release(mm); 2092 mmu_notifier_release(mm);
2070 2093
2094 if (mm->locked_vm) {
2095 vma = mm->mmap;
2096 while (vma) {
2097 if (vma->vm_flags & VM_LOCKED)
2098 munlock_vma_pages_all(vma);
2099 vma = vma->vm_next;
2100 }
2101 }
2102 vma = mm->mmap;
2071 lru_add_drain(); 2103 lru_add_drain();
2072 flush_cache_mm(mm); 2104 flush_cache_mm(mm);
2073 tlb = tlb_gather_mmu(mm, 1); 2105 tlb = tlb_gather_mmu(mm, 1);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 486ed595ee6f..16ce8b955dcf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -69,6 +69,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
69 (z->zone && !zref_in_nodemask(z, nodes))) 69 (z->zone && !zref_in_nodemask(z, nodes)))
70 z++; 70 z++;
71 71
72 *zone = zonelist_zone(z++); 72 *zone = zonelist_zone(z);
73 return z; 73 return z;
74} 74}
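The mmzone.c change drops the post-increment so that the zoneref returned by next_zones_zonelist() is the one whose zone was just written through *zone. A plain C sketch (not kernel code) of the cursor mismatch the post-increment produced:

/* Illustration of the post-increment pitfall: if a lookup reports
 * element *cur but returns cur + 1, the caller can no longer use the
 * returned cursor to refer to the element it was just given.
 */
#include <stdio.h>

static const int zones[] = { 10, 20, 30 };

/* buggy variant: reports zones[i] but returns a cursor past it */
static int next_buggy(int i, int *out) { *out = zones[i]; return i + 1; }

/* fixed variant: the cursor still names the reported element */
static int next_fixed(int i, int *out) { *out = zones[i]; return i; }

int main(void)
{
	int val, cur;

	cur = next_buggy(0, &val);
	printf("buggy: got %d, cursor now names %d\n", val, zones[cur]);	/* mismatch */

	cur = next_fixed(0, &val);
	printf("fixed: got %d, cursor still names %d\n", val, zones[cur]);	/* matches */
	return 0;
}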
diff --git a/mm/mremap.c b/mm/mremap.c
index 1a7743923c8c..58a2908f42f5 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,8 @@
24#include <asm/cacheflush.h> 24#include <asm/cacheflush.h>
25#include <asm/tlbflush.h> 25#include <asm/tlbflush.h>
26 26
27#include "internal.h"
28
27static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) 29static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
28{ 30{
29 pgd_t *pgd; 31 pgd_t *pgd;
@@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
238 if (vm_flags & VM_LOCKED) { 240 if (vm_flags & VM_LOCKED) {
239 mm->locked_vm += new_len >> PAGE_SHIFT; 241 mm->locked_vm += new_len >> PAGE_SHIFT;
240 if (new_len > old_len) 242 if (new_len > old_len)
241 make_pages_present(new_addr + old_len, 243 mlock_vma_pages_range(new_vma, new_addr + old_len,
242 new_addr + new_len); 244 new_addr + new_len);
243 } 245 }
244 246
245 return new_addr; 247 return new_addr;
@@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr,
379 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); 381 vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages);
380 if (vma->vm_flags & VM_LOCKED) { 382 if (vma->vm_flags & VM_LOCKED) {
381 mm->locked_vm += pages; 383 mm->locked_vm += pages;
382 make_pages_present(addr + old_len, 384 mlock_vma_pages_range(vma, addr + old_len,
383 addr + new_len); 385 addr + new_len);
384 } 386 }
385 ret = addr; 387 ret = addr;
diff --git a/mm/nommu.c b/mm/nommu.c
index ed75bc962fbe..7695dc850785 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -34,6 +34,8 @@
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36 36
37#include "internal.h"
38
37void *high_memory; 39void *high_memory;
38struct page *mem_map; 40struct page *mem_map;
39unsigned long max_mapnr; 41unsigned long max_mapnr;
@@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp)
128 return PAGE_SIZE << compound_order(page); 130 return PAGE_SIZE << compound_order(page);
129} 131}
130 132
131/* 133int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
132 * get a list of pages in an address range belonging to the specified process 134 unsigned long start, int len, int flags,
133 * and indicate the VMA that covers each page 135 struct page **pages, struct vm_area_struct **vmas)
134 * - this is potentially dodgy as we may end incrementing the page count of a
135 * slab page or a secondary page from a compound page
136 * - don't permit access to VMAs that don't support it, such as I/O mappings
137 */
138int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
139 unsigned long start, int len, int write, int force,
140 struct page **pages, struct vm_area_struct **vmas)
141{ 136{
142 struct vm_area_struct *vma; 137 struct vm_area_struct *vma;
143 unsigned long vm_flags; 138 unsigned long vm_flags;
144 int i; 139 int i;
140 int write = !!(flags & GUP_FLAGS_WRITE);
141 int force = !!(flags & GUP_FLAGS_FORCE);
142 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
145 143
146 /* calculate required read or write permissions. 144 /* calculate required read or write permissions.
147 * - if 'force' is set, we only require the "MAY" flags. 145 * - if 'force' is set, we only require the "MAY" flags.
@@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
156 154
157 /* protect what we can, including chardevs */ 155 /* protect what we can, including chardevs */
158 if (vma->vm_flags & (VM_IO | VM_PFNMAP) || 156 if (vma->vm_flags & (VM_IO | VM_PFNMAP) ||
159 !(vm_flags & vma->vm_flags)) 157 (!ignore && !(vm_flags & vma->vm_flags)))
160 goto finish_or_fault; 158 goto finish_or_fault;
161 159
162 if (pages) { 160 if (pages) {
@@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
174finish_or_fault: 172finish_or_fault:
175 return i ? : -EFAULT; 173 return i ? : -EFAULT;
176} 174}
175
176
177/*
178 * get a list of pages in an address range belonging to the specified process
179 * and indicate the VMA that covers each page
 180 * - this is potentially dodgy as we may end up incrementing the page count of a
181 * slab page or a secondary page from a compound page
182 * - don't permit access to VMAs that don't support it, such as I/O mappings
183 */
184int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
185 unsigned long start, int len, int write, int force,
186 struct page **pages, struct vm_area_struct **vmas)
187{
188 int flags = 0;
189
190 if (write)
191 flags |= GUP_FLAGS_WRITE;
192 if (force)
193 flags |= GUP_FLAGS_FORCE;
194
195 return __get_user_pages(tsk, mm,
196 start, len, flags,
197 pages, vmas);
198}
177EXPORT_SYMBOL(get_user_pages); 199EXPORT_SYMBOL(get_user_pages);
178 200
179DEFINE_RWLOCK(vmlist_lock); 201DEFINE_RWLOCK(vmlist_lock);
@@ -1432,7 +1454,8 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
1432 1454
1433 /* Don't let a single process grow too big: 1455 /* Don't let a single process grow too big:
1434 leave 3% of the size of this process for other processes */ 1456 leave 3% of the size of this process for other processes */
1435 allowed -= current->mm->total_vm / 32; 1457 if (mm)
1458 allowed -= mm->total_vm / 32;
1436 1459
1437 /* 1460 /*
1438 * cast `allowed' as a signed long because vm_committed_space 1461 * cast `allowed' as a signed long because vm_committed_space
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 64e5b4bcd964..a0a01902f551 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -38,7 +38,6 @@ static DEFINE_SPINLOCK(zone_scan_mutex);
38 * badness - calculate a numeric value for how bad this task has been 38 * badness - calculate a numeric value for how bad this task has been
39 * @p: task struct of which task we should calculate 39 * @p: task struct of which task we should calculate
40 * @uptime: current uptime in seconds 40 * @uptime: current uptime in seconds
41 * @mem: target memory controller
42 * 41 *
43 * The formula used is relatively simple and documented inline in the 42 * The formula used is relatively simple and documented inline in the
44 * function. The main rationale is that we want to select a good task 43 * function. The main rationale is that we want to select a good task
@@ -295,6 +294,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
295 continue; 294 continue;
296 if (mem && !task_in_mem_cgroup(p, mem)) 295 if (mem && !task_in_mem_cgroup(p, mem))
297 continue; 296 continue;
297 if (!thread_group_leader(p))
298 continue;
298 299
299 task_lock(p); 300 task_lock(p);
300 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", 301 printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n",
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 24de8b65fdbd..2970e35fd03f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -7,7 +7,7 @@
7 * Contains functions related to writing back dirty pages at the 7 * Contains functions related to writing back dirty pages at the
8 * address_space level. 8 * address_space level.
9 * 9 *
10 * 10Apr2002 akpm@zip.com.au 10 * 10Apr2002 Andrew Morton
11 * Initial version 11 * Initial version
12 */ 12 */
13 13
@@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total)
329 struct zone *z = 329 struct zone *z =
330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; 330 &NODE_DATA(node)->node_zones[ZONE_HIGHMEM];
331 331
332 x += zone_page_state(z, NR_FREE_PAGES) 332 x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z);
333 + zone_page_state(z, NR_INACTIVE)
334 + zone_page_state(z, NR_ACTIVE);
335 } 333 }
336 /* 334 /*
337 * Make sure that the number of highmem pages is never larger 335 * Make sure that the number of highmem pages is never larger
@@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void)
355{ 353{
356 unsigned long x; 354 unsigned long x;
357 355
358 x = global_page_state(NR_FREE_PAGES) 356 x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
359 + global_page_state(NR_INACTIVE)
360 + global_page_state(NR_ACTIVE);
361 357
362 if (!vm_highmem_is_dirtyable) 358 if (!vm_highmem_is_dirtyable)
363 x -= highmem_dirtyable_memory(x); 359 x -= highmem_dirtyable_memory(x);
@@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping,
876 pgoff_t end; /* Inclusive */ 872 pgoff_t end; /* Inclusive */
877 int scanned = 0; 873 int scanned = 0;
878 int range_whole = 0; 874 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write;
879 876
880 if (wbc->nonblocking && bdi_write_congested(bdi)) { 877 if (wbc->nonblocking && bdi_write_congested(bdi)) {
881 wbc->encountered_congestion = 1; 878 wbc->encountered_congestion = 1;
@@ -939,7 +936,7 @@ retry:
939 unlock_page(page); 936 unlock_page(page);
940 ret = 0; 937 ret = 0;
941 } 938 }
942 if (ret || (--(wbc->nr_to_write) <= 0)) 939 if (ret || (--nr_to_write <= 0))
943 done = 1; 940 done = 1;
944 if (wbc->nonblocking && bdi_write_congested(bdi)) { 941 if (wbc->nonblocking && bdi_write_congested(bdi)) {
945 wbc->encountered_congestion = 1; 942 wbc->encountered_congestion = 1;
@@ -958,11 +955,12 @@ retry:
958 index = 0; 955 index = 0;
959 goto retry; 956 goto retry;
960 } 957 }
961 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) 958 if (!wbc->no_nrwrite_index_update) {
962 mapping->writeback_index = index; 959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index;
961 wbc->nr_to_write = nr_to_write;
962 }
963 963
964 if (wbc->range_cont)
965 wbc->range_start = index << PAGE_CACHE_SHIFT;
966 return ret; 964 return ret;
967} 965}
968EXPORT_SYMBOL(write_cache_pages); 966EXPORT_SYMBOL(write_cache_pages);
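write_cache_pages() now decrements a local copy of nr_to_write and only publishes the remaining budget (and writeback_index) when wbc->no_nrwrite_index_update is clear. A hedged sketch of that pattern in isolation; every name here is illustrative, not the kernel API:

/* "Work against a local budget, publish conditionally": the callee
 * never scribbles on the caller's counter unless asked to.
 */
#include <stdbool.h>
#include <stdio.h>

struct writeback_ctl {
	long nr_to_write;		/* caller's budget */
	bool no_count_update;		/* caller wants its budget left alone */
};

static int write_some_pages(struct writeback_ctl *wbc, int available)
{
	long nr_to_write = wbc->nr_to_write;	/* local working copy */
	int written = 0;

	while (available-- > 0 && nr_to_write > 0) {
		written++;			/* pretend one page went out */
		nr_to_write--;
	}

	if (!wbc->no_count_update)
		wbc->nr_to_write = nr_to_write;	/* publish the remaining budget */
	return written;
}

int main(void)
{
	struct writeback_ctl wbc = { .nr_to_write = 8, .no_count_update = false };
	printf("wrote %d, budget left %ld\n", write_some_pages(&wbc, 5), wbc.nr_to_write);
	return 0;
}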
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e293c58bea58..d8ac01474563 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -44,7 +44,7 @@
44#include <linux/backing-dev.h> 44#include <linux/backing-dev.h>
45#include <linux/fault-inject.h> 45#include <linux/fault-inject.h>
46#include <linux/page-isolation.h> 46#include <linux/page-isolation.h>
47#include <linux/memcontrol.h> 47#include <linux/page_cgroup.h>
48#include <linux/debugobjects.h> 48#include <linux/debugobjects.h>
49 49
50#include <asm/tlbflush.h> 50#include <asm/tlbflush.h>
@@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 void *pc = page_get_page_cgroup(page);
227
228 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG
229 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
230 current->comm, page, (int)(2*sizeof(unsigned long)), 228 current->comm, page, (int)(2*sizeof(unsigned long)),
231 (unsigned long)page->flags, page->mapping, 229 (unsigned long)page->flags, page->mapping,
232 page_mapcount(page), page_count(page)); 230 page_mapcount(page), page_count(page));
233 if (pc) { 231
234 printk(KERN_EMERG "cgroup:%p\n", pc);
235 page_reset_bad_cgroup(page);
236 }
237 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" 232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
238 KERN_EMERG "Backtrace:\n"); 233 KERN_EMERG "Backtrace:\n");
239 dump_stack(); 234 dump_stack();
@@ -280,6 +275,23 @@ void prep_compound_page(struct page *page, unsigned long order)
280 } 275 }
281} 276}
282 277
278#ifdef CONFIG_HUGETLBFS
279void prep_compound_gigantic_page(struct page *page, unsigned long order)
280{
281 int i;
282 int nr_pages = 1 << order;
283 struct page *p = page + 1;
284
285 set_compound_page_dtor(page, free_compound_page);
286 set_compound_order(page, order);
287 __SetPageHead(page);
288 for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
289 __SetPageTail(p);
290 p->first_page = page;
291 }
292}
293#endif
294
283static void destroy_compound_page(struct page *page, unsigned long order) 295static void destroy_compound_page(struct page *page, unsigned long order)
284{ 296{
285 int i; 297 int i;
@@ -451,14 +463,16 @@ static inline void __free_one_page(struct page *page,
451 463
452static inline int free_pages_check(struct page *page) 464static inline int free_pages_check(struct page *page)
453{ 465{
466 free_page_mlock(page);
454 if (unlikely(page_mapcount(page) | 467 if (unlikely(page_mapcount(page) |
455 (page->mapping != NULL) | 468 (page->mapping != NULL) |
456 (page_get_page_cgroup(page) != NULL) |
457 (page_count(page) != 0) | 469 (page_count(page) != 0) |
458 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE)))
459 bad_page(page); 471 bad_page(page);
460 if (PageDirty(page)) 472 if (PageDirty(page))
461 __ClearPageDirty(page); 473 __ClearPageDirty(page);
474 if (PageSwapBacked(page))
475 __ClearPageSwapBacked(page);
462 /* 476 /*
463 * For now, we report if PG_reserved was found set, but do not 477 * For now, we report if PG_reserved was found set, but do not
464 * clear it, and do not free the page. But we shall soon need 478 * clear it, and do not free the page. But we shall soon need
@@ -597,7 +611,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
597{ 611{
598 if (unlikely(page_mapcount(page) | 612 if (unlikely(page_mapcount(page) |
599 (page->mapping != NULL) | 613 (page->mapping != NULL) |
600 (page_get_page_cgroup(page) != NULL) |
601 (page_count(page) != 0) | 614 (page_count(page) != 0) |
602 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP)))
603 bad_page(page); 616 bad_page(page);
@@ -611,7 +624,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
611 624
612 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
613 1 << PG_referenced | 1 << PG_arch_1 | 626 1 << PG_referenced | 1 << PG_arch_1 |
614 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); 627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
615 set_page_private(page, 0); 632 set_page_private(page, 0);
616 set_page_refcounted(page); 633 set_page_refcounted(page);
617 634
@@ -1544,6 +1561,10 @@ nofail_alloc:
1544 1561
1545 /* We now go into synchronous reclaim */ 1562 /* We now go into synchronous reclaim */
1546 cpuset_memory_pressure_bump(); 1563 cpuset_memory_pressure_bump();
1564 /*
1565 * The task's cpuset might have expanded its set of allowable nodes
1566 */
1567 cpuset_update_task_memory_state();
1547 p->flags |= PF_MEMALLOC; 1568 p->flags |= PF_MEMALLOC;
1548 reclaim_state.reclaimed_slab = 0; 1569 reclaim_state.reclaimed_slab = 0;
1549 p->reclaim_state = &reclaim_state; 1570 p->reclaim_state = &reclaim_state;
@@ -1859,10 +1880,21 @@ void show_free_areas(void)
1859 } 1880 }
1860 } 1881 }
1861 1882
1862 printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" 1883 printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
1884 " inactive_file:%lu"
1885//TODO: check/adjust line lengths
1886#ifdef CONFIG_UNEVICTABLE_LRU
1887 " unevictable:%lu"
1888#endif
1889 " dirty:%lu writeback:%lu unstable:%lu\n"
1863 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", 1890 " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
1864 global_page_state(NR_ACTIVE), 1891 global_page_state(NR_ACTIVE_ANON),
1865 global_page_state(NR_INACTIVE), 1892 global_page_state(NR_ACTIVE_FILE),
1893 global_page_state(NR_INACTIVE_ANON),
1894 global_page_state(NR_INACTIVE_FILE),
1895#ifdef CONFIG_UNEVICTABLE_LRU
1896 global_page_state(NR_UNEVICTABLE),
1897#endif
1866 global_page_state(NR_FILE_DIRTY), 1898 global_page_state(NR_FILE_DIRTY),
1867 global_page_state(NR_WRITEBACK), 1899 global_page_state(NR_WRITEBACK),
1868 global_page_state(NR_UNSTABLE_NFS), 1900 global_page_state(NR_UNSTABLE_NFS),
@@ -1885,8 +1917,13 @@ void show_free_areas(void)
1885 " min:%lukB" 1917 " min:%lukB"
1886 " low:%lukB" 1918 " low:%lukB"
1887 " high:%lukB" 1919 " high:%lukB"
1888 " active:%lukB" 1920 " active_anon:%lukB"
1889 " inactive:%lukB" 1921 " inactive_anon:%lukB"
1922 " active_file:%lukB"
1923 " inactive_file:%lukB"
1924#ifdef CONFIG_UNEVICTABLE_LRU
1925 " unevictable:%lukB"
1926#endif
1890 " present:%lukB" 1927 " present:%lukB"
1891 " pages_scanned:%lu" 1928 " pages_scanned:%lu"
1892 " all_unreclaimable? %s" 1929 " all_unreclaimable? %s"
@@ -1896,8 +1933,13 @@ void show_free_areas(void)
1896 K(zone->pages_min), 1933 K(zone->pages_min),
1897 K(zone->pages_low), 1934 K(zone->pages_low),
1898 K(zone->pages_high), 1935 K(zone->pages_high),
1899 K(zone_page_state(zone, NR_ACTIVE)), 1936 K(zone_page_state(zone, NR_ACTIVE_ANON)),
1900 K(zone_page_state(zone, NR_INACTIVE)), 1937 K(zone_page_state(zone, NR_INACTIVE_ANON)),
1938 K(zone_page_state(zone, NR_ACTIVE_FILE)),
1939 K(zone_page_state(zone, NR_INACTIVE_FILE)),
1940#ifdef CONFIG_UNEVICTABLE_LRU
1941 K(zone_page_state(zone, NR_UNEVICTABLE)),
1942#endif
1901 K(zone->present_pages), 1943 K(zone->present_pages),
1902 zone->pages_scanned, 1944 zone->pages_scanned,
1903 (zone_is_all_unreclaimable(zone) ? "yes" : "no") 1945 (zone_is_all_unreclaimable(zone) ? "yes" : "no")
@@ -3407,10 +3449,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3407 pgdat->nr_zones = 0; 3449 pgdat->nr_zones = 0;
3408 init_waitqueue_head(&pgdat->kswapd_wait); 3450 init_waitqueue_head(&pgdat->kswapd_wait);
3409 pgdat->kswapd_max_order = 0; 3451 pgdat->kswapd_max_order = 0;
3452 pgdat_page_cgroup_init(pgdat);
3410 3453
3411 for (j = 0; j < MAX_NR_ZONES; j++) { 3454 for (j = 0; j < MAX_NR_ZONES; j++) {
3412 struct zone *zone = pgdat->node_zones + j; 3455 struct zone *zone = pgdat->node_zones + j;
3413 unsigned long size, realsize, memmap_pages; 3456 unsigned long size, realsize, memmap_pages;
3457 enum lru_list l;
3414 3458
3415 size = zone_spanned_pages_in_node(nid, j, zones_size); 3459 size = zone_spanned_pages_in_node(nid, j, zones_size);
3416 realsize = size - zone_absent_pages_in_node(nid, j, 3460 realsize = size - zone_absent_pages_in_node(nid, j,
@@ -3425,8 +3469,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3425 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3426 if (realsize >= memmap_pages) { 3470 if (realsize >= memmap_pages) {
3427 realsize -= memmap_pages; 3471 realsize -= memmap_pages;
3428 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3472 printk(KERN_DEBUG
3429 "%s zone: %lu pages used for memmap\n", 3473 " %s zone: %lu pages used for memmap\n",
3430 zone_names[j], memmap_pages); 3474 zone_names[j], memmap_pages);
3431 } else 3475 } else
3432 printk(KERN_WARNING 3476 printk(KERN_WARNING
@@ -3436,8 +3480,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3436 /* Account for reserved pages */ 3480 /* Account for reserved pages */
3437 if (j == 0 && realsize > dma_reserve) { 3481 if (j == 0 && realsize > dma_reserve) {
3438 realsize -= dma_reserve; 3482 realsize -= dma_reserve;
3439 mminit_dprintk(MMINIT_TRACE, "memmap_init", 3483 printk(KERN_DEBUG " %s zone: %lu pages reserved\n",
3440 "%s zone: %lu pages reserved\n",
3441 zone_names[0], dma_reserve); 3484 zone_names[0], dma_reserve);
3442 } 3485 }
3443 3486
@@ -3462,10 +3505,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3462 zone->prev_priority = DEF_PRIORITY; 3505 zone->prev_priority = DEF_PRIORITY;
3463 3506
3464 zone_pcp_init(zone); 3507 zone_pcp_init(zone);
3465 INIT_LIST_HEAD(&zone->active_list); 3508 for_each_lru(l) {
3466 INIT_LIST_HEAD(&zone->inactive_list); 3509 INIT_LIST_HEAD(&zone->lru[l].list);
3467 zone->nr_scan_active = 0; 3510 zone->lru[l].nr_scan = 0;
3468 zone->nr_scan_inactive = 0; 3511 }
3512 zone->recent_rotated[0] = 0;
3513 zone->recent_rotated[1] = 0;
3514 zone->recent_scanned[0] = 0;
3515 zone->recent_scanned[1] = 0;
3469 zap_zone_vm_stats(zone); 3516 zap_zone_vm_stats(zone);
3470 zone->flags = 0; 3517 zone->flags = 0;
3471 if (!size) 3518 if (!size)
@@ -3949,7 +3996,7 @@ static void check_for_regular_memory(pg_data_t *pgdat)
3949void __init free_area_init_nodes(unsigned long *max_zone_pfn) 3996void __init free_area_init_nodes(unsigned long *max_zone_pfn)
3950{ 3997{
3951 unsigned long nid; 3998 unsigned long nid;
3952 enum zone_type i; 3999 int i;
3953 4000
3954 /* Sort early_node_map as initialisation assumes it is sorted */ 4001 /* Sort early_node_map as initialisation assumes it is sorted */
3955 sort_node_map(); 4002 sort_node_map();
@@ -4207,7 +4254,7 @@ void setup_per_zone_pages_min(void)
4207 for_each_zone(zone) { 4254 for_each_zone(zone) {
4208 u64 tmp; 4255 u64 tmp;
4209 4256
4210 spin_lock_irqsave(&zone->lru_lock, flags); 4257 spin_lock_irqsave(&zone->lock, flags);
4211 tmp = (u64)pages_min * zone->present_pages; 4258 tmp = (u64)pages_min * zone->present_pages;
4212 do_div(tmp, lowmem_pages); 4259 do_div(tmp, lowmem_pages);
4213 if (is_highmem(zone)) { 4260 if (is_highmem(zone)) {
@@ -4239,13 +4286,53 @@ void setup_per_zone_pages_min(void)
4239 zone->pages_low = zone->pages_min + (tmp >> 2); 4286 zone->pages_low = zone->pages_min + (tmp >> 2);
4240 zone->pages_high = zone->pages_min + (tmp >> 1); 4287 zone->pages_high = zone->pages_min + (tmp >> 1);
4241 setup_zone_migrate_reserve(zone); 4288 setup_zone_migrate_reserve(zone);
4242 spin_unlock_irqrestore(&zone->lru_lock, flags); 4289 spin_unlock_irqrestore(&zone->lock, flags);
4243 } 4290 }
4244 4291
4245 /* update totalreserve_pages */ 4292 /* update totalreserve_pages */
4246 calculate_totalreserve_pages(); 4293 calculate_totalreserve_pages();
4247} 4294}
4248 4295
4296/**
4297 * setup_per_zone_inactive_ratio - called when min_free_kbytes changes.
4298 *
4299 * The inactive anon list should be small enough that the VM never has to
4300 * do too much work, but large enough that each inactive page has a chance
4301 * to be referenced again before it is swapped out.
4302 *
4303 * The inactive_anon ratio is the target ratio of ACTIVE_ANON to
4304 * INACTIVE_ANON pages on this zone's LRU, maintained by the
4305 * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of
4306 * the anonymous pages are kept on the inactive list.
4307 *
 4308 *    total       target     max
 4309 *    memory      ratio      inactive anon
 4310 *  -------------------------------------
 4311 *    10MB           1         5MB
 4312 *    100MB          1        50MB
 4313 *    1GB            3       250MB
 4314 *    10GB          10       0.9GB
 4315 *    100GB         31         3GB
 4316 *    1TB          101        10GB
 4317 *    10TB         320        32GB
4318 */
4319void setup_per_zone_inactive_ratio(void)
4320{
4321 struct zone *zone;
4322
4323 for_each_zone(zone) {
4324 unsigned int gb, ratio;
4325
4326 /* Zone size in gigabytes */
4327 gb = zone->present_pages >> (30 - PAGE_SHIFT);
4328 ratio = int_sqrt(10 * gb);
4329 if (!ratio)
4330 ratio = 1;
4331
4332 zone->inactive_ratio = ratio;
4333 }
4334}
4335
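The table in the comment above follows directly from ratio = int_sqrt(10 * gb), where gb is the zone size in gigabytes and the result is clamped to a minimum of 1. A standalone sketch that reproduces those numbers; int_sqrt_() is a userspace stand-in for the kernel's int_sqrt():

/* Reproduces the inactive_anon target table documented above. */
#include <stdio.h>

static unsigned long int_sqrt_(unsigned long x)
{
	unsigned long r = 0;

	while ((r + 1) * (r + 1) <= x)
		r++;
	return r;
}

int main(void)
{
	/* zone sizes in MB, matching the rows of the table */
	unsigned long sizes_mb[] = { 10, 100, 1024, 10240, 102400, 1048576, 10485760 };

	for (int i = 0; i < 7; i++) {
		unsigned long gb = sizes_mb[i] >> 10;		/* zone size in GB */
		unsigned long ratio = int_sqrt_(10 * gb);

		if (!ratio)
			ratio = 1;
		printf("%10luMB -> inactive_ratio %lu\n", sizes_mb[i], ratio);
	}
	return 0;
}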
4249/* 4336/*
4250 * Initialise min_free_kbytes. 4337 * Initialise min_free_kbytes.
4251 * 4338 *
@@ -4283,6 +4370,7 @@ static int __init init_per_zone_pages_min(void)
4283 min_free_kbytes = 65536; 4370 min_free_kbytes = 65536;
4284 setup_per_zone_pages_min(); 4371 setup_per_zone_pages_min();
4285 setup_per_zone_lowmem_reserve(); 4372 setup_per_zone_lowmem_reserve();
4373 setup_per_zone_inactive_ratio();
4286 return 0; 4374 return 0;
4287} 4375}
4288module_init(init_per_zone_pages_min) 4376module_init(init_per_zone_pages_min)
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
new file mode 100644
index 000000000000..1223d927904d
--- /dev/null
+++ b/mm/page_cgroup.c
@@ -0,0 +1,256 @@
1#include <linux/mm.h>
2#include <linux/mmzone.h>
3#include <linux/bootmem.h>
4#include <linux/bit_spinlock.h>
5#include <linux/page_cgroup.h>
6#include <linux/hash.h>
7#include <linux/slab.h>
8#include <linux/memory.h>
9#include <linux/vmalloc.h>
10#include <linux/cgroup.h>
11
12static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
14{
15 pc->flags = 0;
16 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn);
18}
19static unsigned long total_usage;
20
21#if !defined(CONFIG_SPARSEMEM)
22
23
24void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
25{
26 pgdat->node_page_cgroup = NULL;
27}
28
29struct page_cgroup *lookup_page_cgroup(struct page *page)
30{
31 unsigned long pfn = page_to_pfn(page);
32 unsigned long offset;
33 struct page_cgroup *base;
34
35 base = NODE_DATA(page_to_nid(page))->node_page_cgroup;
36 if (unlikely(!base))
37 return NULL;
38
39 offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn;
40 return base + offset;
41}
42
43static int __init alloc_node_page_cgroup(int nid)
44{
45 struct page_cgroup *base, *pc;
46 unsigned long table_size;
47 unsigned long start_pfn, nr_pages, index;
48
49 start_pfn = NODE_DATA(nid)->node_start_pfn;
50 nr_pages = NODE_DATA(nid)->node_spanned_pages;
51
52 table_size = sizeof(struct page_cgroup) * nr_pages;
53
54 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid),
55 table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
56 if (!base)
57 return -ENOMEM;
58 for (index = 0; index < nr_pages; index++) {
59 pc = base + index;
60 __init_page_cgroup(pc, start_pfn + index);
61 }
62 NODE_DATA(nid)->node_page_cgroup = base;
63 total_usage += table_size;
64 return 0;
65}
66
67void __init page_cgroup_init(void)
68{
69
70 int nid, fail;
71
72 if (mem_cgroup_subsys.disabled)
73 return;
74
75 for_each_online_node(nid) {
76 fail = alloc_node_page_cgroup(nid);
77 if (fail)
78 goto fail;
79 }
80 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 81 printk(KERN_INFO "please try cgroup_disable=memory option if you"
 82 " don't want memory cgroups\n");
83 return;
84fail:
 85 printk(KERN_CRIT "allocation of page_cgroup failed.\n");
86 printk(KERN_CRIT "please try cgroup_disable=memory boot option\n");
87 panic("Out of memory");
88}
89
 90#else /* CONFIG_SPARSEMEM */
91
92struct page_cgroup *lookup_page_cgroup(struct page *page)
93{
94 unsigned long pfn = page_to_pfn(page);
95 struct mem_section *section = __pfn_to_section(pfn);
96
97 return section->page_cgroup + pfn;
98}
99
100int __meminit init_section_page_cgroup(unsigned long pfn)
101{
102 struct mem_section *section;
103 struct page_cgroup *base, *pc;
104 unsigned long table_size;
105 int nid, index;
106
107 section = __pfn_to_section(pfn);
108
109 if (section->page_cgroup)
110 return 0;
111
112 nid = page_to_nid(pfn_to_page(pfn));
113
114 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
115 if (slab_is_available()) {
116 base = kmalloc_node(table_size, GFP_KERNEL, nid);
117 if (!base)
118 base = vmalloc_node(table_size, nid);
119 } else {
120 base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), table_size,
121 PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
122 }
123
124 if (!base) {
125 printk(KERN_ERR "page cgroup allocation failure\n");
126 return -ENOMEM;
127 }
128
129 for (index = 0; index < PAGES_PER_SECTION; index++) {
130 pc = base + index;
131 __init_page_cgroup(pc, pfn + index);
132 }
133
134 section = __pfn_to_section(pfn);
135 section->page_cgroup = base - pfn;
136 total_usage += table_size;
137 return 0;
138}
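init_section_page_cgroup() stores base - pfn rather than base, which is what lets lookup_page_cgroup() index with the absolute pfn instead of subtracting the section's starting pfn on every lookup. A userspace sketch of the same biased-base trick; it leans on the same technically out-of-range pointer arithmetic the kernel relies on, so treat it as an illustration only:

/* Bias the stored base pointer by the first index it covers, so lookups
 * can use the absolute index directly (mirrors section->page_cgroup = base - pfn
 * followed by section->page_cgroup + pfn in lookup_page_cgroup()).
 */
#include <stdio.h>

struct entry { unsigned long pfn; };

int main(void)
{
	enum { FIRST_PFN = 4096, NPAGES = 8 };
	static struct entry table[NPAGES];		/* covers pfns 4096..4103 */
	struct entry *biased = table - FIRST_PFN;	/* stored "base - pfn" */

	for (unsigned long pfn = FIRST_PFN; pfn < FIRST_PFN + NPAGES; pfn++)
		biased[pfn].pfn = pfn;			/* lookup is just biased + pfn */

	printf("entry for pfn %d records pfn %lu\n",
	       FIRST_PFN + 3, biased[FIRST_PFN + 3].pfn);
	return 0;
}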
139#ifdef CONFIG_MEMORY_HOTPLUG
140void __free_page_cgroup(unsigned long pfn)
141{
142 struct mem_section *ms;
143 struct page_cgroup *base;
144
145 ms = __pfn_to_section(pfn);
146 if (!ms || !ms->page_cgroup)
147 return;
148 base = ms->page_cgroup + pfn;
149 if (is_vmalloc_addr(base)) {
150 vfree(base);
151 ms->page_cgroup = NULL;
152 } else {
153 struct page *page = virt_to_page(base);
154 if (!PageReserved(page)) { /* Is bootmem ? */
155 kfree(base);
156 ms->page_cgroup = NULL;
157 }
158 }
159}
160
161int online_page_cgroup(unsigned long start_pfn,
162 unsigned long nr_pages,
163 int nid)
164{
165 unsigned long start, end, pfn;
166 int fail = 0;
167
168 start = start_pfn & ~(PAGES_PER_SECTION - 1);
169 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
170
171 for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) {
172 if (!pfn_present(pfn))
173 continue;
174 fail = init_section_page_cgroup(pfn);
175 }
176 if (!fail)
177 return 0;
178
179 /* rollback */
180 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
181 __free_page_cgroup(pfn);
182
183 return -ENOMEM;
184}
185
186int offline_page_cgroup(unsigned long start_pfn,
187 unsigned long nr_pages, int nid)
188{
189 unsigned long start, end, pfn;
190
191 start = start_pfn & ~(PAGES_PER_SECTION - 1);
192 end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION);
193
194 for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION)
195 __free_page_cgroup(pfn);
196 return 0;
197
198}
199
200static int page_cgroup_callback(struct notifier_block *self,
201 unsigned long action, void *arg)
202{
203 struct memory_notify *mn = arg;
204 int ret = 0;
205 switch (action) {
206 case MEM_GOING_ONLINE:
207 ret = online_page_cgroup(mn->start_pfn,
208 mn->nr_pages, mn->status_change_nid);
209 break;
210 case MEM_CANCEL_ONLINE:
211 case MEM_OFFLINE:
212 offline_page_cgroup(mn->start_pfn,
213 mn->nr_pages, mn->status_change_nid);
214 break;
215 case MEM_GOING_OFFLINE:
216 break;
217 case MEM_ONLINE:
218 case MEM_CANCEL_OFFLINE:
219 break;
220 }
221 ret = notifier_from_errno(ret);
222 return ret;
223}
224
225#endif
226
227void __init page_cgroup_init(void)
228{
229 unsigned long pfn;
230 int fail = 0;
231
232 if (mem_cgroup_subsys.disabled)
233 return;
234
235 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
236 if (!pfn_present(pfn))
237 continue;
238 fail = init_section_page_cgroup(pfn);
239 }
240 if (fail) {
241 printk(KERN_CRIT "try cgroup_disable=memory boot option\n");
242 panic("Out of memory");
243 } else {
244 hotplug_memory_notifier(page_cgroup_callback, 0);
245 }
246 printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage);
 247 printk(KERN_INFO "please try cgroup_disable=memory option if you don't"
 248 " want memory cgroups\n");
249}
250
251void __init pgdat_page_cgroup_init(struct pglist_data *pgdat)
252{
253 return;
254}
255
256#endif
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index c69f84fe038d..5e0ffd967452 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -114,8 +114,10 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn)
114 114
115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) 115int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
116{ 116{
117 unsigned long pfn; 117 unsigned long pfn, flags;
118 struct page *page; 118 struct page *page;
119 struct zone *zone;
120 int ret;
119 121
120 pfn = start_pfn; 122 pfn = start_pfn;
121 /* 123 /*
@@ -128,10 +130,13 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
128 if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE) 130 if (page && get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
129 break; 131 break;
130 } 132 }
131 if (pfn < end_pfn) 133 page = __first_valid_page(start_pfn, end_pfn - start_pfn);
134 if ((pfn < end_pfn) || !page)
132 return -EBUSY; 135 return -EBUSY;
133 /* Check all pages are free or Marked as ISOLATED */ 136 /* Check all pages are free or Marked as ISOLATED */
134 if (__test_page_isolated_in_pageblock(start_pfn, end_pfn)) 137 zone = page_zone(page);
135 return 0; 138 spin_lock_irqsave(&zone->lock, flags);
136 return -EBUSY; 139 ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn);
140 spin_unlock_irqrestore(&zone->lock, flags);
141 return ret ? 0 : -EBUSY;
137} 142}
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 0cbe0c60c6bf..a0a14c4d5072 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds. 4 * Copyright (C) 2002, Linus Torvalds.
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version 7 * Initial version
8 * 29Feb2004 kaos@sgi.com 8 * 29Feb2004 kaos@sgi.com
9 * Move worker thread creation to kthread to avoid chewing 9 * Move worker thread creation to kthread to avoid chewing
diff --git a/mm/readahead.c b/mm/readahead.c
index 77e8ddf945e9..bec83c15a78f 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 09Apr2002 akpm@zip.com.au 6 * 09Apr2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp,
229 */ 229 */
230unsigned long max_sane_readahead(unsigned long nr) 230unsigned long max_sane_readahead(unsigned long nr)
231{ 231{
232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) 232 return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE)
233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); 233 + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2);
234} 234}
235 235
diff --git a/mm/rmap.c b/mm/rmap.c
index 0383acfcb068..10993942d6c9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -53,9 +53,47 @@
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
56struct kmem_cache *anon_vma_cachep; 56#include "internal.h"
57 57
58/* This must be called under the mmap_sem. */ 58static struct kmem_cache *anon_vma_cachep;
59
60static inline struct anon_vma *anon_vma_alloc(void)
61{
62 return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
63}
64
65static inline void anon_vma_free(struct anon_vma *anon_vma)
66{
67 kmem_cache_free(anon_vma_cachep, anon_vma);
68}
69
70/**
71 * anon_vma_prepare - attach an anon_vma to a memory region
72 * @vma: the memory region in question
73 *
74 * This makes sure the memory mapping described by 'vma' has
75 * an 'anon_vma' attached to it, so that we can associate the
76 * anonymous pages mapped into it with that anon_vma.
77 *
78 * The common case will be that we already have one, but if
79 * if not we either need to find an adjacent mapping that we
80 * can re-use the anon_vma from (very common when the only
81 * reason for splitting a vma has been mprotect()), or we
82 * allocate a new one.
83 *
84 * Anon-vma allocations are very subtle, because we may have
85 * optimistically looked up an anon_vma in page_lock_anon_vma()
86 * and that may actually touch the spinlock even in the newly
87 * allocated vma (it depends on RCU to make sure that the
88 * anon_vma isn't actually destroyed).
89 *
90 * As a result, we need to do proper anon_vma locking even
91 * for the new allocation. At the same time, we do not want
92 * to do any locking for the common case of already having
93 * an anon_vma.
94 *
95 * This must be called with the mmap_sem held for reading.
96 */
59int anon_vma_prepare(struct vm_area_struct *vma) 97int anon_vma_prepare(struct vm_area_struct *vma)
60{ 98{
61 struct anon_vma *anon_vma = vma->anon_vma; 99 struct anon_vma *anon_vma = vma->anon_vma;
@@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma)
63 might_sleep(); 101 might_sleep();
64 if (unlikely(!anon_vma)) { 102 if (unlikely(!anon_vma)) {
65 struct mm_struct *mm = vma->vm_mm; 103 struct mm_struct *mm = vma->vm_mm;
66 struct anon_vma *allocated, *locked; 104 struct anon_vma *allocated;
67 105
68 anon_vma = find_mergeable_anon_vma(vma); 106 anon_vma = find_mergeable_anon_vma(vma);
69 if (anon_vma) { 107 allocated = NULL;
70 allocated = NULL; 108 if (!anon_vma) {
71 locked = anon_vma;
72 spin_lock(&locked->lock);
73 } else {
74 anon_vma = anon_vma_alloc(); 109 anon_vma = anon_vma_alloc();
75 if (unlikely(!anon_vma)) 110 if (unlikely(!anon_vma))
76 return -ENOMEM; 111 return -ENOMEM;
77 allocated = anon_vma; 112 allocated = anon_vma;
78 locked = NULL;
79 } 113 }
114 spin_lock(&anon_vma->lock);
80 115
81 /* page_table_lock to protect against threads */ 116 /* page_table_lock to protect against threads */
82 spin_lock(&mm->page_table_lock); 117 spin_lock(&mm->page_table_lock);
@@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma)
87 } 122 }
88 spin_unlock(&mm->page_table_lock); 123 spin_unlock(&mm->page_table_lock);
89 124
90 if (locked) 125 spin_unlock(&anon_vma->lock);
91 spin_unlock(&locked->lock);
92 if (unlikely(allocated)) 126 if (unlikely(allocated))
93 anon_vma_free(allocated); 127 anon_vma_free(allocated);
94 } 128 }
@@ -157,7 +191,7 @@ void __init anon_vma_init(void)
157 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
158 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
159 */ 193 */
160static struct anon_vma *page_lock_anon_vma(struct page *page) 194struct anon_vma *page_lock_anon_vma(struct page *page)
161{ 195{
162 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
163 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -177,7 +211,7 @@ out:
177 return NULL; 211 return NULL;
178} 212}
179 213
180static void page_unlock_anon_vma(struct anon_vma *anon_vma) 214void page_unlock_anon_vma(struct anon_vma *anon_vma)
181{ 215{
182 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
183 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
268 return NULL; 302 return NULL;
269} 303}
270 304
305/**
306 * page_mapped_in_vma - check whether a page is really mapped in a VMA
307 * @page: the page to test
308 * @vma: the VMA to test
309 *
310 * Returns 1 if the page is mapped into the page tables of the VMA, 0
311 * if the page is not mapped into the page tables of this VMA. Only
312 * valid for normal file or anonymous VMAs.
313 */
314static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
315{
316 unsigned long address;
317 pte_t *pte;
318 spinlock_t *ptl;
319
320 address = vma_address(page, vma);
321 if (address == -EFAULT) /* out of vma range */
322 return 0;
323 pte = page_check_address(page, vma->vm_mm, address, &ptl, 1);
324 if (!pte) /* the page is not in this mm */
325 return 0;
326 pte_unmap_unlock(pte, ptl);
327
328 return 1;
329}
330
271/* 331/*
272 * Subfunctions of page_referenced: page_referenced_one called 332 * Subfunctions of page_referenced: page_referenced_one called
273 * repeatedly from either page_referenced_anon or page_referenced_file. 333 * repeatedly from either page_referenced_anon or page_referenced_file.
@@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page,
289 if (!pte) 349 if (!pte)
290 goto out; 350 goto out;
291 351
352 /*
353 * Don't want to elevate referenced for mlocked page that gets this far,
354 * in order that it progresses to try_to_unmap and is moved to the
355 * unevictable list.
356 */
292 if (vma->vm_flags & VM_LOCKED) { 357 if (vma->vm_flags & VM_LOCKED) {
293 referenced++;
294 *mapcount = 1; /* break early from loop */ 358 *mapcount = 1; /* break early from loop */
295 } else if (ptep_clear_flush_young_notify(vma, address, pte)) 359 goto out_unmap;
360 }
361
362 if (ptep_clear_flush_young_notify(vma, address, pte))
296 referenced++; 363 referenced++;
297 364
298 /* Pretend the page is referenced if the task has the 365 /* Pretend the page is referenced if the task has the
@@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page,
301 rwsem_is_locked(&mm->mmap_sem)) 368 rwsem_is_locked(&mm->mmap_sem))
302 referenced++; 369 referenced++;
303 370
371out_unmap:
304 (*mapcount)--; 372 (*mapcount)--;
305 pte_unmap_unlock(pte, ptl); 373 pte_unmap_unlock(pte, ptl);
306out: 374out:
@@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page,
390 */ 458 */
391 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) 459 if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont))
392 continue; 460 continue;
393 if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
394 == (VM_LOCKED|VM_MAYSHARE)) {
395 referenced++;
396 break;
397 }
398 referenced += page_referenced_one(page, vma, &mapcount); 461 referenced += page_referenced_one(page, vma, &mapcount);
399 if (!mapcount) 462 if (!mapcount)
400 break; 463 break;
@@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
674 page_clear_dirty(page); 737 page_clear_dirty(page);
675 set_page_dirty(page); 738 set_page_dirty(page);
676 } 739 }
677 740 if (PageAnon(page))
678 mem_cgroup_uncharge_page(page); 741 mem_cgroup_uncharge_page(page);
679 __dec_zone_page_state(page, 742 __dec_zone_page_state(page,
680 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); 743 PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
681 /* 744 /*
@@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
717 * If it's recently referenced (perhaps page_referenced 780 * If it's recently referenced (perhaps page_referenced
718 * skipped over this mm) then we should reactivate it. 781 * skipped over this mm) then we should reactivate it.
719 */ 782 */
720 if (!migration && ((vma->vm_flags & VM_LOCKED) || 783 if (!migration) {
721 (ptep_clear_flush_young_notify(vma, address, pte)))) { 784 if (vma->vm_flags & VM_LOCKED) {
722 ret = SWAP_FAIL; 785 ret = SWAP_MLOCK;
723 goto out_unmap; 786 goto out_unmap;
724 } 787 }
788 if (ptep_clear_flush_young_notify(vma, address, pte)) {
789 ret = SWAP_FAIL;
790 goto out_unmap;
791 }
792 }
725 793
726 /* Nuke the page table entry. */ 794 /* Nuke the page table entry. */
727 flush_cache_page(vma, address, page_to_pfn(page)); 795 flush_cache_page(vma, address, page_to_pfn(page));
@@ -802,12 +870,17 @@ out:
802 * For very sparsely populated VMAs this is a little inefficient - chances are 870 * For very sparsely populated VMAs this is a little inefficient - chances are
803 * there there won't be many ptes located within the scan cluster. In this case 871 * there there won't be many ptes located within the scan cluster. In this case
804 * maybe we could scan further - to the end of the pte page, perhaps. 872 * maybe we could scan further - to the end of the pte page, perhaps.
873 *
874 * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
875 * acquire it without blocking. If vma locked, mlock the pages in the cluster,
876 * rather than unmapping them. If we encounter the "check_page" that vmscan is
877 * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
805 */ 878 */
806#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) 879#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
807#define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) 880#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
808 881
809static void try_to_unmap_cluster(unsigned long cursor, 882static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
810 unsigned int *mapcount, struct vm_area_struct *vma) 883 struct vm_area_struct *vma, struct page *check_page)
811{ 884{
812 struct mm_struct *mm = vma->vm_mm; 885 struct mm_struct *mm = vma->vm_mm;
813 pgd_t *pgd; 886 pgd_t *pgd;
@@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor,
819 struct page *page; 892 struct page *page;
820 unsigned long address; 893 unsigned long address;
821 unsigned long end; 894 unsigned long end;
895 int ret = SWAP_AGAIN;
896 int locked_vma = 0;
822 897
823 address = (vma->vm_start + cursor) & CLUSTER_MASK; 898 address = (vma->vm_start + cursor) & CLUSTER_MASK;
824 end = address + CLUSTER_SIZE; 899 end = address + CLUSTER_SIZE;
@@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor,
829 904
830 pgd = pgd_offset(mm, address); 905 pgd = pgd_offset(mm, address);
831 if (!pgd_present(*pgd)) 906 if (!pgd_present(*pgd))
832 return; 907 return ret;
833 908
834 pud = pud_offset(pgd, address); 909 pud = pud_offset(pgd, address);
835 if (!pud_present(*pud)) 910 if (!pud_present(*pud))
836 return; 911 return ret;
837 912
838 pmd = pmd_offset(pud, address); 913 pmd = pmd_offset(pud, address);
839 if (!pmd_present(*pmd)) 914 if (!pmd_present(*pmd))
840 return; 915 return ret;
916
917 /*
918 * MLOCK_PAGES => feature is configured.
919 * if we can acquire the mmap_sem for read, and vma is VM_LOCKED,
920 * keep the sem while scanning the cluster for mlocking pages.
921 */
922 if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) {
923 locked_vma = (vma->vm_flags & VM_LOCKED);
924 if (!locked_vma)
925 up_read(&vma->vm_mm->mmap_sem); /* don't need it */
926 }
841 927
842 pte = pte_offset_map_lock(mm, pmd, address, &ptl); 928 pte = pte_offset_map_lock(mm, pmd, address, &ptl);
843 929
@@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
850 page = vm_normal_page(vma, address, *pte); 936 page = vm_normal_page(vma, address, *pte);
851 BUG_ON(!page || PageAnon(page)); 937 BUG_ON(!page || PageAnon(page));
852 938
939 if (locked_vma) {
940 mlock_vma_page(page); /* no-op if already mlocked */
941 if (page == check_page)
942 ret = SWAP_MLOCK;
943 continue; /* don't unmap */
944 }
945
853 if (ptep_clear_flush_young_notify(vma, address, pte)) 946 if (ptep_clear_flush_young_notify(vma, address, pte))
854 continue; 947 continue;
855 948
@@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor,
871 (*mapcount)--; 964 (*mapcount)--;
872 } 965 }
873 pte_unmap_unlock(pte - 1, ptl); 966 pte_unmap_unlock(pte - 1, ptl);
967 if (locked_vma)
968 up_read(&vma->vm_mm->mmap_sem);
969 return ret;
874} 970}
875 971
876static int try_to_unmap_anon(struct page *page, int migration) 972/*
973 * common handling for pages mapped in VM_LOCKED vmas
974 */
975static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma)
976{
977 int mlocked = 0;
978
979 if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
980 if (vma->vm_flags & VM_LOCKED) {
981 mlock_vma_page(page);
982 mlocked++; /* really mlocked the page */
983 }
984 up_read(&vma->vm_mm->mmap_sem);
985 }
986 return mlocked;
987}
988
989/**
990 * try_to_unmap_anon - unmap or unlock anonymous page using the object-based
991 * rmap method
992 * @page: the page to unmap/unlock
993 * @unlock: request for unlock rather than unmap [unlikely]
994 * @migration: unmapping for migration - ignored if @unlock
995 *
996 * Find all the mappings of a page using the mapping pointer and the vma chains
997 * contained in the anon_vma struct it points to.
998 *
999 * This function is only called from try_to_unmap/try_to_munlock for
1000 * anonymous pages.
1001 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1002 * where the page was found will be held for write. So, we won't recheck
1003 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 1004 * VM_LOCKED.
1005 */
1006static int try_to_unmap_anon(struct page *page, int unlock, int migration)
877{ 1007{
878 struct anon_vma *anon_vma; 1008 struct anon_vma *anon_vma;
879 struct vm_area_struct *vma; 1009 struct vm_area_struct *vma;
1010 unsigned int mlocked = 0;
880 int ret = SWAP_AGAIN; 1011 int ret = SWAP_AGAIN;
881 1012
1013 if (MLOCK_PAGES && unlikely(unlock))
1014 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
1015
882 anon_vma = page_lock_anon_vma(page); 1016 anon_vma = page_lock_anon_vma(page);
883 if (!anon_vma) 1017 if (!anon_vma)
884 return ret; 1018 return ret;
885 1019
886 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { 1020 list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
887 ret = try_to_unmap_one(page, vma, migration); 1021 if (MLOCK_PAGES && unlikely(unlock)) {
888 if (ret == SWAP_FAIL || !page_mapped(page)) 1022 if (!((vma->vm_flags & VM_LOCKED) &&
889 break; 1023 page_mapped_in_vma(page, vma)))
1024 continue; /* must visit all unlocked vmas */
1025 ret = SWAP_MLOCK; /* saw at least one mlocked vma */
1026 } else {
1027 ret = try_to_unmap_one(page, vma, migration);
1028 if (ret == SWAP_FAIL || !page_mapped(page))
1029 break;
1030 }
1031 if (ret == SWAP_MLOCK) {
1032 mlocked = try_to_mlock_page(page, vma);
1033 if (mlocked)
1034 break; /* stop if actually mlocked page */
1035 }
890 } 1036 }
891 1037
892 page_unlock_anon_vma(anon_vma); 1038 page_unlock_anon_vma(anon_vma);
1039
1040 if (mlocked)
1041 ret = SWAP_MLOCK; /* actually mlocked the page */
1042 else if (ret == SWAP_MLOCK)
1043 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
1044
893 return ret; 1045 return ret;
894} 1046}
895 1047
896/** 1048/**
897 * try_to_unmap_file - unmap file page using the object-based rmap method 1049 * try_to_unmap_file - unmap/unlock file page using the object-based rmap method
898 * @page: the page to unmap 1050 * @page: the page to unmap/unlock
899 * @migration: migration flag 1051 * @unlock: request for unlock rather than unmap [unlikely]
1052 * @migration: unmapping for migration - ignored if @unlock
900 * 1053 *
901 * Find all the mappings of a page using the mapping pointer and the vma chains 1054 * Find all the mappings of a page using the mapping pointer and the vma chains
902 * contained in the address_space struct it points to. 1055 * contained in the address_space struct it points to.
903 * 1056 *
904 * This function is only called from try_to_unmap for object-based pages. 1057 * This function is only called from try_to_unmap/try_to_munlock for
1058 * object-based pages.
1059 * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
1060 * where the page was found will be held for write. So, we won't recheck
1061 * vm_flags for that VMA. That should be OK, because that vma shouldn't be
 1062 * VM_LOCKED.
905 */ 1063 */
906static int try_to_unmap_file(struct page *page, int migration) 1064static int try_to_unmap_file(struct page *page, int unlock, int migration)
907{ 1065{
908 struct address_space *mapping = page->mapping; 1066 struct address_space *mapping = page->mapping;
909 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 1067 pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration)
914 unsigned long max_nl_cursor = 0; 1072 unsigned long max_nl_cursor = 0;
915 unsigned long max_nl_size = 0; 1073 unsigned long max_nl_size = 0;
916 unsigned int mapcount; 1074 unsigned int mapcount;
1075 unsigned int mlocked = 0;
1076
1077 if (MLOCK_PAGES && unlikely(unlock))
1078 ret = SWAP_SUCCESS; /* default for try_to_munlock() */
917 1079
918 spin_lock(&mapping->i_mmap_lock); 1080 spin_lock(&mapping->i_mmap_lock);
919 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { 1081 vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
920 ret = try_to_unmap_one(page, vma, migration); 1082 if (MLOCK_PAGES && unlikely(unlock)) {
921 if (ret == SWAP_FAIL || !page_mapped(page)) 1083 if (!(vma->vm_flags & VM_LOCKED))
922 goto out; 1084 continue; /* must visit all vmas */
1085 ret = SWAP_MLOCK;
1086 } else {
1087 ret = try_to_unmap_one(page, vma, migration);
1088 if (ret == SWAP_FAIL || !page_mapped(page))
1089 goto out;
1090 }
1091 if (ret == SWAP_MLOCK) {
1092 mlocked = try_to_mlock_page(page, vma);
1093 if (mlocked)
1094 break; /* stop if actually mlocked page */
1095 }
923 } 1096 }
924 1097
1098 if (mlocked)
1099 goto out;
1100
925 if (list_empty(&mapping->i_mmap_nonlinear)) 1101 if (list_empty(&mapping->i_mmap_nonlinear))
926 goto out; 1102 goto out;
927 1103
928 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1104 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
929 shared.vm_set.list) { 1105 shared.vm_set.list) {
930 if ((vma->vm_flags & VM_LOCKED) && !migration) 1106 if (MLOCK_PAGES && unlikely(unlock)) {
1107 if (!(vma->vm_flags & VM_LOCKED))
1108 continue; /* must visit all vmas */
1109 ret = SWAP_MLOCK; /* leave mlocked == 0 */
1110 goto out; /* no need to look further */
1111 }
1112 if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED))
931 continue; 1113 continue;
932 cursor = (unsigned long) vma->vm_private_data; 1114 cursor = (unsigned long) vma->vm_private_data;
933 if (cursor > max_nl_cursor) 1115 if (cursor > max_nl_cursor)
@@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration)
937 max_nl_size = cursor; 1119 max_nl_size = cursor;
938 } 1120 }
939 1121
940 if (max_nl_size == 0) { /* any nonlinears locked or reserved */ 1122 if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
941 ret = SWAP_FAIL; 1123 ret = SWAP_FAIL;
942 goto out; 1124 goto out;
943 } 1125 }
@@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration)
961 do { 1143 do {
962 list_for_each_entry(vma, &mapping->i_mmap_nonlinear, 1144 list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
963 shared.vm_set.list) { 1145 shared.vm_set.list) {
964 if ((vma->vm_flags & VM_LOCKED) && !migration) 1146 if (!MLOCK_PAGES && !migration &&
1147 (vma->vm_flags & VM_LOCKED))
965 continue; 1148 continue;
966 cursor = (unsigned long) vma->vm_private_data; 1149 cursor = (unsigned long) vma->vm_private_data;
967 while ( cursor < max_nl_cursor && 1150 while ( cursor < max_nl_cursor &&
968 cursor < vma->vm_end - vma->vm_start) { 1151 cursor < vma->vm_end - vma->vm_start) {
969 try_to_unmap_cluster(cursor, &mapcount, vma); 1152 ret = try_to_unmap_cluster(cursor, &mapcount,
1153 vma, page);
1154 if (ret == SWAP_MLOCK)
1155 mlocked = 2; /* to return below */
970 cursor += CLUSTER_SIZE; 1156 cursor += CLUSTER_SIZE;
971 vma->vm_private_data = (void *) cursor; 1157 vma->vm_private_data = (void *) cursor;
972 if ((int)mapcount <= 0) 1158 if ((int)mapcount <= 0)
@@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration)
987 vma->vm_private_data = NULL; 1173 vma->vm_private_data = NULL;
988out: 1174out:
989 spin_unlock(&mapping->i_mmap_lock); 1175 spin_unlock(&mapping->i_mmap_lock);
1176 if (mlocked)
1177 ret = SWAP_MLOCK; /* actually mlocked the page */
1178 else if (ret == SWAP_MLOCK)
1179 ret = SWAP_AGAIN; /* saw VM_LOCKED vma */
990 return ret; 1180 return ret;
991} 1181}
992 1182
@@ -1002,6 +1192,7 @@ out:
1002 * SWAP_SUCCESS - we succeeded in removing all mappings 1192 * SWAP_SUCCESS - we succeeded in removing all mappings
1003 * SWAP_AGAIN - we missed a mapping, try again later 1193 * SWAP_AGAIN - we missed a mapping, try again later
1004 * SWAP_FAIL - the page is unswappable 1194 * SWAP_FAIL - the page is unswappable
1195 * SWAP_MLOCK - page is mlocked.
1005 */ 1196 */
1006int try_to_unmap(struct page *page, int migration) 1197int try_to_unmap(struct page *page, int migration)
1007{ 1198{
@@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration)
1010 BUG_ON(!PageLocked(page)); 1201 BUG_ON(!PageLocked(page));
1011 1202
1012 if (PageAnon(page)) 1203 if (PageAnon(page))
1013 ret = try_to_unmap_anon(page, migration); 1204 ret = try_to_unmap_anon(page, 0, migration);
1014 else 1205 else
1015 ret = try_to_unmap_file(page, migration); 1206 ret = try_to_unmap_file(page, 0, migration);
1016 1207 if (ret != SWAP_MLOCK && !page_mapped(page))
1017 if (!page_mapped(page))
1018 ret = SWAP_SUCCESS; 1208 ret = SWAP_SUCCESS;
1019 return ret; 1209 return ret;
1020} 1210}
1021 1211
1212#ifdef CONFIG_UNEVICTABLE_LRU
1213/**
1214 * try_to_munlock - try to munlock a page
1215 * @page: the page to be munlocked
1216 *
1217 * Called from munlock code. Checks all of the VMAs mapping the page
1218 * to make sure nobody else has this page mlocked. The page will be
1219 * returned with PG_mlocked cleared if no other vmas have it mlocked.
1220 *
1221 * Return values are:
1222 *
1223 * SWAP_SUCCESS - no vma's holding page mlocked.
1224 * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem
1225 * SWAP_MLOCK - page is now mlocked.
1226 */
1227int try_to_munlock(struct page *page)
1228{
1229 VM_BUG_ON(!PageLocked(page) || PageLRU(page));
1230
1231 if (PageAnon(page))
1232 return try_to_unmap_anon(page, 1, 0);
1233 else
1234 return try_to_unmap_file(page, 1, 0);
1235}
1236#endif
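
The rmap changes above give try_to_unmap() a fourth return value, SWAP_MLOCK, and add try_to_munlock() for the munlock path. A hedged sketch of how a reclaim-style caller might dispatch on the result; only the SWAP_* codes and the try_to_unmap() signature come from the patch, the actions in the comments are placeholders:

	switch (try_to_unmap(page, 0)) {
	case SWAP_SUCCESS:
		/* all ptes removed; proceed with pageout/free */
		break;
	case SWAP_AGAIN:
		/* missed a mapping; keep the page and retry later */
		break;
	case SWAP_FAIL:
		/* unswappable; put it back on the active list */
		break;
	case SWAP_MLOCK:
		/* mapped into a VM_LOCKED vma; belongs on the unevictable list */
		break;
	}
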
diff --git a/mm/shmem.c b/mm/shmem.c
index 04fb4f1ab88e..0ed075215e5f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -50,14 +50,12 @@
50#include <linux/migrate.h> 50#include <linux/migrate.h>
51#include <linux/highmem.h> 51#include <linux/highmem.h>
52#include <linux/seq_file.h> 52#include <linux/seq_file.h>
53#include <linux/magic.h>
53 54
54#include <asm/uaccess.h> 55#include <asm/uaccess.h>
55#include <asm/div64.h> 56#include <asm/div64.h>
56#include <asm/pgtable.h> 57#include <asm/pgtable.h>
57 58
58/* This magic number is used in glibc for posix shared memory */
59#define TMPFS_MAGIC 0x01021994
60
61#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) 59#define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
62#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) 60#define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
63#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) 61#define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
@@ -163,8 +161,8 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
163 */ 161 */
164static inline int shmem_acct_size(unsigned long flags, loff_t size) 162static inline int shmem_acct_size(unsigned long flags, loff_t size)
165{ 163{
166 return (flags & VM_ACCOUNT)? 164 return (flags & VM_ACCOUNT) ?
167 security_vm_enough_memory(VM_ACCT(size)): 0; 165 security_vm_enough_memory_kern(VM_ACCT(size)) : 0;
168} 166}
169 167
170static inline void shmem_unacct_size(unsigned long flags, loff_t size) 168static inline void shmem_unacct_size(unsigned long flags, loff_t size)
@@ -181,8 +179,8 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
181 */ 179 */
182static inline int shmem_acct_block(unsigned long flags) 180static inline int shmem_acct_block(unsigned long flags)
183{ 181{
184 return (flags & VM_ACCOUNT)? 182 return (flags & VM_ACCOUNT) ?
185 0: security_vm_enough_memory(VM_ACCT(PAGE_CACHE_SIZE)); 183 0 : security_vm_enough_memory_kern(VM_ACCT(PAGE_CACHE_SIZE));
186} 184}
187 185
188static inline void shmem_unacct_blocks(unsigned long flags, long pages) 186static inline void shmem_unacct_blocks(unsigned long flags, long pages)
@@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops;
201 199
202static struct backing_dev_info shmem_backing_dev_info __read_mostly = { 200static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
203 .ra_pages = 0, /* No readahead */ 201 .ra_pages = 0, /* No readahead */
204 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 202 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
205 .unplug_io_fn = default_unplug_io_fn, 203 .unplug_io_fn = default_unplug_io_fn,
206}; 204};
207 205
@@ -1369,6 +1367,7 @@ repeat:
1369 error = -ENOMEM; 1367 error = -ENOMEM;
1370 goto failed; 1368 goto failed;
1371 } 1369 }
1370 SetPageSwapBacked(filepage);
1372 1371
1373 /* Precharge page while we can wait, compensate after */ 1372 /* Precharge page while we can wait, compensate after */
1374 error = mem_cgroup_cache_charge(filepage, current->mm, 1373 error = mem_cgroup_cache_charge(filepage, current->mm,
@@ -1478,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
1478 if (!user_shm_lock(inode->i_size, user)) 1477 if (!user_shm_lock(inode->i_size, user))
1479 goto out_nomem; 1478 goto out_nomem;
1480 info->flags |= VM_LOCKED; 1479 info->flags |= VM_LOCKED;
1480 mapping_set_unevictable(file->f_mapping);
1481 } 1481 }
1482 if (!lock && (info->flags & VM_LOCKED) && user) { 1482 if (!lock && (info->flags & VM_LOCKED) && user) {
1483 user_shm_unlock(inode->i_size, user); 1483 user_shm_unlock(inode->i_size, user);
1484 info->flags &= ~VM_LOCKED; 1484 info->flags &= ~VM_LOCKED;
1485 mapping_clear_unevictable(file->f_mapping);
1486 scan_mapping_unevictable_pages(file->f_mapping);
1485 } 1487 }
1486 retval = 0; 1488 retval = 0;
1489
1487out_nomem: 1490out_nomem:
1488 spin_unlock(&info->lock); 1491 spin_unlock(&info->lock);
1489 return retval; 1492 return retval;
@@ -2582,6 +2585,7 @@ put_memory:
2582 shmem_unacct_size(flags, size); 2585 shmem_unacct_size(flags, size);
2583 return ERR_PTR(error); 2586 return ERR_PTR(error);
2584} 2587}
2588EXPORT_SYMBOL_GPL(shmem_file_setup);
2585 2589
2586/** 2590/**
2587 * shmem_zero_setup - setup a shared anonymous mapping 2591 * shmem_zero_setup - setup a shared anonymous mapping
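
The shmem_lock() hunk ties SHM_LOCK to the new unevictable infrastructure: locking a segment marks its mapping unevictable, unlocking clears the flag and rescues pages already stranded on the unevictable list. The same pattern in isolation (identifiers as in the patch; the wrapper function and its argument are illustrative only):

	static void set_segment_locked(struct address_space *mapping, int lock)
	{
		if (lock) {
			/* pages of this mapping must stay off the reclaim LRUs */
			mapping_set_unevictable(mapping);
		} else {
			mapping_clear_unevictable(mapping);
			/* bring pages parked on the unevictable list back to the normal LRUs */
			scan_mapping_unevictable_pages(mapping);
		}
	}
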
diff --git a/mm/slab.c b/mm/slab.c
index e76eee466886..09187517f9dc 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -95,6 +95,7 @@
95#include <linux/init.h> 95#include <linux/init.h>
96#include <linux/compiler.h> 96#include <linux/compiler.h>
97#include <linux/cpuset.h> 97#include <linux/cpuset.h>
98#include <linux/proc_fs.h>
98#include <linux/seq_file.h> 99#include <linux/seq_file.h>
99#include <linux/notifier.h> 100#include <linux/notifier.h>
100#include <linux/kallsyms.h> 101#include <linux/kallsyms.h>
@@ -4258,7 +4259,7 @@ static int s_show(struct seq_file *m, void *p)
4258 * + further values on SMP and with statistics enabled 4259 * + further values on SMP and with statistics enabled
4259 */ 4260 */
4260 4261
4261const struct seq_operations slabinfo_op = { 4262static const struct seq_operations slabinfo_op = {
4262 .start = s_start, 4263 .start = s_start,
4263 .next = s_next, 4264 .next = s_next,
4264 .stop = s_stop, 4265 .stop = s_stop,
@@ -4315,6 +4316,19 @@ ssize_t slabinfo_write(struct file *file, const char __user * buffer,
4315 return res; 4316 return res;
4316} 4317}
4317 4318
4319static int slabinfo_open(struct inode *inode, struct file *file)
4320{
4321 return seq_open(file, &slabinfo_op);
4322}
4323
4324static const struct file_operations proc_slabinfo_operations = {
4325 .open = slabinfo_open,
4326 .read = seq_read,
4327 .write = slabinfo_write,
4328 .llseek = seq_lseek,
4329 .release = seq_release,
4330};
4331
4318#ifdef CONFIG_DEBUG_SLAB_LEAK 4332#ifdef CONFIG_DEBUG_SLAB_LEAK
4319 4333
4320static void *leaks_start(struct seq_file *m, loff_t *pos) 4334static void *leaks_start(struct seq_file *m, loff_t *pos)
@@ -4443,13 +4457,47 @@ static int leaks_show(struct seq_file *m, void *p)
4443 return 0; 4457 return 0;
4444} 4458}
4445 4459
4446const struct seq_operations slabstats_op = { 4460static const struct seq_operations slabstats_op = {
4447 .start = leaks_start, 4461 .start = leaks_start,
4448 .next = s_next, 4462 .next = s_next,
4449 .stop = s_stop, 4463 .stop = s_stop,
4450 .show = leaks_show, 4464 .show = leaks_show,
4451}; 4465};
4466
4467static int slabstats_open(struct inode *inode, struct file *file)
4468{
4469 unsigned long *n = kzalloc(PAGE_SIZE, GFP_KERNEL);
4470 int ret = -ENOMEM;
4471 if (n) {
4472 ret = seq_open(file, &slabstats_op);
4473 if (!ret) {
4474 struct seq_file *m = file->private_data;
4475 *n = PAGE_SIZE / (2 * sizeof(unsigned long));
4476 m->private = n;
4477 n = NULL;
4478 }
4479 kfree(n);
4480 }
4481 return ret;
4482}
4483
4484static const struct file_operations proc_slabstats_operations = {
4485 .open = slabstats_open,
4486 .read = seq_read,
4487 .llseek = seq_lseek,
4488 .release = seq_release_private,
4489};
4490#endif
4491
4492static int __init slab_proc_init(void)
4493{
4494 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4495#ifdef CONFIG_DEBUG_SLAB_LEAK
4496 proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
4452#endif 4497#endif
4498 return 0;
4499}
4500module_init(slab_proc_init);
4453#endif 4501#endif
4454 4502
4455/** 4503/**
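
With slabinfo_op made static, /proc/slabinfo is now registered from within slab.c itself. The registration boilerplate reduced to its skeleton (the "example" names and example_seq_ops are placeholders; the proc_fs and seq_file calls are the ones used in the hunk above):

	static int example_open(struct inode *inode, struct file *file)
	{
		/* example_seq_ops: an ordinary start/next/stop/show table, assumed defined elsewhere */
		return seq_open(file, &example_seq_ops);
	}

	static const struct file_operations proc_example_operations = {
		.open		= example_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= seq_release,
	};

	static int __init example_proc_init(void)
	{
		proc_create("example", S_IRUGO, NULL, &proc_example_operations);
		return 0;
	}
	module_init(example_proc_init);
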
diff --git a/mm/slob.c b/mm/slob.c
index 4c82dd41f32e..cb675d126791 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -514,9 +514,11 @@ size_t ksize(const void *block)
514 return 0; 514 return 0;
515 515
516 sp = (struct slob_page *)virt_to_page(block); 516 sp = (struct slob_page *)virt_to_page(block);
517 if (slob_page(sp)) 517 if (slob_page(sp)) {
518 return ((slob_t *)block - 1)->units + SLOB_UNIT; 518 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
519 else 519 unsigned int *m = (unsigned int *)(block - align);
520 return SLOB_UNITS(*m) * SLOB_UNIT;
521 } else
520 return sp->page.private; 522 return sp->page.private;
521} 523}
522 524
diff --git a/mm/slub.c b/mm/slub.c
index fb486d5540f8..7ad489af9561 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -14,6 +14,7 @@
14#include <linux/interrupt.h> 14#include <linux/interrupt.h>
15#include <linux/bitops.h> 15#include <linux/bitops.h>
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 18#include <linux/seq_file.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
19#include <linux/cpuset.h> 20#include <linux/cpuset.h>
@@ -1932,6 +1933,7 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
1932 INIT_LIST_HEAD(&n->partial); 1933 INIT_LIST_HEAD(&n->partial);
1933#ifdef CONFIG_SLUB_DEBUG 1934#ifdef CONFIG_SLUB_DEBUG
1934 atomic_long_set(&n->nr_slabs, 0); 1935 atomic_long_set(&n->nr_slabs, 0);
1936 atomic_long_set(&n->total_objects, 0);
1935 INIT_LIST_HEAD(&n->full); 1937 INIT_LIST_HEAD(&n->full);
1936#endif 1938#endif
1937} 1939}
@@ -4416,14 +4418,6 @@ __initcall(slab_sysfs_init);
4416 * The /proc/slabinfo ABI 4418 * The /proc/slabinfo ABI
4417 */ 4419 */
4418#ifdef CONFIG_SLABINFO 4420#ifdef CONFIG_SLABINFO
4419
4420ssize_t slabinfo_write(struct file *file, const char __user *buffer,
4421 size_t count, loff_t *ppos)
4422{
4423 return -EINVAL;
4424}
4425
4426
4427static void print_slabinfo_header(struct seq_file *m) 4421static void print_slabinfo_header(struct seq_file *m)
4428{ 4422{
4429 seq_puts(m, "slabinfo - version: 2.1\n"); 4423 seq_puts(m, "slabinfo - version: 2.1\n");
@@ -4491,11 +4485,29 @@ static int s_show(struct seq_file *m, void *p)
4491 return 0; 4485 return 0;
4492} 4486}
4493 4487
4494const struct seq_operations slabinfo_op = { 4488static const struct seq_operations slabinfo_op = {
4495 .start = s_start, 4489 .start = s_start,
4496 .next = s_next, 4490 .next = s_next,
4497 .stop = s_stop, 4491 .stop = s_stop,
4498 .show = s_show, 4492 .show = s_show,
4499}; 4493};
4500 4494
4495static int slabinfo_open(struct inode *inode, struct file *file)
4496{
4497 return seq_open(file, &slabinfo_op);
4498}
4499
4500static const struct file_operations proc_slabinfo_operations = {
4501 .open = slabinfo_open,
4502 .read = seq_read,
4503 .llseek = seq_lseek,
4504 .release = seq_release,
4505};
4506
4507static int __init slab_proc_init(void)
4508{
4509 proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
4510 return 0;
4511}
4512module_init(slab_proc_init);
4501#endif /* CONFIG_SLABINFO */ 4513#endif /* CONFIG_SLABINFO */
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index a91b5f8fcaf6..a13ea6401ae7 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -64,7 +64,7 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
64 unsigned long pfn = pte_pfn(*pte); 64 unsigned long pfn = pte_pfn(*pte);
65 int actual_node = early_pfn_to_nid(pfn); 65 int actual_node = early_pfn_to_nid(pfn);
66 66
67 if (actual_node != node) 67 if (node_distance(actual_node, node) > LOCAL_DISTANCE)
68 printk(KERN_WARNING "[%lx-%lx] potential offnode " 68 printk(KERN_WARNING "[%lx-%lx] potential offnode "
69 "page_structs\n", start, end - 1); 69 "page_structs\n", start, end - 1);
70} 70}
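
vmemmap_verify() now warns only when the backing memory is genuinely remote, i.e. farther away than LOCAL_DISTANCE, instead of on any node mismatch. A toy, self-contained model with a made-up distance table (10 is the conventional LOCAL_DISTANCE value; real tables come from the SLIT/architecture code):

	#include <stdio.h>

	#define LOCAL_DISTANCE	10	/* conventional "same locality" distance */

	/* made-up SLIT-style table: node 1 is local to node 0, node 2 is remote */
	static int node_distance_model(int a, int b)
	{
		static const int d[3][3] = {
			{ 10, 10, 40 },
			{ 10, 10, 40 },
			{ 40, 40, 10 },
		};
		return d[a][b];
	}

	int main(void)
	{
		int node = 0, actual_node;

		for (actual_node = 0; actual_node < 3; actual_node++)
			printf("page_structs on node %d: %s\n", actual_node,
			       node_distance_model(actual_node, node) > LOCAL_DISTANCE ?
			       "potential offnode" : "ok");
		return 0;
	}
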
diff --git a/mm/swap.c b/mm/swap.c
index 9e0cb3118079..2152e48a7b8f 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -31,11 +31,12 @@
31#include <linux/backing-dev.h> 31#include <linux/backing-dev.h>
32#include <linux/memcontrol.h> 32#include <linux/memcontrol.h>
33 33
34#include "internal.h"
35
34/* How many pages do we try to swap or page in/out together? */ 36/* How many pages do we try to swap or page in/out together? */
35int page_cluster; 37int page_cluster;
36 38
37static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); 39static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs);
38static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs);
39static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); 40static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
40 41
41/* 42/*
@@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec)
116 zone = pagezone; 117 zone = pagezone;
117 spin_lock(&zone->lru_lock); 118 spin_lock(&zone->lru_lock);
118 } 119 }
119 if (PageLRU(page) && !PageActive(page)) { 120 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
120 list_move_tail(&page->lru, &zone->inactive_list); 121 int lru = page_is_file_cache(page);
122 list_move_tail(&page->lru, &zone->lru[lru].list);
121 pgmoved++; 123 pgmoved++;
122 } 124 }
123 } 125 }
@@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec)
136void rotate_reclaimable_page(struct page *page) 138void rotate_reclaimable_page(struct page *page)
137{ 139{
138 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && 140 if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) &&
139 PageLRU(page)) { 141 !PageUnevictable(page) && PageLRU(page)) {
140 struct pagevec *pvec; 142 struct pagevec *pvec;
141 unsigned long flags; 143 unsigned long flags;
142 144
@@ -157,12 +159,19 @@ void activate_page(struct page *page)
157 struct zone *zone = page_zone(page); 159 struct zone *zone = page_zone(page);
158 160
159 spin_lock_irq(&zone->lru_lock); 161 spin_lock_irq(&zone->lru_lock);
160 if (PageLRU(page) && !PageActive(page)) { 162 if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
161 del_page_from_inactive_list(zone, page); 163 int file = page_is_file_cache(page);
164 int lru = LRU_BASE + file;
165 del_page_from_lru_list(zone, page, lru);
166
162 SetPageActive(page); 167 SetPageActive(page);
163 add_page_to_active_list(zone, page); 168 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru);
164 __count_vm_event(PGACTIVATE); 170 __count_vm_event(PGACTIVATE);
165 mem_cgroup_move_lists(page, true); 171 mem_cgroup_move_lists(page, lru);
172
173 zone->recent_rotated[!!file]++;
174 zone->recent_scanned[!!file]++;
166 } 175 }
167 spin_unlock_irq(&zone->lru_lock); 176 spin_unlock_irq(&zone->lru_lock);
168} 177}
@@ -176,7 +185,8 @@ void activate_page(struct page *page)
176 */ 185 */
177void mark_page_accessed(struct page *page) 186void mark_page_accessed(struct page *page)
178{ 187{
179 if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { 188 if (!PageActive(page) && !PageUnevictable(page) &&
189 PageReferenced(page) && PageLRU(page)) {
180 activate_page(page); 190 activate_page(page);
181 ClearPageReferenced(page); 191 ClearPageReferenced(page);
182 } else if (!PageReferenced(page)) { 192 } else if (!PageReferenced(page)) {
@@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page)
186 196
187EXPORT_SYMBOL(mark_page_accessed); 197EXPORT_SYMBOL(mark_page_accessed);
188 198
189/** 199void __lru_cache_add(struct page *page, enum lru_list lru)
190 * lru_cache_add: add a page to the page lists
191 * @page: the page to add
192 */
193void lru_cache_add(struct page *page)
194{ 200{
195 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); 201 struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru];
196 202
197 page_cache_get(page); 203 page_cache_get(page);
198 if (!pagevec_add(pvec, page)) 204 if (!pagevec_add(pvec, page))
199 __pagevec_lru_add(pvec); 205 ____pagevec_lru_add(pvec, lru);
200 put_cpu_var(lru_add_pvecs); 206 put_cpu_var(lru_add_pvecs);
201} 207}
202 208
203void lru_cache_add_active(struct page *page) 209/**
210 * lru_cache_add_lru - add a page to a page list
211 * @page: the page to be added to the LRU.
212 * @lru: the LRU list to which the page is added.
213 */
214void lru_cache_add_lru(struct page *page, enum lru_list lru)
204{ 215{
205 struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); 216 if (PageActive(page)) {
217 VM_BUG_ON(PageUnevictable(page));
218 ClearPageActive(page);
219 } else if (PageUnevictable(page)) {
220 VM_BUG_ON(PageActive(page));
221 ClearPageUnevictable(page);
222 }
206 223
207 page_cache_get(page); 224 VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page));
208 if (!pagevec_add(pvec, page)) 225 __lru_cache_add(page, lru);
209 __pagevec_lru_add_active(pvec); 226}
210 put_cpu_var(lru_add_active_pvecs); 227
228/**
229 * add_page_to_unevictable_list - add a page to the unevictable list
230 * @page: the page to be added to the unevictable list
231 *
232 * Add page directly to its zone's unevictable list. To avoid races with
233 * tasks that might be making the page evictable, through eg. munlock,
234 * munmap or exit, while it's not on the lru, we want to add the page
235 * while it's locked or otherwise "invisible" to other tasks. This is
236 * difficult to do when using the pagevec cache, so bypass that.
237 */
238void add_page_to_unevictable_list(struct page *page)
239{
240 struct zone *zone = page_zone(page);
241
242 spin_lock_irq(&zone->lru_lock);
243 SetPageUnevictable(page);
244 SetPageLRU(page);
245 add_page_to_lru_list(zone, page, LRU_UNEVICTABLE);
246 spin_unlock_irq(&zone->lru_lock);
247}
248
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
211} 266}
212 267
213/* 268/*
@@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page)
217 */ 272 */
218static void drain_cpu_pagevecs(int cpu) 273static void drain_cpu_pagevecs(int cpu)
219{ 274{
275 struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu);
220 struct pagevec *pvec; 276 struct pagevec *pvec;
277 int lru;
221 278
222 pvec = &per_cpu(lru_add_pvecs, cpu); 279 for_each_lru(lru) {
223 if (pagevec_count(pvec)) 280 pvec = &pvecs[lru - LRU_BASE];
224 __pagevec_lru_add(pvec); 281 if (pagevec_count(pvec))
225 282 ____pagevec_lru_add(pvec, lru);
226 pvec = &per_cpu(lru_add_active_pvecs, cpu); 283 }
227 if (pagevec_count(pvec))
228 __pagevec_lru_add_active(pvec);
229 284
230 pvec = &per_cpu(lru_rotate_pvecs, cpu); 285 pvec = &per_cpu(lru_rotate_pvecs, cpu);
231 if (pagevec_count(pvec)) { 286 if (pagevec_count(pvec)) {
@@ -244,7 +299,7 @@ void lru_add_drain(void)
244 put_cpu(); 299 put_cpu();
245} 300}
246 301
247#ifdef CONFIG_NUMA 302#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU)
248static void lru_add_drain_per_cpu(struct work_struct *dummy) 303static void lru_add_drain_per_cpu(struct work_struct *dummy)
249{ 304{
250 lru_add_drain(); 305 lru_add_drain();
@@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold)
308 363
309 if (PageLRU(page)) { 364 if (PageLRU(page)) {
310 struct zone *pagezone = page_zone(page); 365 struct zone *pagezone = page_zone(page);
366
311 if (pagezone != zone) { 367 if (pagezone != zone) {
312 if (zone) 368 if (zone)
313 spin_unlock_irqrestore(&zone->lru_lock, 369 spin_unlock_irqrestore(&zone->lru_lock,
@@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec)
380 * Add the passed pages to the LRU, then drop the caller's refcount 436 * Add the passed pages to the LRU, then drop the caller's refcount
381 * on them. Reinitialises the caller's pagevec. 437 * on them. Reinitialises the caller's pagevec.
382 */ 438 */
383void __pagevec_lru_add(struct pagevec *pvec) 439void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
384{ 440{
385 int i; 441 int i;
386 struct zone *zone = NULL; 442 struct zone *zone = NULL;
443 VM_BUG_ON(is_unevictable_lru(lru));
387 444
388 for (i = 0; i < pagevec_count(pvec); i++) { 445 for (i = 0; i < pagevec_count(pvec); i++) {
389 struct page *page = pvec->pages[i]; 446 struct page *page = pvec->pages[i];
@@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec)
395 zone = pagezone; 452 zone = pagezone;
396 spin_lock_irq(&zone->lru_lock); 453 spin_lock_irq(&zone->lru_lock);
397 } 454 }
455 VM_BUG_ON(PageActive(page));
456 VM_BUG_ON(PageUnevictable(page));
398 VM_BUG_ON(PageLRU(page)); 457 VM_BUG_ON(PageLRU(page));
399 SetPageLRU(page); 458 SetPageLRU(page);
400 add_page_to_inactive_list(zone, page); 459 if (is_active_lru(lru))
460 SetPageActive(page);
461 add_page_to_lru_list(zone, page, lru);
401 } 462 }
402 if (zone) 463 if (zone)
403 spin_unlock_irq(&zone->lru_lock); 464 spin_unlock_irq(&zone->lru_lock);
@@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec)
405 pagevec_reinit(pvec); 466 pagevec_reinit(pvec);
406} 467}
407 468
408EXPORT_SYMBOL(__pagevec_lru_add); 469EXPORT_SYMBOL(____pagevec_lru_add);
409 470
410void __pagevec_lru_add_active(struct pagevec *pvec) 471/*
472 * Try to drop buffers from the pages in a pagevec
473 */
474void pagevec_strip(struct pagevec *pvec)
411{ 475{
412 int i; 476 int i;
413 struct zone *zone = NULL;
414 477
415 for (i = 0; i < pagevec_count(pvec); i++) { 478 for (i = 0; i < pagevec_count(pvec); i++) {
416 struct page *page = pvec->pages[i]; 479 struct page *page = pvec->pages[i];
417 struct zone *pagezone = page_zone(page);
418 480
419 if (pagezone != zone) { 481 if (PagePrivate(page) && trylock_page(page)) {
420 if (zone) 482 if (PagePrivate(page))
421 spin_unlock_irq(&zone->lru_lock); 483 try_to_release_page(page, 0);
422 zone = pagezone; 484 unlock_page(page);
423 spin_lock_irq(&zone->lru_lock);
424 } 485 }
425 VM_BUG_ON(PageLRU(page));
426 SetPageLRU(page);
427 VM_BUG_ON(PageActive(page));
428 SetPageActive(page);
429 add_page_to_active_list(zone, page);
430 } 486 }
431 if (zone)
432 spin_unlock_irq(&zone->lru_lock);
433 release_pages(pvec->pages, pvec->nr, pvec->cold);
434 pagevec_reinit(pvec);
435} 487}
436 488
437/* 489/**
438 * Try to drop buffers from the pages in a pagevec 490 * pagevec_swap_free - try to free swap space from the pages in a pagevec
491 * @pvec: pagevec with swapcache pages to free the swap space of
492 *
493 * The caller needs to hold an extra reference to each page and
494 * not hold the page lock on the pages. This function uses a
495 * trylock on the page lock so it may not always free the swap
496 * space associated with a page.
439 */ 497 */
440void pagevec_strip(struct pagevec *pvec) 498void pagevec_swap_free(struct pagevec *pvec)
441{ 499{
442 int i; 500 int i;
443 501
444 for (i = 0; i < pagevec_count(pvec); i++) { 502 for (i = 0; i < pagevec_count(pvec); i++) {
445 struct page *page = pvec->pages[i]; 503 struct page *page = pvec->pages[i];
446 504
447 if (PagePrivate(page) && trylock_page(page)) { 505 if (PageSwapCache(page) && trylock_page(page)) {
448 if (PagePrivate(page)) 506 if (PageSwapCache(page))
449 try_to_release_page(page, 0); 507 remove_exclusive_swap_page_ref(page);
450 unlock_page(page); 508 unlock_page(page);
451 } 509 }
452 } 510 }
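
The swap.c rework replaces the two per-CPU pagevecs with one per LRU list and routes every addition through __lru_cache_add(page, lru). The index arithmetic used by activate_page() and friends, pulled out as a sketch; the enum layout (anon lists at LRU_BASE, file lists offset by page_is_file_cache(), active lists offset by LRU_ACTIVE) comes from this series' mmzone.h, which is not part of this diff:

	static enum lru_list page_lru_sketch(struct page *page, int active)
	{
		enum lru_list lru = LRU_BASE + page_is_file_cache(page);

		if (active)
			lru += LRU_ACTIVE;	/* e.g. inactive file -> active file */
		/* the unevictable list is handled separately, see add_page_to_unevictable_list() */
		return lru;
	}
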
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 797c3831cbec..3353c9029cef 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = {
33}; 33};
34 34
35static struct backing_dev_info swap_backing_dev_info = { 35static struct backing_dev_info swap_backing_dev_info = {
36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 36 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
37 .unplug_io_fn = swap_unplug_io_fn, 37 .unplug_io_fn = swap_unplug_io_fn,
38}; 38};
39 39
@@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
75 BUG_ON(!PageLocked(page)); 75 BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 76 BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 77 BUG_ON(PagePrivate(page));
78 BUG_ON(!PageSwapBacked(page));
78 error = radix_tree_preload(gfp_mask); 79 error = radix_tree_preload(gfp_mask);
79 if (!error) { 80 if (!error) {
80 page_cache_get(page); 81 page_cache_get(page);
@@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
302 * re-using the just freed swap entry for an existing page. 303 * re-using the just freed swap entry for an existing page.
303 * May fail (-ENOMEM) if radix-tree node allocation failed. 304 * May fail (-ENOMEM) if radix-tree node allocation failed.
304 */ 305 */
305 set_page_locked(new_page); 306 __set_page_locked(new_page);
307 SetPageSwapBacked(new_page);
306 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); 308 err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
307 if (likely(!err)) { 309 if (likely(!err)) {
308 /* 310 /*
309 * Initiate read into locked page and return. 311 * Initiate read into locked page and return.
310 */ 312 */
311 lru_cache_add_active(new_page); 313 lru_cache_add_anon(new_page);
312 swap_readpage(NULL, new_page); 314 swap_readpage(NULL, new_page);
313 return new_page; 315 return new_page;
314 } 316 }
315 clear_page_locked(new_page); 317 ClearPageSwapBacked(new_page);
318 __clear_page_locked(new_page);
316 swap_free(entry); 319 swap_free(entry);
317 } while (err != -ENOMEM); 320 } while (err != -ENOMEM);
318 321
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1e330f2998fa..90cb67a5417c 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page)
344 * Work out if there are any other processes sharing this 344 * Work out if there are any other processes sharing this
345 * swap cache page. Free it if you can. Return success. 345 * swap cache page. Free it if you can. Return success.
346 */ 346 */
347int remove_exclusive_swap_page(struct page *page) 347static int remove_exclusive_swap_page_count(struct page *page, int count)
348{ 348{
349 int retval; 349 int retval;
350 struct swap_info_struct * p; 350 struct swap_info_struct * p;
@@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page)
357 return 0; 357 return 0;
358 if (PageWriteback(page)) 358 if (PageWriteback(page))
359 return 0; 359 return 0;
360 if (page_count(page) != 2) /* 2: us + cache */ 360 if (page_count(page) != count) /* us + cache + ptes */
361 return 0; 361 return 0;
362 362
363 entry.val = page_private(page); 363 entry.val = page_private(page);
@@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page)
370 if (p->swap_map[swp_offset(entry)] == 1) { 370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */ 371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock); 372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == 2) && !PageWriteback(page)) { 373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page); 374 __delete_from_swap_cache(page);
375 SetPageDirty(page); 375 SetPageDirty(page);
376 retval = 1; 376 retval = 1;
@@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page)
388} 388}
389 389
390/* 390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407}
408
409/*
391 * Free the swap entry like above, but also try to 410 * Free the swap entry like above, but also try to
392 * free the page cache entry if it is the last user. 411 * free the page cache entry if it is the last user.
393 */ 412 */
@@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry)
403 if (p) { 422 if (p) {
404 if (swap_entry_free(p, swp_offset(entry)) == 1) { 423 if (swap_entry_free(p, swp_offset(entry)) == 1) {
405 page = find_get_page(&swapper_space, entry.val); 424 page = find_get_page(&swapper_space, entry.val);
406 if (page && unlikely(!trylock_page(page))) { 425 if (page && !trylock_page(page)) {
407 page_cache_release(page); 426 page_cache_release(page);
408 page = NULL; 427 page = NULL;
409 } 428 }
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
index ae532f501943..3e67d575ee6e 100644
--- a/mm/tiny-shmem.c
+++ b/mm/tiny-shmem.c
@@ -65,36 +65,37 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
65 if (!dentry) 65 if (!dentry)
66 goto put_memory; 66 goto put_memory;
67 67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
68 error = -ENOSPC; 73 error = -ENOSPC;
69 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0); 74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
70 if (!inode) 75 if (!inode)
71 goto put_dentry; 76 goto close_file;
72 77
73 d_instantiate(dentry, inode); 78 d_instantiate(dentry, inode);
74 error = -ENFILE; 79 inode->i_size = size;
75 file = alloc_file(shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
76 &ramfs_file_operations);
77 if (!file)
78 goto put_dentry;
79
80 inode->i_nlink = 0; /* It is unlinked */ 80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
81 83
82 /* notify everyone as to the change of file size */ 84#ifndef CONFIG_MMU
83 error = do_truncate(dentry, size, 0, file); 85 error = ramfs_nommu_expand_for_mapping(inode, size);
84 if (error < 0) 86 if (error)
85 goto close_file; 87 goto close_file;
86 88#endif
87 return file; 89 return file;
88 90
89close_file: 91close_file:
90 put_filp(file); 92 put_filp(file);
91 return ERR_PTR(error);
92
93put_dentry: 93put_dentry:
94 dput(dentry); 94 dput(dentry);
95put_memory: 95put_memory:
96 return ERR_PTR(error); 96 return ERR_PTR(error);
97} 97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
98 99
99/** 100/**
100 * shmem_zero_setup - setup a shared anonymous mapping 101 * shmem_zero_setup - setup a shared anonymous mapping
diff --git a/mm/truncate.c b/mm/truncate.c
index 6650c1d878b4..1229211104f8 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (C) 2002, Linus Torvalds 4 * Copyright (C) 2002, Linus Torvalds
5 * 5 *
6 * 10Sep2002 akpm@zip.com.au 6 * 10Sep2002 Andrew Morton
7 * Initial version. 7 * Initial version.
8 */ 8 */
9 9
@@ -18,6 +18,7 @@
18#include <linux/task_io_accounting_ops.h> 18#include <linux/task_io_accounting_ops.h>
19#include <linux/buffer_head.h> /* grr. try_to_release_page, 19#include <linux/buffer_head.h> /* grr. try_to_release_page,
20 do_invalidatepage */ 20 do_invalidatepage */
21#include "internal.h"
21 22
22 23
23/** 24/**
@@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
103 104
104 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
105 106
107 clear_page_mlock(page);
106 remove_from_page_cache(page); 108 remove_from_page_cache(page);
107 ClearPageMappedToDisk(page); 109 ClearPageMappedToDisk(page);
108 page_cache_release(page); /* pagecache ref */ 110 page_cache_release(page); /* pagecache ref */
@@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
127 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (PagePrivate(page) && !try_to_release_page(page, 0))
128 return 0; 130 return 0;
129 131
132 clear_page_mlock(page);
130 ret = remove_mapping(mapping, page); 133 ret = remove_mapping(mapping, page);
131 134
132 return ret; 135 return ret;
@@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
352 if (PageDirty(page)) 355 if (PageDirty(page))
353 goto failed; 356 goto failed;
354 357
358 clear_page_mlock(page);
355 BUG_ON(PagePrivate(page)); 359 BUG_ON(PagePrivate(page));
356 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
357 spin_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
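
All three truncate/invalidate paths above now call clear_page_mlock() immediately before the page leaves the page cache. The invariant as a sketch (clear_page_mlock and remove_from_page_cache as used in the hunks; the wrapper itself is illustrative, and the comment is a hedged reading of the change):

	static void drop_from_page_cache_sketch(struct page *page)
	{
		/* drop PG_mlocked first, so a truncated page is not freed while still marked mlocked */
		clear_page_mlock(page);
		remove_from_page_cache(page);
	}
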
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 85b9a0d2c877..30f826d484f0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,26 +8,28 @@
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11#include <linux/vmalloc.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/highmem.h> 14#include <linux/highmem.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/spinlock.h> 16#include <linux/spinlock.h>
16#include <linux/interrupt.h> 17#include <linux/interrupt.h>
18#include <linux/proc_fs.h>
17#include <linux/seq_file.h> 19#include <linux/seq_file.h>
18#include <linux/debugobjects.h> 20#include <linux/debugobjects.h>
19#include <linux/vmalloc.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
22#include <linux/list.h>
23#include <linux/rbtree.h>
24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h>
21 26
27#include <asm/atomic.h>
22#include <asm/uaccess.h> 28#include <asm/uaccess.h>
23#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
24 30
25 31
26DEFINE_RWLOCK(vmlist_lock); 32/*** Page table manipulation functions ***/
27struct vm_struct *vmlist;
28
29static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
30 int node, void *caller);
31 33
32static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
33{ 35{
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
40 } while (pte++, addr += PAGE_SIZE, addr != end); 42 } while (pte++, addr += PAGE_SIZE, addr != end);
41} 43}
42 44
43static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, 45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
44 unsigned long end)
45{ 46{
46 pmd_t *pmd; 47 pmd_t *pmd;
47 unsigned long next; 48 unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
55 } while (pmd++, addr = next, addr != end); 56 } while (pmd++, addr = next, addr != end);
56} 57}
57 58
58static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, 59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 unsigned long end)
60{ 60{
61 pud_t *pud; 61 pud_t *pud;
62 unsigned long next; 62 unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
70 } while (pud++, addr = next, addr != end); 70 } while (pud++, addr = next, addr != end);
71} 71}
72 72
73void unmap_kernel_range(unsigned long addr, unsigned long size) 73static void vunmap_page_range(unsigned long addr, unsigned long end)
74{ 74{
75 pgd_t *pgd; 75 pgd_t *pgd;
76 unsigned long next; 76 unsigned long next;
77 unsigned long start = addr;
78 unsigned long end = addr + size;
79 77
80 BUG_ON(addr >= end); 78 BUG_ON(addr >= end);
81 pgd = pgd_offset_k(addr); 79 pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
86 continue; 84 continue;
87 vunmap_pud_range(pgd, addr, next); 85 vunmap_pud_range(pgd, addr, next);
88 } while (pgd++, addr = next, addr != end); 86 } while (pgd++, addr = next, addr != end);
89 flush_tlb_kernel_range(start, end);
90}
91
92static void unmap_vm_area(struct vm_struct *area)
93{
94 unmap_kernel_range((unsigned long)area->addr, area->size);
95} 87}
96 88
97static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 89static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
98 unsigned long end, pgprot_t prot, struct page ***pages) 90 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
99{ 91{
100 pte_t *pte; 92 pte_t *pte;
101 93
94 /*
95 * nr is a running index into the array which helps higher level
96 * callers keep track of where we're up to.
97 */
98
102 pte = pte_alloc_kernel(pmd, addr); 99 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 100 if (!pte)
104 return -ENOMEM; 101 return -ENOMEM;
105 do { 102 do {
106 struct page *page = **pages; 103 struct page *page = pages[*nr];
107 WARN_ON(!pte_none(*pte)); 104
108 if (!page) 105 if (WARN_ON(!pte_none(*pte)))
106 return -EBUSY;
107 if (WARN_ON(!page))
109 return -ENOMEM; 108 return -ENOMEM;
110 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 109 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
111 (*pages)++; 110 (*nr)++;
112 } while (pte++, addr += PAGE_SIZE, addr != end); 111 } while (pte++, addr += PAGE_SIZE, addr != end);
113 return 0; 112 return 0;
114} 113}
115 114
116static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, 115static int vmap_pmd_range(pud_t *pud, unsigned long addr,
117 unsigned long end, pgprot_t prot, struct page ***pages) 116 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
118{ 117{
119 pmd_t *pmd; 118 pmd_t *pmd;
120 unsigned long next; 119 unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
124 return -ENOMEM; 123 return -ENOMEM;
125 do { 124 do {
126 next = pmd_addr_end(addr, end); 125 next = pmd_addr_end(addr, end);
127 if (vmap_pte_range(pmd, addr, next, prot, pages)) 126 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
128 return -ENOMEM; 127 return -ENOMEM;
129 } while (pmd++, addr = next, addr != end); 128 } while (pmd++, addr = next, addr != end);
130 return 0; 129 return 0;
131} 130}
132 131
133static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, 132static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
134 unsigned long end, pgprot_t prot, struct page ***pages) 133 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
135{ 134{
136 pud_t *pud; 135 pud_t *pud;
137 unsigned long next; 136 unsigned long next;
@@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
141 return -ENOMEM; 140 return -ENOMEM;
142 do { 141 do {
143 next = pud_addr_end(addr, end); 142 next = pud_addr_end(addr, end);
144 if (vmap_pmd_range(pud, addr, next, prot, pages)) 143 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
145 return -ENOMEM; 144 return -ENOMEM;
146 } while (pud++, addr = next, addr != end); 145 } while (pud++, addr = next, addr != end);
147 return 0; 146 return 0;
148} 147}
149 148
150int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 149/*
150 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
151 * will have pfns corresponding to the "pages" array.
152 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */
155static int vmap_page_range(unsigned long addr, unsigned long end,
156 pgprot_t prot, struct page **pages)
151{ 157{
152 pgd_t *pgd; 158 pgd_t *pgd;
153 unsigned long next; 159 unsigned long next;
154 unsigned long addr = (unsigned long) area->addr; 160 int err = 0;
155 unsigned long end = addr + area->size - PAGE_SIZE; 161 int nr = 0;
156 int err;
157 162
158 BUG_ON(addr >= end); 163 BUG_ON(addr >= end);
159 pgd = pgd_offset_k(addr); 164 pgd = pgd_offset_k(addr);
160 do { 165 do {
161 next = pgd_addr_end(addr, end); 166 next = pgd_addr_end(addr, end);
162 err = vmap_pud_range(pgd, addr, next, prot, pages); 167 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
163 if (err) 168 if (err)
164 break; 169 break;
165 } while (pgd++, addr = next, addr != end); 170 } while (pgd++, addr = next, addr != end);
166 flush_cache_vmap((unsigned long) area->addr, end); 171 flush_cache_vmap(addr, end);
167 return err; 172
173 if (unlikely(err))
174 return err;
175 return nr;
176}
177
178static inline int is_vmalloc_or_module_addr(const void *x)
179{
180 /*
181 * ARM, x86-64 and sparc64 put modules in a special place,
182 * and fall back on vmalloc() if that fails. Others
183 * just put it in the vmalloc space.
184 */
185#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
186 unsigned long addr = (unsigned long)x;
187 if (addr >= MODULES_VADDR && addr < MODULES_END)
188 return 1;
189#endif
190 return is_vmalloc_addr(x);
168} 191}
169EXPORT_SYMBOL_GPL(map_vm_area);
170 192
171/* 193/*
172 * Map a vmalloc()-space virtual address to the physical page. 194 * Walk a vmap address to the struct page it maps.
173 */ 195 */
174struct page *vmalloc_to_page(const void *vmalloc_addr) 196struct page *vmalloc_to_page(const void *vmalloc_addr)
175{ 197{
176 unsigned long addr = (unsigned long) vmalloc_addr; 198 unsigned long addr = (unsigned long) vmalloc_addr;
177 struct page *page = NULL; 199 struct page *page = NULL;
178 pgd_t *pgd = pgd_offset_k(addr); 200 pgd_t *pgd = pgd_offset_k(addr);
179 pud_t *pud; 201
180 pmd_t *pmd; 202 /*
181 pte_t *ptep, pte; 203 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
204 * architectures that do not vmalloc module space
205 */
206 VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));
182 207
183 if (!pgd_none(*pgd)) { 208 if (!pgd_none(*pgd)) {
184 pud = pud_offset(pgd, addr); 209 pud_t *pud = pud_offset(pgd, addr);
185 if (!pud_none(*pud)) { 210 if (!pud_none(*pud)) {
186 pmd = pmd_offset(pud, addr); 211 pmd_t *pmd = pmd_offset(pud, addr);
187 if (!pmd_none(*pmd)) { 212 if (!pmd_none(*pmd)) {
213 pte_t *ptep, pte;
214
188 ptep = pte_offset_map(pmd, addr); 215 ptep = pte_offset_map(pmd, addr);
189 pte = *ptep; 216 pte = *ptep;
190 if (pte_present(pte)) 217 if (pte_present(pte))
@@ -206,13 +233,770 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
206} 233}
207EXPORT_SYMBOL(vmalloc_to_pfn); 234EXPORT_SYMBOL(vmalloc_to_pfn);
208 235
209static struct vm_struct * 236
210__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, 237/*** Global kva allocator ***/
211 unsigned long end, int node, gfp_t gfp_mask, void *caller) 238
239#define VM_LAZY_FREE 0x01
240#define VM_LAZY_FREEING 0x02
241#define VM_VM_AREA 0x04
242
243struct vmap_area {
244 unsigned long va_start;
245 unsigned long va_end;
246 unsigned long flags;
247 struct rb_node rb_node; /* address sorted rbtree */
248 struct list_head list; /* address sorted list */
249 struct list_head purge_list; /* "lazy purge" list */
250 void *private;
251 struct rcu_head rcu_head;
252};
253
254static DEFINE_SPINLOCK(vmap_area_lock);
255static struct rb_root vmap_area_root = RB_ROOT;
256static LIST_HEAD(vmap_area_list);
257
258static struct vmap_area *__find_vmap_area(unsigned long addr)
212{ 259{
213 struct vm_struct **p, *tmp, *area; 260 struct rb_node *n = vmap_area_root.rb_node;
214 unsigned long align = 1; 261
262 while (n) {
263 struct vmap_area *va;
264
265 va = rb_entry(n, struct vmap_area, rb_node);
266 if (addr < va->va_start)
267 n = n->rb_left;
268 else if (addr > va->va_start)
269 n = n->rb_right;
270 else
271 return va;
272 }
273
274 return NULL;
275}
276
277static void __insert_vmap_area(struct vmap_area *va)
278{
279 struct rb_node **p = &vmap_area_root.rb_node;
280 struct rb_node *parent = NULL;
281 struct rb_node *tmp;
282
283 while (*p) {
284 struct vmap_area *tmp;
285
286 parent = *p;
287 tmp = rb_entry(parent, struct vmap_area, rb_node);
288 if (va->va_start < tmp->va_end)
289 p = &(*p)->rb_left;
290 else if (va->va_end > tmp->va_start)
291 p = &(*p)->rb_right;
292 else
293 BUG();
294 }
295
296 rb_link_node(&va->rb_node, parent, p);
297 rb_insert_color(&va->rb_node, &vmap_area_root);
298
299 /* address-sort this list so it is usable like the vmlist */
300 tmp = rb_prev(&va->rb_node);
301 if (tmp) {
302 struct vmap_area *prev;
303 prev = rb_entry(tmp, struct vmap_area, rb_node);
304 list_add_rcu(&va->list, &prev->list);
305 } else
306 list_add_rcu(&va->list, &vmap_area_list);
307}
308
309static void purge_vmap_area_lazy(void);
310
311/*
312 * Allocate a region of KVA of the specified size and alignment, within the
313 * vstart and vend.
314 */
315static struct vmap_area *alloc_vmap_area(unsigned long size,
316 unsigned long align,
317 unsigned long vstart, unsigned long vend,
318 int node, gfp_t gfp_mask)
319{
320 struct vmap_area *va;
321 struct rb_node *n;
322 unsigned long addr;
323 int purged = 0;
324
325 BUG_ON(size & ~PAGE_MASK);
326
327 va = kmalloc_node(sizeof(struct vmap_area),
328 gfp_mask & GFP_RECLAIM_MASK, node);
329 if (unlikely(!va))
330 return ERR_PTR(-ENOMEM);
331
332retry:
333 addr = ALIGN(vstart, align);
334
335 spin_lock(&vmap_area_lock);
336 /* XXX: could have a last_hole cache */
337 n = vmap_area_root.rb_node;
338 if (n) {
339 struct vmap_area *first = NULL;
340
341 do {
342 struct vmap_area *tmp;
343 tmp = rb_entry(n, struct vmap_area, rb_node);
344 if (tmp->va_end >= addr) {
345 if (!first && tmp->va_start < addr + size)
346 first = tmp;
347 n = n->rb_left;
348 } else {
349 first = tmp;
350 n = n->rb_right;
351 }
352 } while (n);
353
354 if (!first)
355 goto found;
356
357 if (first->va_end < addr) {
358 n = rb_next(&first->rb_node);
359 if (n)
360 first = rb_entry(n, struct vmap_area, rb_node);
361 else
362 goto found;
363 }
364
365 while (addr + size > first->va_start && addr + size <= vend) {
366 addr = ALIGN(first->va_end + PAGE_SIZE, align);
367
368 n = rb_next(&first->rb_node);
369 if (n)
370 first = rb_entry(n, struct vmap_area, rb_node);
371 else
372 goto found;
373 }
374 }
375found:
376 if (addr + size > vend) {
377 spin_unlock(&vmap_area_lock);
378 if (!purged) {
379 purge_vmap_area_lazy();
380 purged = 1;
381 goto retry;
382 }
383 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: "
385 "use vmalloc=<size> to increase size.\n");
386 return ERR_PTR(-EBUSY);
387 }
388
389 BUG_ON(addr & (align-1));
390
391 va->va_start = addr;
392 va->va_end = addr + size;
393 va->flags = 0;
394 __insert_vmap_area(va);
395 spin_unlock(&vmap_area_lock);
396
397 return va;
398}
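The search above walks the address-sorted tree for the first hole at or above the aligned start address that can hold the request, bumping the candidate address past every busy area it collides with. As a rough illustration only, here is a small userspace model of that first-fit walk, with a plain sorted array standing in for the rbtree; the guard-page padding, locking and the purge-and-retry path are left out, and all addresses below are invented.

/* Simplified first-fit model of alloc_vmap_area(); not the kernel code. */
#include <stdio.h>

struct area { unsigned long start, end; };      /* a busy region, [start, end) */

static unsigned long first_fit(const struct area *areas, int n,
                               unsigned long vstart, unsigned long vend,
                               unsigned long size, unsigned long align)
{
        unsigned long addr = (vstart + align - 1) & ~(align - 1);
        int i;

        for (i = 0; i < n; i++) {
                if (areas[i].end <= addr)
                        continue;               /* entirely below the candidate */
                if (addr + size <= areas[i].start)
                        break;                  /* the hole before this area fits */
                /* collision: bump the candidate past this busy area */
                addr = (areas[i].end + align - 1) & ~(align - 1);
        }
        if (addr + size > vend)
                return 0;                       /* no room: the kernel purges and retries */
        return addr;
}

int main(void)
{
        const struct area busy[] = {            /* pretend these are live vmap areas */
                { 0x10000, 0x14000 },
                { 0x14000, 0x20000 },
                { 0x30000, 0x38000 },
        };
        unsigned long got = first_fit(busy, 3, 0x10000, 0x40000, 0x8000, 0x1000);

        printf("allocated at %#lx\n", got);     /* expect 0x20000 */
        return 0;
}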
399
400static void rcu_free_va(struct rcu_head *head)
401{
402 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
403
404 kfree(va);
405}
406
407static void __free_vmap_area(struct vmap_area *va)
408{
409 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
410 rb_erase(&va->rb_node, &vmap_area_root);
411 RB_CLEAR_NODE(&va->rb_node);
412 list_del_rcu(&va->list);
413
414 call_rcu(&va->rcu_head, rcu_free_va);
415}
416
417/*
418 * Free a region of KVA allocated by alloc_vmap_area
419 */
420static void free_vmap_area(struct vmap_area *va)
421{
422 spin_lock(&vmap_area_lock);
423 __free_vmap_area(va);
424 spin_unlock(&vmap_area_lock);
425}
426
427/*
428 * Clear the pagetable entries of a given vmap_area
429 */
430static void unmap_vmap_area(struct vmap_area *va)
431{
432 vunmap_page_range(va->va_start, va->va_end);
433}
434
435/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush.
438 *
439 * There is a tradeoff here: a larger number will cover more kernel page tables
440 * and take slightly longer to purge, but it will linearly reduce the number of
441 * global TLB flushes that must be performed. It would seem natural to scale
442 * this number up linearly with the number of CPUs (because vmapping activity
443 * could also scale linearly with the number of CPUs), however it is likely
444 * that in practice, workloads might be constrained in other ways that mean
445 * vmap activity will not scale linearly with CPUs. Also, I want to be
446 * conservative and not introduce a big latency on huge systems, so go with
447 * a less aggressive log scale. It will still be an improvement over the old
448 * code, and it will be simple to change the scale factor if we find that it
449 * becomes a problem on bigger systems.
450 */
451static unsigned long lazy_max_pages(void)
452{
453 unsigned int log;
454
455 log = fls(num_online_cpus());
456
457 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
458}
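To put the log scale into numbers, a quick standalone calculation, assuming 4 KiB pages (the figure obviously shifts on architectures with larger pages): every step of fls(num_online_cpus()) buys another 32 MiB of lazily freed vmap space before a purge is forced.

/* Back-of-the-envelope check of the lazy_max_pages() scaling, 4 KiB pages assumed. */
#include <stdio.h>

static int fls_approx(unsigned int x)           /* highest set bit, 1-based; 0 for x == 0 */
{
        int bit = 0;

        while (x) {
                bit++;
                x >>= 1;
        }
        return bit;
}

int main(void)
{
        const unsigned long page_size = 4096;   /* assumption: 4 KiB pages */
        unsigned int cpus;

        for (cpus = 1; cpus <= 64; cpus *= 2) {
                unsigned long pages = fls_approx(cpus) * (32UL * 1024 * 1024 / page_size);

                printf("%2u CPUs -> purge threshold %6lu pages (%3lu MiB of lazy vmap)\n",
                       cpus, pages, pages * page_size >> 20);
        }
        return 0;
}

Going from 1 CPU to 64 CPUs therefore only raises the threshold from 32 MiB to 224 MiB, which is the conservative growth the comment above is arguing for.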
459
460static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
461
462/*
463 * Purges all lazily-freed vmap areas.
464 *
465 * If sync is 0 then don't purge if there is already a purge in progress.
466 * If force_flush is 1, then flush kernel TLBs between *start and *end even
467 * if we found no lazy vmap areas to unmap (callers can use this to optimise
468 * their own TLB flushing).
469 * Returns with *start = min(*start, lowest purged address)
470 * *end = max(*end, highest purged address)
471 */
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush)
474{
475 static DEFINE_SPINLOCK(purge_lock);
476 LIST_HEAD(valist);
477 struct vmap_area *va;
478 int nr = 0;
479
480 /*
481 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
482 * should not expect such behaviour. This just simplifies locking for
483 * the case that isn't actually used at the moment anyway.
484 */
485 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock))
487 return;
488 } else
489 spin_lock(&purge_lock);
490
491 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) {
493 if (va->flags & VM_LAZY_FREE) {
494 if (va->va_start < *start)
495 *start = va->va_start;
496 if (va->va_end > *end)
497 *end = va->va_end;
498 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
499 unmap_vmap_area(va);
500 list_add_tail(&va->purge_list, &valist);
501 va->flags |= VM_LAZY_FREEING;
502 va->flags &= ~VM_LAZY_FREE;
503 }
504 }
505 rcu_read_unlock();
506
507 if (nr) {
508 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
509 atomic_sub(nr, &vmap_lazy_nr);
510 }
511
512 if (nr || force_flush)
513 flush_tlb_kernel_range(*start, *end);
514
515 if (nr) {
516 spin_lock(&vmap_area_lock);
517 list_for_each_entry(va, &valist, purge_list)
518 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock);
520 }
521 spin_unlock(&purge_lock);
522}
523
524/*
525 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
526 * is already purging.
527 */
528static void try_purge_vmap_area_lazy(void)
529{
530 unsigned long start = ULONG_MAX, end = 0;
531
532 __purge_vmap_area_lazy(&start, &end, 0, 0);
533}
534
535/*
536 * Kick off a purge of the outstanding lazy areas.
537 */
538static void purge_vmap_area_lazy(void)
539{
540 unsigned long start = ULONG_MAX, end = 0;
541
542 __purge_vmap_area_lazy(&start, &end, 1, 0);
543}
544
545/*
546 * Free and unmap a vmap area
547 */
548static void free_unmap_vmap_area(struct vmap_area *va)
549{
550 va->flags |= VM_LAZY_FREE;
551 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
552 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
553 try_purge_vmap_area_lazy();
554}
555
556static struct vmap_area *find_vmap_area(unsigned long addr)
557{
558 struct vmap_area *va;
559
560 spin_lock(&vmap_area_lock);
561 va = __find_vmap_area(addr);
562 spin_unlock(&vmap_area_lock);
563
564 return va;
565}
566
567static void free_unmap_vmap_area_addr(unsigned long addr)
568{
569 struct vmap_area *va;
570
571 va = find_vmap_area(addr);
572 BUG_ON(!va);
573 free_unmap_vmap_area(va);
574}
575
576
577/*** Per cpu kva allocator ***/
578
579/*
580 * vmap space is limited especially on 32 bit architectures. Ensure there is
581 * room for at least 16 percpu vmap blocks per CPU.
582 */
583/*
584 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
585 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
586 * instead (we just need a rough idea)
587 */
588#if BITS_PER_LONG == 32
589#define VMALLOC_SPACE (128UL*1024*1024)
590#else
591#define VMALLOC_SPACE (128UL*1024*1024*1024)
592#endif
593
594#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
595#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
596#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
597#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
598#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
599#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
600#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
601 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
602 VMALLOC_PAGES / NR_CPUS / 16))
603
604#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
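The clamping is easier to see with numbers. Here is a standalone sketch of the same arithmetic, assuming the 32-bit 128 MiB guess for VMALLOC_SPACE and 4 KiB pages; the kernel evaluates all of this at compile time against the configured NR_CPUS, so the figures are only illustrative.

/* Sketch of the VMAP_BBMAP_BITS sizing, assuming 4 KiB pages and BITS_PER_LONG == 32. */
#include <stdio.h>

#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y))
#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y))

int main(void)
{
        const unsigned long page_size = 4096;                    /* assumption */
        const unsigned long vmalloc_space = 128UL * 1024 * 1024; /* 32-bit guess */
        const unsigned long vmalloc_pages = vmalloc_space / page_size;
        const unsigned long bits_per_long = 32;
        unsigned long nr_cpus;

        for (nr_cpus = 1; nr_cpus <= 64; nr_cpus *= 4) {
                unsigned long bits = VMAP_MIN(1024UL,            /* VMAP_BBMAP_BITS_MAX */
                                VMAP_MAX(bits_per_long * 2,      /* VMAP_BBMAP_BITS_MIN */
                                         vmalloc_pages / nr_cpus / 16));

                printf("NR_CPUS=%2lu -> %4lu pages per block (block size %4lu KiB)\n",
                       nr_cpus, bits, bits * page_size / 1024);
        }
        return 0;
}

So a small 32-bit machine gets the full 1024-page (4 MiB) blocks, while a 64-CPU configuration is clamped at the 64-page VMAP_BBMAP_BITS_MIN floor.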
605
606static bool vmap_initialized __read_mostly = false;
607
608struct vmap_block_queue {
609 spinlock_t lock;
610 struct list_head free;
611 struct list_head dirty;
612 unsigned int nr_dirty;
613};
614
615struct vmap_block {
616 spinlock_t lock;
617 struct vmap_area *va;
618 struct vmap_block_queue *vbq;
619 unsigned long free, dirty;
620 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
621 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
622 union {
623 struct {
624 struct list_head free_list;
625 struct list_head dirty_list;
626 };
627 struct rcu_head rcu_head;
628 };
629};
630
631/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
632static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
633
634/*
635 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
636 * in the free path. Could get rid of this if we change the API to return a
637 * "cookie" from alloc, to be passed to free. But no big deal yet.
638 */
639static DEFINE_SPINLOCK(vmap_block_tree_lock);
640static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
641
642/*
643 * We should probably have a fallback mechanism to allocate virtual memory
644 * out of partially filled vmap blocks. However vmap block sizing should be
645 * fairly reasonable according to the vmalloc size, so it shouldn't be a
646 * big problem.
647 */
648
649static unsigned long addr_to_vb_idx(unsigned long addr)
650{
651 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
652 addr /= VMAP_BLOCK_SIZE;
653 return addr;
654}
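For a concrete (made-up) example of the index calculation: with VMALLOC_START at a block-aligned 0xf8000000 and the 2 MiB block size worked out above, an address such as 0xf8604000 becomes (0xf8604000 - 0xf8000000) / 0x200000 = 3, so the containing vmap block is found at radix-tree index 3.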
655
656static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
657{
658 struct vmap_block_queue *vbq;
659 struct vmap_block *vb;
660 struct vmap_area *va;
661 unsigned long vb_idx;
662 int node, err;
663
664 node = numa_node_id();
665
666 vb = kmalloc_node(sizeof(struct vmap_block),
667 gfp_mask & GFP_RECLAIM_MASK, node);
668 if (unlikely(!vb))
669 return ERR_PTR(-ENOMEM);
670
671 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
672 VMALLOC_START, VMALLOC_END,
673 node, gfp_mask);
674 if (unlikely(IS_ERR(va))) {
675 kfree(vb);
676 return ERR_PTR(PTR_ERR(va));
677 }
678
679 err = radix_tree_preload(gfp_mask);
680 if (unlikely(err)) {
681 kfree(vb);
682 free_vmap_area(va);
683 return ERR_PTR(err);
684 }
685
686 spin_lock_init(&vb->lock);
687 vb->va = va;
688 vb->free = VMAP_BBMAP_BITS;
689 vb->dirty = 0;
690 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
691 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
692 INIT_LIST_HEAD(&vb->free_list);
693 INIT_LIST_HEAD(&vb->dirty_list);
694
695 vb_idx = addr_to_vb_idx(va->va_start);
696 spin_lock(&vmap_block_tree_lock);
697 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
698 spin_unlock(&vmap_block_tree_lock);
699 BUG_ON(err);
700 radix_tree_preload_end();
701
702 vbq = &get_cpu_var(vmap_block_queue);
703 vb->vbq = vbq;
704 spin_lock(&vbq->lock);
705 list_add(&vb->free_list, &vbq->free);
706 spin_unlock(&vbq->lock);
 707 put_cpu_var(vmap_block_queue);

708
709 return vb;
710}
711
712static void rcu_free_vb(struct rcu_head *head)
713{
714 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
715
716 kfree(vb);
717}
718
719static void free_vmap_block(struct vmap_block *vb)
720{
721 struct vmap_block *tmp;
722 unsigned long vb_idx;
723
724 spin_lock(&vb->vbq->lock);
725 if (!list_empty(&vb->free_list))
726 list_del(&vb->free_list);
727 if (!list_empty(&vb->dirty_list))
728 list_del(&vb->dirty_list);
729 spin_unlock(&vb->vbq->lock);
730
731 vb_idx = addr_to_vb_idx(vb->va->va_start);
732 spin_lock(&vmap_block_tree_lock);
733 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
734 spin_unlock(&vmap_block_tree_lock);
735 BUG_ON(tmp != vb);
736
737 free_unmap_vmap_area(vb->va);
738 call_rcu(&vb->rcu_head, rcu_free_vb);
739}
740
741static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
742{
743 struct vmap_block_queue *vbq;
744 struct vmap_block *vb;
745 unsigned long addr = 0;
746 unsigned int order;
747
748 BUG_ON(size & ~PAGE_MASK);
749 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
750 order = get_order(size);
751
752again:
753 rcu_read_lock();
754 vbq = &get_cpu_var(vmap_block_queue);
755 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
756 int i;
757
758 spin_lock(&vb->lock);
759 i = bitmap_find_free_region(vb->alloc_map,
760 VMAP_BBMAP_BITS, order);
761
762 if (i >= 0) {
763 addr = vb->va->va_start + (i << PAGE_SHIFT);
764 BUG_ON(addr_to_vb_idx(addr) !=
765 addr_to_vb_idx(vb->va->va_start));
766 vb->free -= 1UL << order;
767 if (vb->free == 0) {
768 spin_lock(&vbq->lock);
769 list_del_init(&vb->free_list);
770 spin_unlock(&vbq->lock);
771 }
772 spin_unlock(&vb->lock);
773 break;
774 }
775 spin_unlock(&vb->lock);
776 }
 777 put_cpu_var(vmap_block_queue);
778 rcu_read_unlock();
779
780 if (!addr) {
781 vb = new_vmap_block(gfp_mask);
782 if (IS_ERR(vb))
783 return vb;
784 goto again;
785 }
786
787 return (void *)addr;
788}
789
790static void vb_free(const void *addr, unsigned long size)
791{
792 unsigned long offset;
793 unsigned long vb_idx;
794 unsigned int order;
795 struct vmap_block *vb;
796
797 BUG_ON(size & ~PAGE_MASK);
798 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
799 order = get_order(size);
800
801 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
802
803 vb_idx = addr_to_vb_idx((unsigned long)addr);
804 rcu_read_lock();
805 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
806 rcu_read_unlock();
807 BUG_ON(!vb);
808
809 spin_lock(&vb->lock);
810 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
811 if (!vb->dirty) {
812 spin_lock(&vb->vbq->lock);
813 list_add(&vb->dirty_list, &vb->vbq->dirty);
814 spin_unlock(&vb->vbq->lock);
815 }
816 vb->dirty += 1UL << order;
817 if (vb->dirty == VMAP_BBMAP_BITS) {
818 BUG_ON(vb->free || !list_empty(&vb->free_list));
819 spin_unlock(&vb->lock);
820 free_vmap_block(vb);
821 } else
822 spin_unlock(&vb->lock);
823}
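The bookkeeping in vb_free() reduces to two small calculations: the byte offset of the mapping inside its block and the buddy-style order of the allocation. A standalone check with made-up numbers, re-deriving get_order() by hand and reusing the 2 MiB block size from the sizing sketch above:

/* Worked example of the vb_free() offset/order arithmetic; numbers are invented. */
#include <stdio.h>

int main(void)
{
        const unsigned long page_size = 4096;                 /* assumption */
        const unsigned long block_size = 2UL * 1024 * 1024;   /* assumption */
        unsigned long addr = 0xf8604000UL;                    /* made-up address in a block */
        unsigned long size = 4 * page_size;                   /* a 4-page vm_map_ram() mapping */
        unsigned long offset = addr & (block_size - 1);
        int order = 0;

        while ((page_size << order) < size)                   /* get_order() by hand */
                order++;

        printf("offset %#lx (page %lu), order %d -> %lu dirty bits set from bit %lu\n",
               offset, offset / page_size, order, 1UL << order, offset / page_size);
        return 0;
}

Freeing this mapping marks four bits of dirty_map starting at bit 4; once every bit in a block is dirty the whole block is handed back via free_vmap_block().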
824
825/**
826 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
827 *
828 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
829 * to amortize TLB flushing overheads. What this means is that any page you
830 * have now, may, in a former life, have been mapped into kernel virtual
831 * address by the vmap layer and so there might be some CPUs with TLB entries
832 * still referencing that page (additional to the regular 1:1 kernel mapping).
833 *
834 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
835 * be sure that none of the pages we have control over will have any aliases
836 * from the vmap layer.
837 */
838void vm_unmap_aliases(void)
839{
840 unsigned long start = ULONG_MAX, end = 0;
841 int cpu;
842 int flush = 0;
843
844 if (unlikely(!vmap_initialized))
845 return;
846
847 for_each_possible_cpu(cpu) {
848 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
849 struct vmap_block *vb;
850
851 rcu_read_lock();
852 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
853 int i;
854
855 spin_lock(&vb->lock);
856 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
857 while (i < VMAP_BBMAP_BITS) {
858 unsigned long s, e;
859 int j;
860 j = find_next_zero_bit(vb->dirty_map,
861 VMAP_BBMAP_BITS, i);
862
863 s = vb->va->va_start + (i << PAGE_SHIFT);
864 e = vb->va->va_start + (j << PAGE_SHIFT);
865 vunmap_page_range(s, e);
866 flush = 1;
867
868 if (s < start)
869 start = s;
870 if (e > end)
871 end = e;
872
873 i = j;
874 i = find_next_bit(vb->dirty_map,
875 VMAP_BBMAP_BITS, i);
876 }
877 spin_unlock(&vb->lock);
878 }
879 rcu_read_unlock();
880 }
881
882 __purge_vmap_area_lazy(&start, &end, 1, flush);
883}
884EXPORT_SYMBOL_GPL(vm_unmap_aliases);
885
886/**
887 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
888 * @mem: the pointer returned by vm_map_ram
889 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
890 */
891void vm_unmap_ram(const void *mem, unsigned int count)
892{
893 unsigned long size = count << PAGE_SHIFT;
894 unsigned long addr = (unsigned long)mem;
895
896 BUG_ON(!addr);
897 BUG_ON(addr < VMALLOC_START);
898 BUG_ON(addr > VMALLOC_END);
899 BUG_ON(addr & (PAGE_SIZE-1));
900
901 debug_check_no_locks_freed(mem, size);
902
903 if (likely(count <= VMAP_MAX_ALLOC))
904 vb_free(mem, size);
905 else
906 free_unmap_vmap_area_addr(addr);
907}
908EXPORT_SYMBOL(vm_unmap_ram);
909
910/**
911 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
912 * @pages: an array of pointers to the pages to be mapped
913 * @count: number of pages
914 * @node: prefer to allocate data structures on this node
915 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
916 *
917 * Returns: a pointer to the address that has been mapped, or %NULL on failure
918 */
919void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
920{
921 unsigned long size = count << PAGE_SHIFT;
215 unsigned long addr; 922 unsigned long addr;
923 void *mem;
924
925 if (likely(count <= VMAP_MAX_ALLOC)) {
926 mem = vb_alloc(size, GFP_KERNEL);
927 if (IS_ERR(mem))
928 return NULL;
929 addr = (unsigned long)mem;
930 } else {
931 struct vmap_area *va;
932 va = alloc_vmap_area(size, PAGE_SIZE,
933 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
934 if (IS_ERR(va))
935 return NULL;
936
937 addr = va->va_start;
938 mem = (void *)addr;
939 }
940 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
941 vm_unmap_ram(mem, count);
942 return NULL;
943 }
944 return mem;
945}
946EXPORT_SYMBOL(vm_map_ram);
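For callers the contract is simply a matched vm_map_ram()/vm_unmap_ram() pair with the same page count. A hypothetical, untested module-style sketch (not part of this patch; the demo_* names are invented and error handling is kept minimal) that maps a few scattered pages into one contiguous kernel virtual range:

/* Hypothetical vm_map_ram() usage sketch; demo_* names are made up. */
#include <linux/mm.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>

#define DEMO_NPAGES 4

static struct page *demo_pages[DEMO_NPAGES];

static void *demo_map(void)
{
        void *va;
        int i;

        for (i = 0; i < DEMO_NPAGES; i++) {
                demo_pages[i] = alloc_page(GFP_KERNEL);
                if (!demo_pages[i])
                        goto fail;
        }

        /* one contiguous kernel virtual mapping of the (possibly scattered) pages */
        va = vm_map_ram(demo_pages, DEMO_NPAGES, -1 /* no node preference */, PAGE_KERNEL);
        if (!va)
                goto fail;
        return va;

fail:
        while (i--)
                __free_page(demo_pages[i]);
        return NULL;
}

static void demo_unmap(void *va)
{
        int i;

        vm_unmap_ram(va, DEMO_NPAGES);          /* count must match the vm_map_ram() call */
        for (i = 0; i < DEMO_NPAGES; i++)
                __free_page(demo_pages[i]);
}

Requests of up to VMAP_MAX_ALLOC pages are served from the per-CPU vmap blocks; anything larger falls back to a private vmap_area, exactly as the two branches of vm_map_ram() above show.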
947
948void __init vmalloc_init(void)
949{
950 int i;
951
952 for_each_possible_cpu(i) {
953 struct vmap_block_queue *vbq;
954
955 vbq = &per_cpu(vmap_block_queue, i);
956 spin_lock_init(&vbq->lock);
957 INIT_LIST_HEAD(&vbq->free);
958 INIT_LIST_HEAD(&vbq->dirty);
959 vbq->nr_dirty = 0;
960 }
961
962 vmap_initialized = true;
963}
964
965void unmap_kernel_range(unsigned long addr, unsigned long size)
966{
967 unsigned long end = addr + size;
968 vunmap_page_range(addr, end);
969 flush_tlb_kernel_range(addr, end);
970}
971
972int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
973{
974 unsigned long addr = (unsigned long)area->addr;
975 unsigned long end = addr + area->size - PAGE_SIZE;
976 int err;
977
978 err = vmap_page_range(addr, end, prot, *pages);
979 if (err > 0) {
980 *pages += err;
981 err = 0;
982 }
983
984 return err;
985}
986EXPORT_SYMBOL_GPL(map_vm_area);
987
988/*** Old vmalloc interfaces ***/
989DEFINE_RWLOCK(vmlist_lock);
990struct vm_struct *vmlist;
991
992static struct vm_struct *__get_vm_area_node(unsigned long size,
993 unsigned long flags, unsigned long start, unsigned long end,
994 int node, gfp_t gfp_mask, void *caller)
995{
 996 struct vmap_area *va;
997 struct vm_struct *area;
998 struct vm_struct *tmp, **p;
999 unsigned long align = 1;
216 1000
217 BUG_ON(in_interrupt()); 1001 BUG_ON(in_interrupt());
218 if (flags & VM_IOREMAP) { 1002 if (flags & VM_IOREMAP) {
@@ -225,13 +1009,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
225 1009
226 align = 1ul << bit; 1010 align = 1ul << bit;
227 } 1011 }
228 addr = ALIGN(start, align); 1012
229 size = PAGE_ALIGN(size); 1013 size = PAGE_ALIGN(size);
230 if (unlikely(!size)) 1014 if (unlikely(!size))
231 return NULL; 1015 return NULL;
232 1016
233 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1017 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
234
235 if (unlikely(!area)) 1018 if (unlikely(!area))
236 return NULL; 1019 return NULL;
237 1020
@@ -240,48 +1023,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
240 */ 1023 */
241 size += PAGE_SIZE; 1024 size += PAGE_SIZE;
242 1025
243 write_lock(&vmlist_lock); 1026 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
244 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { 1027 if (IS_ERR(va)) {
245 if ((unsigned long)tmp->addr < addr) { 1028 kfree(area);
246 if((unsigned long)tmp->addr + tmp->size >= addr) 1029 return NULL;
247 addr = ALIGN(tmp->size +
248 (unsigned long)tmp->addr, align);
249 continue;
250 }
251 if ((size + addr) < addr)
252 goto out;
253 if (size + addr <= (unsigned long)tmp->addr)
254 goto found;
255 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
256 if (addr > end - size)
257 goto out;
258 } 1030 }
259 if ((size + addr) < addr)
260 goto out;
261 if (addr > end - size)
262 goto out;
263
264found:
265 area->next = *p;
266 *p = area;
267 1031
268 area->flags = flags; 1032 area->flags = flags;
269 area->addr = (void *)addr; 1033 area->addr = (void *)va->va_start;
270 area->size = size; 1034 area->size = size;
271 area->pages = NULL; 1035 area->pages = NULL;
272 area->nr_pages = 0; 1036 area->nr_pages = 0;
273 area->phys_addr = 0; 1037 area->phys_addr = 0;
274 area->caller = caller; 1038 area->caller = caller;
1039 va->private = area;
1040 va->flags |= VM_VM_AREA;
1041
1042 write_lock(&vmlist_lock);
1043 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1044 if (tmp->addr >= area->addr)
1045 break;
1046 }
1047 area->next = *p;
1048 *p = area;
275 write_unlock(&vmlist_lock); 1049 write_unlock(&vmlist_lock);
276 1050
277 return area; 1051 return area;
278
279out:
280 write_unlock(&vmlist_lock);
281 kfree(area);
282 if (printk_ratelimit())
283 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
284 return NULL;
285} 1052}
286 1053
287struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1054struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -321,39 +1088,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
321 gfp_mask, __builtin_return_address(0)); 1088 gfp_mask, __builtin_return_address(0));
322} 1089}
323 1090
324/* Caller must hold vmlist_lock */ 1091static struct vm_struct *find_vm_area(const void *addr)
325static struct vm_struct *__find_vm_area(const void *addr)
326{ 1092{
327 struct vm_struct *tmp; 1093 struct vmap_area *va;
328
329 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) {
330 if (tmp->addr == addr)
331 break;
332 }
333 1094
334 return tmp; 1095 va = find_vmap_area((unsigned long)addr);
335} 1096 if (va && va->flags & VM_VM_AREA)
336 1097 return va->private;
337/* Caller must hold vmlist_lock */
338static struct vm_struct *__remove_vm_area(const void *addr)
339{
340 struct vm_struct **p, *tmp;
341 1098
342 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
343 if (tmp->addr == addr)
344 goto found;
345 }
346 return NULL; 1099 return NULL;
347
348found:
349 unmap_vm_area(tmp);
350 *p = tmp->next;
351
352 /*
353 * Remove the guard page.
354 */
355 tmp->size -= PAGE_SIZE;
356 return tmp;
357} 1100}
358 1101
359/** 1102/**
@@ -366,11 +1109,24 @@ found:
366 */ 1109 */
367struct vm_struct *remove_vm_area(const void *addr) 1110struct vm_struct *remove_vm_area(const void *addr)
368{ 1111{
369 struct vm_struct *v; 1112 struct vmap_area *va;
370 write_lock(&vmlist_lock); 1113
371 v = __remove_vm_area(addr); 1114 va = find_vmap_area((unsigned long)addr);
372 write_unlock(&vmlist_lock); 1115 if (va && va->flags & VM_VM_AREA) {
373 return v; 1116 struct vm_struct *vm = va->private;
1117 struct vm_struct *tmp, **p;
1118 free_unmap_vmap_area(va);
1119 vm->size -= PAGE_SIZE;
1120
1121 write_lock(&vmlist_lock);
1122 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1123 ;
1124 *p = tmp->next;
1125 write_unlock(&vmlist_lock);
1126
1127 return vm;
1128 }
1129 return NULL;
374} 1130}
375 1131
376static void __vunmap(const void *addr, int deallocate_pages) 1132static void __vunmap(const void *addr, int deallocate_pages)
@@ -480,6 +1236,8 @@ void *vmap(struct page **pages, unsigned int count,
480} 1236}
481EXPORT_SYMBOL(vmap); 1237EXPORT_SYMBOL(vmap);
482 1238
1239static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1240 int node, void *caller);
483static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1241static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
484 pgprot_t prot, int node, void *caller) 1242 pgprot_t prot, int node, void *caller)
485{ 1243{
@@ -606,10 +1364,8 @@ void *vmalloc_user(unsigned long size)
606 1364
607 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1365 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
608 if (ret) { 1366 if (ret) {
609 write_lock(&vmlist_lock); 1367 area = find_vm_area(ret);
610 area = __find_vm_area(ret);
611 area->flags |= VM_USERMAP; 1368 area->flags |= VM_USERMAP;
612 write_unlock(&vmlist_lock);
613 } 1369 }
614 return ret; 1370 return ret;
615} 1371}
@@ -689,10 +1445,8 @@ void *vmalloc_32_user(unsigned long size)
689 1445
690 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1446 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
691 if (ret) { 1447 if (ret) {
692 write_lock(&vmlist_lock); 1448 area = find_vm_area(ret);
693 area = __find_vm_area(ret);
694 area->flags |= VM_USERMAP; 1449 area->flags |= VM_USERMAP;
695 write_unlock(&vmlist_lock);
696 } 1450 }
697 return ret; 1451 return ret;
698} 1452}
@@ -793,26 +1547,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
793 struct vm_struct *area; 1547 struct vm_struct *area;
794 unsigned long uaddr = vma->vm_start; 1548 unsigned long uaddr = vma->vm_start;
795 unsigned long usize = vma->vm_end - vma->vm_start; 1549 unsigned long usize = vma->vm_end - vma->vm_start;
796 int ret;
797 1550
798 if ((PAGE_SIZE-1) & (unsigned long)addr) 1551 if ((PAGE_SIZE-1) & (unsigned long)addr)
799 return -EINVAL; 1552 return -EINVAL;
800 1553
801 read_lock(&vmlist_lock); 1554 area = find_vm_area(addr);
802 area = __find_vm_area(addr);
803 if (!area) 1555 if (!area)
804 goto out_einval_locked; 1556 return -EINVAL;
805 1557
806 if (!(area->flags & VM_USERMAP)) 1558 if (!(area->flags & VM_USERMAP))
807 goto out_einval_locked; 1559 return -EINVAL;
808 1560
809 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1561 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
810 goto out_einval_locked; 1562 return -EINVAL;
811 read_unlock(&vmlist_lock);
812 1563
813 addr += pgoff << PAGE_SHIFT; 1564 addr += pgoff << PAGE_SHIFT;
814 do { 1565 do {
815 struct page *page = vmalloc_to_page(addr); 1566 struct page *page = vmalloc_to_page(addr);
1567 int ret;
1568
816 ret = vm_insert_page(vma, uaddr, page); 1569 ret = vm_insert_page(vma, uaddr, page);
817 if (ret) 1570 if (ret)
818 return ret; 1571 return ret;
@@ -825,11 +1578,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
825 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1578 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
826 vma->vm_flags |= VM_RESERVED; 1579 vma->vm_flags |= VM_RESERVED;
827 1580
828 return ret; 1581 return 0;
829
830out_einval_locked:
831 read_unlock(&vmlist_lock);
832 return -EINVAL;
833} 1582}
834EXPORT_SYMBOL(remap_vmalloc_range); 1583EXPORT_SYMBOL(remap_vmalloc_range);
835 1584
@@ -989,11 +1738,41 @@ static int s_show(struct seq_file *m, void *p)
989 return 0; 1738 return 0;
990} 1739}
991 1740
992const struct seq_operations vmalloc_op = { 1741static const struct seq_operations vmalloc_op = {
993 .start = s_start, 1742 .start = s_start,
994 .next = s_next, 1743 .next = s_next,
995 .stop = s_stop, 1744 .stop = s_stop,
996 .show = s_show, 1745 .show = s_show,
997}; 1746};
1747
1748static int vmalloc_open(struct inode *inode, struct file *file)
1749{
1750 unsigned int *ptr = NULL;
1751 int ret;
1752
1753 if (NUMA_BUILD)
1754 ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL);
1755 ret = seq_open(file, &vmalloc_op);
1756 if (!ret) {
1757 struct seq_file *m = file->private_data;
1758 m->private = ptr;
1759 } else
1760 kfree(ptr);
1761 return ret;
1762}
1763
1764static const struct file_operations proc_vmalloc_operations = {
1765 .open = vmalloc_open,
1766 .read = seq_read,
1767 .llseek = seq_lseek,
1768 .release = seq_release_private,
1769};
1770
1771static int __init proc_vmalloc_init(void)
1772{
1773 proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations);
1774 return 0;
1775}
1776module_init(proc_vmalloc_init);
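With the seq_operations made static and the /proc entry registered from mm/vmalloc.c itself, the listing of live allocations can be read from /proc/vmallocinfo (owner-only, per the S_IRUSR mode); on NUMA builds vmalloc_open() additionally hands the seq_file a scratch array of nr_node_ids counters, presumably so that s_show() can report per-node page counts for each allocation.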
998#endif 1777#endif
999 1778
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1ff1a58e7c10..7ea1440b53db 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -39,6 +39,7 @@
39#include <linux/freezer.h> 39#include <linux/freezer.h>
40#include <linux/memcontrol.h> 40#include <linux/memcontrol.h>
41#include <linux/delayacct.h> 41#include <linux/delayacct.h>
42#include <linux/sysctl.h>
42 43
43#include <asm/tlbflush.h> 44#include <asm/tlbflush.h>
44#include <asm/div64.h> 45#include <asm/div64.h>
@@ -78,7 +79,7 @@ struct scan_control {
78 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 79 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
79 unsigned long *scanned, int order, int mode, 80 unsigned long *scanned, int order, int mode,
80 struct zone *z, struct mem_cgroup *mem_cont, 81 struct zone *z, struct mem_cgroup *mem_cont,
81 int active); 82 int active, int file);
82}; 83};
83 84
84#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) 85#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page)
470 return 0; 471 return 0;
471} 472}
472 473
474/**
475 * putback_lru_page - put previously isolated page onto appropriate LRU list
476 * @page: page to be put back to appropriate lru list
477 *
478 * Add previously isolated @page to appropriate LRU list.
479 * Page may still be unevictable for other reasons.
480 *
481 * lru_lock must not be held, interrupts must be enabled.
482 */
483#ifdef CONFIG_UNEVICTABLE_LRU
484void putback_lru_page(struct page *page)
485{
486 int lru;
487 int active = !!TestClearPageActive(page);
488 int was_unevictable = PageUnevictable(page);
489
490 VM_BUG_ON(PageLRU(page));
491
492redo:
493 ClearPageUnevictable(page);
494
495 if (page_evictable(page, NULL)) {
496 /*
497 * For evictable pages, we can use the cache.
498 * In event of a race, worst case is we end up with an
499 * unevictable page on [in]active list.
500 * We know how to handle that.
501 */
502 lru = active + page_is_file_cache(page);
503 lru_cache_add_lru(page, lru);
504 } else {
505 /*
506 * Put unevictable pages directly on zone's unevictable
507 * list.
508 */
509 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page);
511 }
512 mem_cgroup_move_lists(page, lru);
513
514 /*
 515 * The page's status can change while we move it among the LRU lists. If an
 516 * evictable page ends up on the unevictable list, it will never be freed.
 517 * To avoid that, check again after adding it to the list.
518 */
519 if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) {
520 if (!isolate_lru_page(page)) {
521 put_page(page);
522 goto redo;
523 }
 524 /* This means someone else dropped this page from the LRU;
 525 * it will be freed or put back on the LRU again. There is
526 * nothing to do here.
527 */
528 }
529
530 if (was_unevictable && lru != LRU_UNEVICTABLE)
531 count_vm_event(UNEVICTABLE_PGRESCUED);
532 else if (!was_unevictable && lru == LRU_UNEVICTABLE)
533 count_vm_event(UNEVICTABLE_PGCULLED);
534
535 put_page(page); /* drop ref from isolate */
536}
537
538#else /* CONFIG_UNEVICTABLE_LRU */
539
540void putback_lru_page(struct page *page)
541{
542 int lru;
543 VM_BUG_ON(PageLRU(page));
544
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page);
549}
550#endif /* CONFIG_UNEVICTABLE_LRU */
551
552
473/* 553/*
474 * shrink_page_list() returns the number of reclaimed pages 554 * shrink_page_list() returns the number of reclaimed pages
475 */ 555 */
@@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list,
503 583
504 sc->nr_scanned++; 584 sc->nr_scanned++;
505 585
586 if (unlikely(!page_evictable(page, NULL)))
587 goto cull_mlocked;
588
506 if (!sc->may_swap && page_mapped(page)) 589 if (!sc->may_swap && page_mapped(page))
507 goto keep_locked; 590 goto keep_locked;
508 591
@@ -539,9 +622,22 @@ static unsigned long shrink_page_list(struct list_head *page_list,
539 * Anonymous process memory has backing store? 622 * Anonymous process memory has backing store?
540 * Try to allocate it some swap space here. 623 * Try to allocate it some swap space here.
541 */ 624 */
542 if (PageAnon(page) && !PageSwapCache(page)) 625 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked;
628 switch (try_to_munlock(page)) {
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
543 if (!add_to_swap(page, GFP_ATOMIC)) 637 if (!add_to_swap(page, GFP_ATOMIC))
544 goto activate_locked; 638 goto activate_locked;
639 may_enter_fs = 1;
640 }
545#endif /* CONFIG_SWAP */ 641#endif /* CONFIG_SWAP */
546 642
547 mapping = page_mapping(page); 643 mapping = page_mapping(page);
@@ -556,6 +652,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
556 goto activate_locked; 652 goto activate_locked;
557 case SWAP_AGAIN: 653 case SWAP_AGAIN:
558 goto keep_locked; 654 goto keep_locked;
655 case SWAP_MLOCK:
656 goto cull_mlocked;
559 case SWAP_SUCCESS: 657 case SWAP_SUCCESS:
560 ; /* try to free the page below */ 658 ; /* try to free the page below */
561 } 659 }
@@ -602,7 +700,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
602 * possible for a page to have PageDirty set, but it is actually 700 * possible for a page to have PageDirty set, but it is actually
603 * clean (all its buffers are clean). This happens if the 701 * clean (all its buffers are clean). This happens if the
604 * buffers were written out directly, with submit_bh(). ext3 702 * buffers were written out directly, with submit_bh(). ext3
605 * will do this, as well as the blockdev mapping. 703 * will do this, as well as the blockdev mapping.
606 * try_to_release_page() will discover that cleanness and will 704 * try_to_release_page() will discover that cleanness and will
607 * drop the buffers and mark the page clean - it can be freed. 705 * drop the buffers and mark the page clean - it can be freed.
608 * 706 *
@@ -637,7 +735,14 @@ static unsigned long shrink_page_list(struct list_head *page_list,
637 if (!mapping || !__remove_mapping(mapping, page)) 735 if (!mapping || !__remove_mapping(mapping, page))
638 goto keep_locked; 736 goto keep_locked;
639 737
640 unlock_page(page); 738 /*
739 * At this point, we have no other references and there is
740 * no way to pick any more up (removed from LRU, removed
741 * from pagecache). Can use non-atomic bitops now (and
742 * we obviously don't have to worry about waking up a process
743 * waiting on the page lock, because there are no references.
744 */
745 __clear_page_locked(page);
641free_it: 746free_it:
642 nr_reclaimed++; 747 nr_reclaimed++;
643 if (!pagevec_add(&freed_pvec, page)) { 748 if (!pagevec_add(&freed_pvec, page)) {
@@ -646,14 +751,23 @@ free_it:
646 } 751 }
647 continue; 752 continue;
648 753
754cull_mlocked:
755 unlock_page(page);
756 putback_lru_page(page);
757 continue;
758
649activate_locked: 759activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page);
763 VM_BUG_ON(PageActive(page));
650 SetPageActive(page); 764 SetPageActive(page);
651 pgactivate++; 765 pgactivate++;
652keep_locked: 766keep_locked:
653 unlock_page(page); 767 unlock_page(page);
654keep: 768keep:
655 list_add(&page->lru, &ret_pages); 769 list_add(&page->lru, &ret_pages);
656 VM_BUG_ON(PageLRU(page)); 770 VM_BUG_ON(PageLRU(page) || PageUnevictable(page));
657 } 771 }
658 list_splice(&ret_pages, page_list); 772 list_splice(&ret_pages, page_list);
659 if (pagevec_count(&freed_pvec)) 773 if (pagevec_count(&freed_pvec))
@@ -677,7 +791,7 @@ keep:
677 * 791 *
678 * returns 0 on success, -ve errno on failure. 792 * returns 0 on success, -ve errno on failure.
679 */ 793 */
680int __isolate_lru_page(struct page *page, int mode) 794int __isolate_lru_page(struct page *page, int mode, int file)
681{ 795{
682 int ret = -EINVAL; 796 int ret = -EINVAL;
683 797
@@ -693,6 +807,17 @@ int __isolate_lru_page(struct page *page, int mode)
693 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) 807 if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
694 return ret; 808 return ret;
695 809
810 if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file))
811 return ret;
812
813 /*
814 * When this function is being called for lumpy reclaim, we
815 * initially look into all LRU pages, active, inactive and
816 * unevictable; only give shrink_page_list evictable pages.
817 */
818 if (PageUnevictable(page))
819 return ret;
820
696 ret = -EBUSY; 821 ret = -EBUSY;
697 if (likely(get_page_unless_zero(page))) { 822 if (likely(get_page_unless_zero(page))) {
698 /* 823 /*
@@ -723,12 +848,13 @@ int __isolate_lru_page(struct page *page, int mode)
723 * @scanned: The number of pages that were scanned. 848 * @scanned: The number of pages that were scanned.
724 * @order: The caller's attempted allocation order 849 * @order: The caller's attempted allocation order
725 * @mode: One of the LRU isolation modes 850 * @mode: One of the LRU isolation modes
851 * @file: True [1] if isolating file [!anon] pages
726 * 852 *
727 * returns how many pages were moved onto *@dst. 853 * returns how many pages were moved onto *@dst.
728 */ 854 */
729static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 855static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
730 struct list_head *src, struct list_head *dst, 856 struct list_head *src, struct list_head *dst,
731 unsigned long *scanned, int order, int mode) 857 unsigned long *scanned, int order, int mode, int file)
732{ 858{
733 unsigned long nr_taken = 0; 859 unsigned long nr_taken = 0;
734 unsigned long scan; 860 unsigned long scan;
@@ -745,7 +871,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
745 871
746 VM_BUG_ON(!PageLRU(page)); 872 VM_BUG_ON(!PageLRU(page));
747 873
748 switch (__isolate_lru_page(page, mode)) { 874 switch (__isolate_lru_page(page, mode, file)) {
749 case 0: 875 case 0:
750 list_move(&page->lru, dst); 876 list_move(&page->lru, dst);
751 nr_taken++; 877 nr_taken++;
@@ -788,10 +914,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
788 break; 914 break;
789 915
790 cursor_page = pfn_to_page(pfn); 916 cursor_page = pfn_to_page(pfn);
917
791 /* Check that we have not crossed a zone boundary. */ 918 /* Check that we have not crossed a zone boundary. */
792 if (unlikely(page_zone_id(cursor_page) != zone_id)) 919 if (unlikely(page_zone_id(cursor_page) != zone_id))
793 continue; 920 continue;
794 switch (__isolate_lru_page(cursor_page, mode)) { 921 switch (__isolate_lru_page(cursor_page, mode, file)) {
795 case 0: 922 case 0:
796 list_move(&cursor_page->lru, dst); 923 list_move(&cursor_page->lru, dst);
797 nr_taken++; 924 nr_taken++;
@@ -802,7 +929,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
802 /* else it is being freed elsewhere */ 929 /* else it is being freed elsewhere */
803 list_move(&cursor_page->lru, src); 930 list_move(&cursor_page->lru, src);
804 default: 931 default:
805 break; 932 break; /* ! on LRU or wrong list */
806 } 933 }
807 } 934 }
808 } 935 }
@@ -816,40 +943,93 @@ static unsigned long isolate_pages_global(unsigned long nr,
816 unsigned long *scanned, int order, 943 unsigned long *scanned, int order,
817 int mode, struct zone *z, 944 int mode, struct zone *z,
818 struct mem_cgroup *mem_cont, 945 struct mem_cgroup *mem_cont,
819 int active) 946 int active, int file)
820{ 947{
948 int lru = LRU_BASE;
821 if (active) 949 if (active)
822 return isolate_lru_pages(nr, &z->active_list, dst, 950 lru += LRU_ACTIVE;
823 scanned, order, mode); 951 if (file)
824 else 952 lru += LRU_FILE;
825 return isolate_lru_pages(nr, &z->inactive_list, dst, 953 return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order,
826 scanned, order, mode); 954 mode, !!file);
827} 955}
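isolate_pages_global() composes the target list index from LRU_BASE plus the active and file offsets. A tiny standalone sketch of that arithmetic, assuming the enum layout this series introduces (LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2, with the four evictable lists laid out in that order); in the kernel, page_is_file_cache() is assumed to return LRU_FILE itself for pagecache-backed pages, which is why callers can add its result straight into the index:

/* LRU list index arithmetic; enum values assumed from this patch series. */
#include <stdio.h>

enum { LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2 };

static const char *lru_name[] = {
        "LRU_INACTIVE_ANON", "LRU_ACTIVE_ANON",
        "LRU_INACTIVE_FILE", "LRU_ACTIVE_FILE",
};

int main(void)
{
        int active, file;

        for (file = 0; file <= 1; file++)
                for (active = 0; active <= 1; active++) {
                        int lru = LRU_BASE + active * LRU_ACTIVE + file * LRU_FILE;

                        printf("active=%d file=%d -> list %d (%s)\n",
                               active, file, lru, lru_name[lru]);
                }
        return 0;
}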
828 956
829/* 957/*
830 * clear_active_flags() is a helper for shrink_active_list(), clearing 958 * clear_active_flags() is a helper for shrink_active_list(), clearing
831 * any active bits from the pages in the list. 959 * any active bits from the pages in the list.
832 */ 960 */
833static unsigned long clear_active_flags(struct list_head *page_list) 961static unsigned long clear_active_flags(struct list_head *page_list,
962 unsigned int *count)
834{ 963{
835 int nr_active = 0; 964 int nr_active = 0;
965 int lru;
836 struct page *page; 966 struct page *page;
837 967
838 list_for_each_entry(page, page_list, lru) 968 list_for_each_entry(page, page_list, lru) {
969 lru = page_is_file_cache(page);
839 if (PageActive(page)) { 970 if (PageActive(page)) {
971 lru += LRU_ACTIVE;
840 ClearPageActive(page); 972 ClearPageActive(page);
841 nr_active++; 973 nr_active++;
842 } 974 }
975 count[lru]++;
976 }
843 977
844 return nr_active; 978 return nr_active;
845} 979}
846 980
981/**
982 * isolate_lru_page - tries to isolate a page from its LRU list
983 * @page: page to isolate from its LRU list
984 *
985 * Isolates a @page from an LRU list, clears PageLRU and adjusts the
986 * vmstat statistic corresponding to whatever LRU list the page was on.
987 *
988 * Returns 0 if the page was removed from an LRU list.
989 * Returns -EBUSY if the page was not on an LRU list.
990 *
 991 * On return, the page will have PageLRU() cleared. If it was found on
992 * the active list, it will have PageActive set. If it was found on
993 * the unevictable list, it will have the PageUnevictable bit set. That flag
994 * may need to be cleared by the caller before letting the page go.
995 *
996 * The vmstat statistic corresponding to the list on which the page was
997 * found will be decremented.
998 *
999 * Restrictions:
1000 * (1) Must be called with an elevated refcount on the page. This is a
 1001 * fundamental difference from isolate_lru_pages (which is called
1002 * without a stable reference).
1003 * (2) the lru_lock must not be held.
1004 * (3) interrupts must be enabled.
1005 */
1006int isolate_lru_page(struct page *page)
1007{
1008 int ret = -EBUSY;
1009
1010 if (PageLRU(page)) {
1011 struct zone *zone = page_zone(page);
1012
1013 spin_lock_irq(&zone->lru_lock);
1014 if (PageLRU(page) && get_page_unless_zero(page)) {
1015 int lru = page_lru(page);
1016 ret = 0;
1017 ClearPageLRU(page);
1018
1019 del_page_from_lru_list(zone, page, lru);
1020 }
1021 spin_unlock_irq(&zone->lru_lock);
1022 }
1023 return ret;
1024}
1025
847/* 1026/*
848 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number 1027 * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
849 * of reclaimed pages 1028 * of reclaimed pages
850 */ 1029 */
851static unsigned long shrink_inactive_list(unsigned long max_scan, 1030static unsigned long shrink_inactive_list(unsigned long max_scan,
852 struct zone *zone, struct scan_control *sc) 1031 struct zone *zone, struct scan_control *sc,
1032 int priority, int file)
853{ 1033{
854 LIST_HEAD(page_list); 1034 LIST_HEAD(page_list);
855 struct pagevec pvec; 1035 struct pagevec pvec;
@@ -866,20 +1046,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
866 unsigned long nr_scan; 1046 unsigned long nr_scan;
867 unsigned long nr_freed; 1047 unsigned long nr_freed;
868 unsigned long nr_active; 1048 unsigned long nr_active;
1049 unsigned int count[NR_LRU_LISTS] = { 0, };
1050 int mode = ISOLATE_INACTIVE;
1051
1052 /*
1053 * If we need a large contiguous chunk of memory, or have
1054 * trouble getting a small set of contiguous pages, we
1055 * will reclaim both active and inactive pages.
1056 *
1057 * We use the same threshold as pageout congestion_wait below.
1058 */
1059 if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
1060 mode = ISOLATE_BOTH;
1061 else if (sc->order && priority < DEF_PRIORITY - 2)
1062 mode = ISOLATE_BOTH;
869 1063
870 nr_taken = sc->isolate_pages(sc->swap_cluster_max, 1064 nr_taken = sc->isolate_pages(sc->swap_cluster_max,
871 &page_list, &nr_scan, sc->order, 1065 &page_list, &nr_scan, sc->order, mode,
872 (sc->order > PAGE_ALLOC_COSTLY_ORDER)? 1066 zone, sc->mem_cgroup, 0, file);
873 ISOLATE_BOTH : ISOLATE_INACTIVE, 1067 nr_active = clear_active_flags(&page_list, count);
874 zone, sc->mem_cgroup, 0);
875 nr_active = clear_active_flags(&page_list);
876 __count_vm_events(PGDEACTIVATE, nr_active); 1068 __count_vm_events(PGDEACTIVATE, nr_active);
877 1069
878 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); 1070 __mod_zone_page_state(zone, NR_ACTIVE_FILE,
879 __mod_zone_page_state(zone, NR_INACTIVE, 1071 -count[LRU_ACTIVE_FILE]);
880 -(nr_taken - nr_active)); 1072 __mod_zone_page_state(zone, NR_INACTIVE_FILE,
881 if (scan_global_lru(sc)) 1073 -count[LRU_INACTIVE_FILE]);
1074 __mod_zone_page_state(zone, NR_ACTIVE_ANON,
1075 -count[LRU_ACTIVE_ANON]);
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]);
1078
1079 if (scan_global_lru(sc)) {
882 zone->pages_scanned += nr_scan; 1080 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1085 }
883 spin_unlock_irq(&zone->lru_lock); 1086 spin_unlock_irq(&zone->lru_lock);
884 1087
885 nr_scanned += nr_scan; 1088 nr_scanned += nr_scan;
@@ -899,7 +1102,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
899 * The attempt at page out may have made some 1102 * The attempt at page out may have made some
900 * of the pages active, mark them inactive again. 1103 * of the pages active, mark them inactive again.
901 */ 1104 */
902 nr_active = clear_active_flags(&page_list); 1105 nr_active = clear_active_flags(&page_list, count);
903 count_vm_events(PGDEACTIVATE, nr_active); 1106 count_vm_events(PGDEACTIVATE, nr_active);
904 1107
905 nr_freed += shrink_page_list(&page_list, sc, 1108 nr_freed += shrink_page_list(&page_list, sc,
@@ -924,14 +1127,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
924 * Put back any unfreeable pages. 1127 * Put back any unfreeable pages.
925 */ 1128 */
926 while (!list_empty(&page_list)) { 1129 while (!list_empty(&page_list)) {
1130 int lru;
927 page = lru_to_page(&page_list); 1131 page = lru_to_page(&page_list);
928 VM_BUG_ON(PageLRU(page)); 1132 VM_BUG_ON(PageLRU(page));
929 SetPageLRU(page);
930 list_del(&page->lru); 1133 list_del(&page->lru);
931 if (PageActive(page)) 1134 if (unlikely(!page_evictable(page, NULL))) {
932 add_page_to_active_list(zone, page); 1135 spin_unlock_irq(&zone->lru_lock);
933 else 1136 putback_lru_page(page);
934 add_page_to_inactive_list(zone, page); 1137 spin_lock_irq(&zone->lru_lock);
1138 continue;
1139 }
1140 SetPageLRU(page);
1141 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru);
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++;
1147 }
935 if (!pagevec_add(&pvec, page)) { 1148 if (!pagevec_add(&pvec, page)) {
936 spin_unlock_irq(&zone->lru_lock); 1149 spin_unlock_irq(&zone->lru_lock);
937 __pagevec_release(&pvec); 1150 __pagevec_release(&pvec);
@@ -962,115 +1175,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
962 1175
963static inline int zone_is_near_oom(struct zone *zone) 1176static inline int zone_is_near_oom(struct zone *zone)
964{ 1177{
965 return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) 1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
966 + zone_page_state(zone, NR_INACTIVE))*3;
967}
968
969/*
970 * Determine we should try to reclaim mapped pages.
971 * This is called only when sc->mem_cgroup is NULL.
972 */
973static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
974 int priority)
975{
976 long mapped_ratio;
977 long distress;
978 long swap_tendency;
979 long imbalance;
980 int reclaim_mapped = 0;
981 int prev_priority;
982
983 if (scan_global_lru(sc) && zone_is_near_oom(zone))
984 return 1;
985 /*
986 * `distress' is a measure of how much trouble we're having
987 * reclaiming pages. 0 -> no problems. 100 -> great trouble.
988 */
989 if (scan_global_lru(sc))
990 prev_priority = zone->prev_priority;
991 else
992 prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
993
994 distress = 100 >> min(prev_priority, priority);
995
996 /*
997 * The point of this algorithm is to decide when to start
998 * reclaiming mapped memory instead of just pagecache. Work out
999 * how much memory
1000 * is mapped.
1001 */
1002 if (scan_global_lru(sc))
1003 mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
1004 global_page_state(NR_ANON_PAGES)) * 100) /
1005 vm_total_pages;
1006 else
1007 mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
1008
1009 /*
1010 * Now decide how much we really want to unmap some pages. The
1011 * mapped ratio is downgraded - just because there's a lot of
1012 * mapped memory doesn't necessarily mean that page reclaim
1013 * isn't succeeding.
1014 *
1015 * The distress ratio is important - we don't want to start
1016 * going oom.
1017 *
1018 * A 100% value of vm_swappiness overrides this algorithm
1019 * altogether.
1020 */
1021 swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
1022
1023 /*
1024 * If there's huge imbalance between active and inactive
1025 * (think active 100 times larger than inactive) we should
1026 * become more permissive, or the system will take too much
1027 * cpu before it start swapping during memory pressure.
1028 * Distress is about avoiding early-oom, this is about
1029 * making swappiness graceful despite setting it to low
1030 * values.
1031 *
1032 * Avoid div by zero with nr_inactive+1, and max resulting
1033 * value is vm_total_pages.
1034 */
1035 if (scan_global_lru(sc)) {
1036 imbalance = zone_page_state(zone, NR_ACTIVE);
1037 imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
1038 } else
1039 imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
1040
1041 /*
1042 * Reduce the effect of imbalance if swappiness is low,
1043 * this means for a swappiness very low, the imbalance
1044 * must be much higher than 100 for this logic to make
1045 * the difference.
1046 *
1047 * Max temporary value is vm_total_pages*100.
1048 */
1049 imbalance *= (vm_swappiness + 1);
1050 imbalance /= 100;
1051
1052 /*
1053 * If not much of the ram is mapped, makes the imbalance
1054 * less relevant, it's high priority we refill the inactive
1055 * list with mapped pages only in presence of high ratio of
1056 * mapped pages.
1057 *
1058 * Max temporary value is vm_total_pages*100.
1059 */
1060 imbalance *= mapped_ratio;
1061 imbalance /= 100;
1062
1063 /* apply imbalance feedback to swap_tendency */
1064 swap_tendency += imbalance;
1065
1066 /*
1067 * Now use this metric to decide whether to start moving mapped
1068 * memory onto the inactive list.
1069 */
1070 if (swap_tendency >= 100)
1071 reclaim_mapped = 1;
1072
1073 return reclaim_mapped;
1074} 1179}
1075 1180
1076/* 1181/*
@@ -1093,53 +1198,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
1093 1198
1094 1199
1095static void shrink_active_list(unsigned long nr_pages, struct zone *zone, 1200static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1096 struct scan_control *sc, int priority) 1201 struct scan_control *sc, int priority, int file)
1097{ 1202{
1098 unsigned long pgmoved; 1203 unsigned long pgmoved;
1099 int pgdeactivate = 0; 1204 int pgdeactivate = 0;
1100 unsigned long pgscanned; 1205 unsigned long pgscanned;
1101 LIST_HEAD(l_hold); /* The pages which were snipped off */ 1206 LIST_HEAD(l_hold); /* The pages which were snipped off */
1102 LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ 1207 LIST_HEAD(l_inactive);
1103 LIST_HEAD(l_active); /* Pages to go onto the active_list */
1104 struct page *page; 1208 struct page *page;
1105 struct pagevec pvec; 1209 struct pagevec pvec;
1106 int reclaim_mapped = 0; 1210 enum lru_list lru;
1107
1108 if (sc->may_swap)
1109 reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
1110 1211
1111 lru_add_drain(); 1212 lru_add_drain();
1112 spin_lock_irq(&zone->lru_lock); 1213 spin_lock_irq(&zone->lru_lock);
1113 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, 1214 pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
1114 ISOLATE_ACTIVE, zone, 1215 ISOLATE_ACTIVE, zone,
1115 sc->mem_cgroup, 1); 1216 sc->mem_cgroup, 1, file);
1116 /* 1217 /*
1117 * zone->pages_scanned is used to detect a zone's OOM condition; 1218 * zone->pages_scanned is used to detect a zone's OOM condition;
1118 * mem_cgroup remembers nr_scan by itself. 1219 * mem_cgroup remembers nr_scan by itself.
1119 */ 1220 */
1120 if (scan_global_lru(sc)) 1221 if (scan_global_lru(sc)) {
1121 zone->pages_scanned += pgscanned; 1222 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 }
1122 1225
1123 __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); 1226 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
1228 else
1229 __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved);
1124 spin_unlock_irq(&zone->lru_lock); 1230 spin_unlock_irq(&zone->lru_lock);
1125 1231
1232 pgmoved = 0;
1126 while (!list_empty(&l_hold)) { 1233 while (!list_empty(&l_hold)) {
1127 cond_resched(); 1234 cond_resched();
1128 page = lru_to_page(&l_hold); 1235 page = lru_to_page(&l_hold);
1129 list_del(&page->lru); 1236 list_del(&page->lru);
1130 if (page_mapped(page)) { 1237
1131 if (!reclaim_mapped || 1238 if (unlikely(!page_evictable(page, NULL))) {
1132 (total_swap_pages == 0 && PageAnon(page)) || 1239 putback_lru_page(page);
1133 page_referenced(page, 0, sc->mem_cgroup)) { 1240 continue;
1134 list_add(&page->lru, &l_active);
1135 continue;
1136 }
1137 } 1241 }
1242
1243 /* page_referenced clears PageReferenced */
1244 if (page_mapping_inuse(page) &&
1245 page_referenced(page, 0, sc->mem_cgroup))
1246 pgmoved++;
1247
1138 list_add(&page->lru, &l_inactive); 1248 list_add(&page->lru, &l_inactive);
1139 } 1249 }
1140 1250
1251 /*
1252 * Count referenced pages from currently used mappings as
1253 * rotated, even though they are moved to the inactive list.
1254 * This helps balance scan pressure between file and anonymous
1255 * pages in get_scan_ratio.
1256 */
1257 zone->recent_rotated[!!file] += pgmoved;
1258
1259 /*
1260 * Move the pages to the [file or anon] inactive list.
1261 */
1141 pagevec_init(&pvec, 1); 1262 pagevec_init(&pvec, 1);
1263
1142 pgmoved = 0; 1264 pgmoved = 0;
1265 lru = LRU_BASE + file * LRU_FILE;
1143 spin_lock_irq(&zone->lru_lock); 1266 spin_lock_irq(&zone->lru_lock);
1144 while (!list_empty(&l_inactive)) { 1267 while (!list_empty(&l_inactive)) {
1145 page = lru_to_page(&l_inactive); 1268 page = lru_to_page(&l_inactive);
@@ -1149,11 +1272,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1149 VM_BUG_ON(!PageActive(page)); 1272 VM_BUG_ON(!PageActive(page));
1150 ClearPageActive(page); 1273 ClearPageActive(page);
1151 1274
1152 list_move(&page->lru, &zone->inactive_list); 1275 list_move(&page->lru, &zone->lru[lru].list);
1153 mem_cgroup_move_lists(page, false); 1276 mem_cgroup_move_lists(page, lru);
1154 pgmoved++; 1277 pgmoved++;
1155 if (!pagevec_add(&pvec, page)) { 1278 if (!pagevec_add(&pvec, page)) {
1156 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1157 spin_unlock_irq(&zone->lru_lock); 1280 spin_unlock_irq(&zone->lru_lock);
1158 pgdeactivate += pgmoved; 1281 pgdeactivate += pgmoved;
1159 pgmoved = 0; 1282 pgmoved = 0;
@@ -1163,104 +1286,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1163 spin_lock_irq(&zone->lru_lock); 1286 spin_lock_irq(&zone->lru_lock);
1164 } 1287 }
1165 } 1288 }
1166 __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); 1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1167 pgdeactivate += pgmoved; 1290 pgdeactivate += pgmoved;
1168 if (buffer_heads_over_limit) { 1291 if (buffer_heads_over_limit) {
1169 spin_unlock_irq(&zone->lru_lock); 1292 spin_unlock_irq(&zone->lru_lock);
1170 pagevec_strip(&pvec); 1293 pagevec_strip(&pvec);
1171 spin_lock_irq(&zone->lru_lock); 1294 spin_lock_irq(&zone->lru_lock);
1172 } 1295 }
1173
1174 pgmoved = 0;
1175 while (!list_empty(&l_active)) {
1176 page = lru_to_page(&l_active);
1177 prefetchw_prev_lru_page(page, &l_active, flags);
1178 VM_BUG_ON(PageLRU(page));
1179 SetPageLRU(page);
1180 VM_BUG_ON(!PageActive(page));
1181
1182 list_move(&page->lru, &zone->active_list);
1183 mem_cgroup_move_lists(page, true);
1184 pgmoved++;
1185 if (!pagevec_add(&pvec, page)) {
1186 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1187 pgmoved = 0;
1188 spin_unlock_irq(&zone->lru_lock);
1189 __pagevec_release(&pvec);
1190 spin_lock_irq(&zone->lru_lock);
1191 }
1192 }
1193 __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
1194
1195 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1296 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1196 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1297 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1197 spin_unlock_irq(&zone->lru_lock); 1298 spin_unlock_irq(&zone->lru_lock);
1299 if (vm_swap_full())
1300 pagevec_swap_free(&pvec);
1198 1301
1199 pagevec_release(&pvec); 1302 pagevec_release(&pvec);
1200} 1303}
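The putback and statistics updates above lean on the split-LRU index arithmetic: lru = LRU_BASE + file * LRU_FILE selects the per-type inactive list, and NR_LRU_BASE + lru names the matching zone counter. A stand-alone sketch of that encoding (not part of the patch; it assumes LRU_BASE == 0, LRU_ACTIVE == 1 and LRU_FILE == 2, matching the mmzone.h side of this series):

/*
 * Stand-alone sketch of the split-LRU index arithmetic.  The enum values
 * below are assumptions mirroring mmzone.h, not the kernel definitions.
 */
#include <assert.h>
#include <stdio.h>

enum { LRU_BASE = 0, LRU_ACTIVE = 1, LRU_FILE = 2 };

enum lru_list {
	LRU_INACTIVE_ANON = LRU_BASE,
	LRU_ACTIVE_ANON   = LRU_BASE + LRU_ACTIVE,
	LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
	LRU_ACTIVE_FILE   = LRU_BASE + LRU_FILE + LRU_ACTIVE
};

int main(void)
{
	int file, active;

	for (file = 0; file <= 1; file++)
		for (active = 0; active <= 1; active++) {
			/* same formula the patch uses to pick a list */
			int lru = LRU_BASE + file * LRU_FILE + active * LRU_ACTIVE;

			printf("file=%d active=%d -> lru index %d\n",
			       file, active, lru);
		}

	/* the inactive target used in shrink_active_list: no LRU_ACTIVE term */
	assert(LRU_BASE + 1 * LRU_FILE == LRU_INACTIVE_FILE);
	assert(LRU_BASE + 0 * LRU_FILE == LRU_INACTIVE_ANON);
	return 0;
}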
1201 1304
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority)
1307{
1308 int file = is_file_lru(lru);
1309
1310 if (lru == LRU_ACTIVE_FILE) {
1311 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1312 return 0;
1313 }
1314
1315 if (lru == LRU_ACTIVE_ANON &&
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0;
1319 }
1320 return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
1321}
1322
1323/*
1324 * Determine how aggressively the anon and file LRU lists should be
1325 * scanned. The relative value of each set of LRU lists is determined
1326 * by looking at the fraction of the pages scanned we did rotate back
1327 * onto the active list instead of evict.
1328 *
1329 * percent[0] specifies how much pressure to put on ram/swap backed
1330 * memory, while percent[1] determines pressure on the file LRUs.
1331 */
1332static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1333 unsigned long *percent)
1334{
1335 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp;
1338
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344
1345 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) {
1347 percent[0] = 0;
1348 percent[1] = 100;
1349 return;
1350 }
1351
1352 /* If we have very few page cache pages, force-scan anon pages. */
1353 if (unlikely(file + free <= zone->pages_high)) {
1354 percent[0] = 100;
1355 percent[1] = 0;
1356 return;
1357 }
1358
1359 /*
1360 * OK, so we have swap space and a fair amount of page cache
1361 * pages. We use the recently rotated / recently scanned
1362 * ratios to determine how valuable each cache is.
1363 *
1364 * Because workloads change over time (and to avoid overflow)
1365 * we keep these statistics as a floating average, which ends
1366 * up weighing recent references more than old ones.
1367 *
1368 * anon in [0], file in [1]
1369 */
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock);
1375 }
1376
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock);
1382 }
1383
1384 /*
1385 * With swappiness at 100, anonymous and file have the same priority.
1386 * This scanning priority is essentially the inverse of IO cost.
1387 */
1388 anon_prio = sc->swappiness;
1389 file_prio = 200 - sc->swappiness;
1390
1391 /*
1392 * The amount of pressure on anon vs file pages is inversely
1393 * proportional to the fraction of recently scanned pages on
1394 * each list that were recently referenced and in active use.
1395 */
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1;
1398
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1;
1401
1402 /* Normalize to percentages */
1403 percent[0] = 100 * ap / (ap + fp + 1);
1404 percent[1] = 100 - percent[0];
1405}
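The normalization at the end of get_scan_ratio() is easiest to follow with concrete numbers. The stand-alone sketch below (not part of the patch; the scanned/rotated counts are made up and swappiness uses the common default of 60) reproduces the same integer arithmetic and shows how rotation feedback shifts pressure between the anon and file lists:

/*
 * Stand-alone sketch of the percent[] arithmetic in get_scan_ratio().
 * All sample numbers are invented for illustration.
 */
#include <stdio.h>

static void scan_ratio(unsigned long swappiness,
		       unsigned long anon_scanned, unsigned long anon_rotated,
		       unsigned long file_scanned, unsigned long file_rotated,
		       unsigned long *percent)
{
	unsigned long anon_prio = swappiness;
	unsigned long file_prio = 200 - swappiness;
	unsigned long ap, fp;

	/* pressure ~ priority * scanned / rotated, as in the patch */
	ap = (anon_prio + 1) * (anon_scanned + 1) / (anon_rotated + 1);
	fp = (file_prio + 1) * (file_scanned + 1) / (file_rotated + 1);

	percent[0] = 100 * ap / (ap + fp + 1);	/* anon */
	percent[1] = 100 - percent[0];		/* file */
}

int main(void)
{
	unsigned long percent[2];

	/* file pages keep getting rotated back (heavily referenced) */
	scan_ratio(60, 1000, 100, 1000, 900, percent);
	printf("anon %lu%% / file %lu%%\n", percent[0], percent[1]);

	/* anon pages keep getting rotated back instead */
	scan_ratio(60, 1000, 900, 1000, 100, percent);
	printf("anon %lu%% / file %lu%%\n", percent[0], percent[1]);
	return 0;
}

With the file list rotating heavily, the first case puts roughly 79% of the pressure on anon; reversing the rotation counts drops that to roughly 4%.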
1406
1407
1202/* 1408/*
1203 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1204 */ 1410 */
1205static unsigned long shrink_zone(int priority, struct zone *zone, 1411static unsigned long shrink_zone(int priority, struct zone *zone,
1206 struct scan_control *sc) 1412 struct scan_control *sc)
1207{ 1413{
1208 unsigned long nr_active; 1414 unsigned long nr[NR_LRU_LISTS];
1209 unsigned long nr_inactive;
1210 unsigned long nr_to_scan; 1415 unsigned long nr_to_scan;
1211 unsigned long nr_reclaimed = 0; 1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l;
1212 1419
1213 if (scan_global_lru(sc)) { 1420 get_scan_ratio(zone, sc, percent);
1214 /*
1215 * Add one to nr_to_scan just to make sure that the kernel
1216 * will slowly sift through the active list.
1217 */
1218 zone->nr_scan_active +=
1219 (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
1220 nr_active = zone->nr_scan_active;
1221 zone->nr_scan_inactive +=
1222 (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
1223 nr_inactive = zone->nr_scan_inactive;
1224 if (nr_inactive >= sc->swap_cluster_max)
1225 zone->nr_scan_inactive = 0;
1226 else
1227 nr_inactive = 0;
1228
1229 if (nr_active >= sc->swap_cluster_max)
1230 zone->nr_scan_active = 0;
1231 else
1232 nr_active = 0;
1233 } else {
1234 /*
1235 * This reclaim occurs not because zone memory shortage but
1236 * because memory controller hits its limit.
1237 * Then, don't modify zone reclaim related data.
1238 */
1239 nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
1240 zone, priority);
1241
1242 nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
1243 zone, priority);
1244 }
1245 1421
1422 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) {
1424 int file = is_file_lru(l);
1425 int scan;
1246 1426
1247 while (nr_active || nr_inactive) { 1427 scan = zone_page_state(zone, NR_LRU_BASE + l);
1248 if (nr_active) { 1428 if (priority) {
1249 nr_to_scan = min(nr_active, 1429 scan >>= priority;
1250 (unsigned long)sc->swap_cluster_max); 1430 scan = (scan * percent[file]) / 100;
1251 nr_active -= nr_to_scan; 1431 }
1252 shrink_active_list(nr_to_scan, zone, sc, priority); 1432 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max)
1435 zone->lru[l].nr_scan = 0;
1436 else
1437 nr[l] = 0;
1438 } else {
1439 /*
1440 * This reclaim occurs not because of a zone memory shortage
1441 * but because the memory controller hit its limit.
1442 * Don't modify zone reclaim related data.
1443 */
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1253 } 1446 }
1447 }
1254 1448
1255 if (nr_inactive) { 1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1256 nr_to_scan = min(nr_inactive, 1450 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) {
1452 if (nr[l]) {
1453 nr_to_scan = min(nr[l],
1257 (unsigned long)sc->swap_cluster_max); 1454 (unsigned long)sc->swap_cluster_max);
1258 nr_inactive -= nr_to_scan; 1455 nr[l] -= nr_to_scan;
1259 nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, 1456
1260 sc); 1457 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority);
1459 }
1261 } 1460 }
1262 } 1461 }
1263 1462
1463 /*
1464 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio.
1466 */
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471
1264 throttle_vm_writeout(sc->gfp_mask); 1472 throttle_vm_writeout(sc->gfp_mask);
1265 return nr_reclaimed; 1473 return nr_reclaimed;
1266} 1474}
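shrink_zone() now derives a per-list scan target from three inputs: the list size shifted right by the reclaim priority, the percentage from get_scan_ratio(), and the zone->lru[l].nr_scan accumulator that defers work until at least swap_cluster_max pages are due. A stand-alone sketch of that accumulation (not part of the patch; SWAP_CLUSTER_MAX == 32 is assumed and the list size and percentage are made up):

/*
 * Stand-alone sketch of the per-list scan budget in shrink_zone().
 * Sample numbers are invented for illustration.
 */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

int main(void)
{
	unsigned long list_pages = 200000;	/* pages on one LRU list */
	unsigned long percent = 35;		/* from get_scan_ratio() */
	unsigned long nr_scan = 0;		/* zone->lru[l].nr_scan */
	int priority;

	for (priority = 12; priority >= 0; priority--) {
		unsigned long scan = list_pages;

		if (priority) {
			scan >>= priority;
			scan = scan * percent / 100;
		}
		nr_scan += scan;
		if (nr_scan >= SWAP_CLUSTER_MAX) {
			printf("priority %2d: scan %lu pages this pass\n",
			       priority, nr_scan);
			nr_scan = 0;
		} else {
			printf("priority %2d: defer (%lu accumulated)\n",
			       priority, nr_scan);
		}
	}
	return 0;
}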
@@ -1321,7 +1529,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1321 1529
1322 return nr_reclaimed; 1530 return nr_reclaimed;
1323} 1531}
1324 1532
1325/* 1533/*
1326 * This is the main entry point to direct page reclaim. 1534 * This is the main entry point to direct page reclaim.
1327 * 1535 *
@@ -1364,8 +1572,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1364 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1365 continue; 1573 continue;
1366 1574
1367 lru_pages += zone_page_state(zone, NR_ACTIVE) 1575 lru_pages += zone_lru_pages(zone);
1368 + zone_page_state(zone, NR_INACTIVE);
1369 } 1576 }
1370 } 1577 }
1371 1578
@@ -1555,6 +1762,14 @@ loop_again:
1555 priority != DEF_PRIORITY) 1762 priority != DEF_PRIORITY)
1556 continue; 1763 continue;
1557 1764
1765 /*
1766 * Do some background aging of the anon list, to give
1767 * pages a chance to be referenced before reclaiming.
1768 */
1769 if (inactive_anon_is_low(zone))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0);
1772
1558 if (!zone_watermark_ok(zone, order, zone->pages_high, 1773 if (!zone_watermark_ok(zone, order, zone->pages_high,
1559 0, 0)) { 1774 0, 0)) {
1560 end_zone = i; 1775 end_zone = i;
@@ -1567,8 +1782,7 @@ loop_again:
1567 for (i = 0; i <= end_zone; i++) { 1782 for (i = 0; i <= end_zone; i++) {
1568 struct zone *zone = pgdat->node_zones + i; 1783 struct zone *zone = pgdat->node_zones + i;
1569 1784
1570 lru_pages += zone_page_state(zone, NR_ACTIVE) 1785 lru_pages += zone_lru_pages(zone);
1571 + zone_page_state(zone, NR_INACTIVE);
1572 } 1786 }
1573 1787
1574 /* 1788 /*
@@ -1612,8 +1826,7 @@ loop_again:
1612 if (zone_is_all_unreclaimable(zone)) 1826 if (zone_is_all_unreclaimable(zone))
1613 continue; 1827 continue;
1614 if (nr_slab == 0 && zone->pages_scanned >= 1828 if (nr_slab == 0 && zone->pages_scanned >=
1615 (zone_page_state(zone, NR_ACTIVE) 1829 (zone_lru_pages(zone) * 6))
1616 + zone_page_state(zone, NR_INACTIVE)) * 6)
1617 zone_set_flag(zone, 1830 zone_set_flag(zone,
1618 ZONE_ALL_UNRECLAIMABLE); 1831 ZONE_ALL_UNRECLAIMABLE);
1619 /* 1832 /*
@@ -1667,7 +1880,7 @@ out:
1667 1880
1668/* 1881/*
1669 * The background pageout daemon, started as a kernel thread 1882 * The background pageout daemon, started as a kernel thread
1670 * from the init process. 1883 * from the init process.
1671 * 1884 *
1672 * This basically trickles out pages so that we have _some_ 1885 * This basically trickles out pages so that we have _some_
1673 * free memory available even if there is no other activity 1886 * free memory available even if there is no other activity
@@ -1761,6 +1974,14 @@ void wakeup_kswapd(struct zone *zone, int order)
1761 wake_up_interruptible(&pgdat->kswapd_wait); 1974 wake_up_interruptible(&pgdat->kswapd_wait);
1762} 1975}
1763 1976
1977unsigned long global_lru_pages(void)
1978{
1979 return global_page_state(NR_ACTIVE_ANON)
1980 + global_page_state(NR_ACTIVE_FILE)
1981 + global_page_state(NR_INACTIVE_ANON)
1982 + global_page_state(NR_INACTIVE_FILE);
1983}
1984
1764#ifdef CONFIG_PM 1985#ifdef CONFIG_PM
1765/* 1986/*
1766 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 1987 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
@@ -1774,6 +1995,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1774{ 1995{
1775 struct zone *zone; 1996 struct zone *zone;
1776 unsigned long nr_to_scan, ret = 0; 1997 unsigned long nr_to_scan, ret = 0;
1998 enum lru_list l;
1777 1999
1778 for_each_zone(zone) { 2000 for_each_zone(zone) {
1779 2001
@@ -1783,38 +2005,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
1783 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2005 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
1784 continue; 2006 continue;
1785 2007
1786 /* For pass = 0 we don't shrink the active list */ 2008 for_each_evictable_lru(l) {
1787 if (pass > 0) { 2009 /* For pass = 0, we don't shrink the active list */
1788 zone->nr_scan_active += 2010 if (pass == 0 &&
1789 (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; 2011 (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE))
1790 if (zone->nr_scan_active >= nr_pages || pass > 3) { 2012 continue;
1791 zone->nr_scan_active = 0; 2013
2014 zone->lru[l].nr_scan +=
2015 (zone_page_state(zone, NR_LRU_BASE + l)
2016 >> prio) + 1;
2017 if (zone->lru[l].nr_scan >= nr_pages || pass > 3) {
2018 zone->lru[l].nr_scan = 0;
1792 nr_to_scan = min(nr_pages, 2019 nr_to_scan = min(nr_pages,
1793 zone_page_state(zone, NR_ACTIVE)); 2020 zone_page_state(zone,
1794 shrink_active_list(nr_to_scan, zone, sc, prio); 2021 NR_LRU_BASE + l));
2022 ret += shrink_list(l, nr_to_scan, zone,
2023 sc, prio);
2024 if (ret >= nr_pages)
2025 return ret;
1795 } 2026 }
1796 } 2027 }
1797
1798 zone->nr_scan_inactive +=
1799 (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
1800 if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
1801 zone->nr_scan_inactive = 0;
1802 nr_to_scan = min(nr_pages,
1803 zone_page_state(zone, NR_INACTIVE));
1804 ret += shrink_inactive_list(nr_to_scan, zone, sc);
1805 if (ret >= nr_pages)
1806 return ret;
1807 }
1808 } 2028 }
1809 2029
1810 return ret; 2030 return ret;
1811} 2031}
1812 2032
1813static unsigned long count_lru_pages(void)
1814{
1815 return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
1816}
1817
1818/* 2033/*
1819 * Try to free `nr_pages' of memory, system-wide, and return the number of 2034 * Try to free `nr_pages' of memory, system-wide, and return the number of
1820 * freed pages. 2035 * freed pages.
@@ -1840,7 +2055,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1840 2055
1841 current->reclaim_state = &reclaim_state; 2056 current->reclaim_state = &reclaim_state;
1842 2057
1843 lru_pages = count_lru_pages(); 2058 lru_pages = global_lru_pages();
1844 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); 2059 nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
1845 /* If slab caches are huge, it's better to hit them first */ 2060 /* If slab caches are huge, it's better to hit them first */
1846 while (nr_slab >= lru_pages) { 2061 while (nr_slab >= lru_pages) {
@@ -1883,7 +2098,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1883 2098
1884 reclaim_state.reclaimed_slab = 0; 2099 reclaim_state.reclaimed_slab = 0;
1885 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2100 shrink_slab(sc.nr_scanned, sc.gfp_mask,
1886 count_lru_pages()); 2101 global_lru_pages());
1887 ret += reclaim_state.reclaimed_slab; 2102 ret += reclaim_state.reclaimed_slab;
1888 if (ret >= nr_pages) 2103 if (ret >= nr_pages)
1889 goto out; 2104 goto out;
@@ -1900,7 +2115,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
1900 if (!ret) { 2115 if (!ret) {
1901 do { 2116 do {
1902 reclaim_state.reclaimed_slab = 0; 2117 reclaim_state.reclaimed_slab = 0;
1903 shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); 2118 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
1904 ret += reclaim_state.reclaimed_slab; 2119 ret += reclaim_state.reclaimed_slab;
1905 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2120 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
1906 } 2121 }
@@ -2128,3 +2343,250 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2128 return ret; 2343 return ret;
2129} 2344}
2130#endif 2345#endif
2346
2347#ifdef CONFIG_UNEVICTABLE_LRU
2348/*
2349 * page_evictable - test whether a page is evictable
2350 * @page: the page to test
2351 * @vma: the VMA in which the page is or will be mapped, may be NULL
2352 *
2353 * Test whether page is evictable--i.e., should be placed on active/inactive
2354 * lists vs unevictable list. The vma argument is !NULL when called from the
2355 * fault path to determine how to instantiate a new page.
2356 *
2357 * Reasons page might not be evictable:
2358 * (1) page's mapping marked unevictable
2359 * (2) page is part of an mlocked VMA
2360 *
2361 */
2362int page_evictable(struct page *page, struct vm_area_struct *vma)
2363{
2364
2365 if (mapping_unevictable(page_mapping(page)))
2366 return 0;
2367
2368 if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
2369 return 0;
2370
2371 return 1;
2372}
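page_evictable() boils down to two checks, and the putback paths earlier in this diff simply branch on its result. A stand-alone sketch of that decision (not the kernel function; the two booleans stand in for mapping_unevictable() and the mlock tests):

/*
 * Stand-alone sketch of the evictability decision: a page is kept off
 * the normal LRUs if its mapping is marked unevictable or if it is
 * mlocked.
 */
#include <stdbool.h>
#include <stdio.h>

static bool evictable(bool mapping_unevictable, bool mlocked)
{
	if (mapping_unevictable)
		return false;
	if (mlocked)
		return false;
	return true;
}

int main(void)
{
	printf("ordinary page cache page:   %d\n", evictable(false, false));
	printf("mapping marked unevictable: %d\n", evictable(true, false));
	printf("mlocked page:               %d\n", evictable(false, true));
	return 0;
}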
2373
2374/**
2375 * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
2376 * @page: page to check evictability and move to appropriate lru list
2377 * @zone: zone page is in
2378 *
2379 * Checks a page for evictability and moves the page to the appropriate
2380 * zone lru list.
2381 *
2382 * Restrictions: zone->lru_lock must be held, page must be on LRU and must
2383 * have PageUnevictable set.
2384 */
2385static void check_move_unevictable_page(struct page *page, struct zone *zone)
2386{
2387 VM_BUG_ON(PageActive(page));
2388
2389retry:
2390 ClearPageUnevictable(page);
2391 if (page_evictable(page, NULL)) {
2392 enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page);
2393
2394 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else {
2399 /*
2400 * rotate unevictable list
2401 */
2402 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2404 if (page_evictable(page, NULL))
2405 goto retry;
2406 }
2407}
2408
2409/**
2410 * scan_mapping_unevictable_pages - scan an address space for evictable pages
2411 * @mapping: struct address_space to scan for evictable pages
2412 *
2413 * Scan all pages in mapping. Check unevictable pages for
2414 * evictability and move them to the appropriate zone lru list.
2415 */
2416void scan_mapping_unevictable_pages(struct address_space *mapping)
2417{
2418 pgoff_t next = 0;
2419 pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
2420 PAGE_CACHE_SHIFT;
2421 struct zone *zone;
2422 struct pagevec pvec;
2423
2424 if (mapping->nrpages == 0)
2425 return;
2426
2427 pagevec_init(&pvec, 0);
2428 while (next < end &&
2429 pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
2430 int i;
2431 int pg_scanned = 0;
2432
2433 zone = NULL;
2434
2435 for (i = 0; i < pagevec_count(&pvec); i++) {
2436 struct page *page = pvec.pages[i];
2437 pgoff_t page_index = page->index;
2438 struct zone *pagezone = page_zone(page);
2439
2440 pg_scanned++;
2441 if (page_index > next)
2442 next = page_index;
2443 next++;
2444
2445 if (pagezone != zone) {
2446 if (zone)
2447 spin_unlock_irq(&zone->lru_lock);
2448 zone = pagezone;
2449 spin_lock_irq(&zone->lru_lock);
2450 }
2451
2452 if (PageLRU(page) && PageUnevictable(page))
2453 check_move_unevictable_page(page, zone);
2454 }
2455 if (zone)
2456 spin_unlock_irq(&zone->lru_lock);
2457 pagevec_release(&pvec);
2458
2459 count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
2460 }
2461
2462}
2463
2464/**
2465 * scan_zone_unevictable_pages - check unevictable list for evictable pages
2466 * @zone - zone of which to scan the unevictable list
2467 *
2468 * Scan @zone's unevictable LRU lists to check for pages that have become
2469 * evictable. Move those that have to @zone's inactive list where they
2470 * become candidates for reclaim, unless shrink_inactive_zone() decides
2471 * to reactivate them. Pages that are still unevictable are rotated
2472 * back onto @zone's unevictable list.
2473 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone)
2476{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan;
2479 unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
2480
2481 while (nr_to_scan > 0) {
2482 unsigned long batch_size = min(nr_to_scan,
2483 SCAN_UNEVICTABLE_BATCH_SIZE);
2484
2485 spin_lock_irq(&zone->lru_lock);
2486 for (scan = 0; scan < batch_size; scan++) {
2487 struct page *page = lru_to_page(l_unevictable);
2488
2489 if (!trylock_page(page))
2490 continue;
2491
2492 prefetchw_prev_lru_page(page, l_unevictable, flags);
2493
2494 if (likely(PageLRU(page) && PageUnevictable(page)))
2495 check_move_unevictable_page(page, zone);
2496
2497 unlock_page(page);
2498 }
2499 spin_unlock_irq(&zone->lru_lock);
2500
2501 nr_to_scan -= batch_size;
2502 }
2503}
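scan_zone_unevictable_pages() bounds how long zone->lru_lock is held by processing at most SCAN_UNEVICTABLE_BATCH_SIZE pages per lock acquisition. A stand-alone user-space sketch of that batching pattern (not part of the patch; a pthread mutex stands in for the zone lock and a no-op stands in for check_move_unevictable_page()):

/*
 * Stand-alone sketch of the "take lock, handle a bounded batch, drop
 * lock, repeat" pattern used above.
 */
#include <pthread.h>
#include <stdio.h>

#define BATCH_SIZE 16UL

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static void handle_one_page(void)
{
	/* placeholder for the real per-page work */
}

static void scan_batched(unsigned long nr_to_scan)
{
	while (nr_to_scan > 0) {
		unsigned long i;
		unsigned long batch = nr_to_scan < BATCH_SIZE ?
					nr_to_scan : BATCH_SIZE;

		pthread_mutex_lock(&lock);	/* bounded hold time */
		for (i = 0; i < batch; i++)
			handle_one_page();
		pthread_mutex_unlock(&lock);

		nr_to_scan -= batch;
	}
}

int main(void)
{
	scan_batched(100);
	printf("scanned 100 items in batches of %lu\n", BATCH_SIZE);
	return 0;
}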
2504
2505
2506/**
2507 * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
2508 *
2509 * A really big hammer: scan all zones' unevictable LRU lists to check for
2510 * pages that have become evictable. Move those back to the zones'
2511 * inactive list where they become candidates for reclaim.
2512 * This occurs when, e.g., we have unswappable pages on the unevictable lists,
2513 * and we add swap to the system. As such, it runs in the context of a task
2514 * that has possibly/probably made some previously unevictable pages
2515 * evictable.
2516 */
2517void scan_all_zones_unevictable_pages(void)
2518{
2519 struct zone *zone;
2520
2521 for_each_zone(zone) {
2522 scan_zone_unevictable_pages(zone);
2523 }
2524}
2525
2526/*
2527 * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
2528 * all nodes' unevictable lists for evictable pages
2529 */
2530unsigned long scan_unevictable_pages;
2531
2532int scan_unevictable_handler(struct ctl_table *table, int write,
2533 struct file *file, void __user *buffer,
2534 size_t *length, loff_t *ppos)
2535{
2536 proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
2537
2538 if (write && *(unsigned long *)table->data)
2539 scan_all_zones_unevictable_pages();
2540
2541 scan_unevictable_pages = 0;
2542 return 0;
2543}
2544
2545/*
2546 * per node 'scan_unevictable_pages' attribute. On demand re-scan of
2547 * a specified node's per zone unevictable lists for evictable pages.
2548 */
2549
2550static ssize_t read_scan_unevictable_node(struct sys_device *dev,
2551 struct sysdev_attribute *attr,
2552 char *buf)
2553{
2554 return sprintf(buf, "0\n"); /* always zero; should fit... */
2555}
2556
2557static ssize_t write_scan_unevictable_node(struct sys_device *dev,
2558 struct sysdev_attribute *attr,
2559 const char *buf, size_t count)
2560{
2561 struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
2562 struct zone *zone;
2563 unsigned long res;
2564 unsigned long req = strict_strtoul(buf, 10, &res);
2565
2566 if (!req)
2567 return 1; /* zero is no-op */
2568
2569 for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
2570 if (!populated_zone(zone))
2571 continue;
2572 scan_zone_unevictable_pages(zone);
2573 }
2574 return 1;
2575}
2576
2577
2578static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
2579 read_scan_unevictable_node,
2580 write_scan_unevictable_node);
2581
2582int scan_unevictable_register_node(struct node *node)
2583{
2584 return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
2585}
2586
2587void scan_unevictable_unregister_node(struct node *node)
2588{
2589 sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
2590}
2591
2592#endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d7826af2fb07..c3ccfda23adc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -8,7 +8,7 @@
8 * Copyright (C) 2006 Silicon Graphics, Inc., 8 * Copyright (C) 2006 Silicon Graphics, Inc.,
9 * Christoph Lameter <christoph@lameter.com> 9 * Christoph Lameter <christoph@lameter.com>
10 */ 10 */
11 11#include <linux/fs.h>
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/err.h> 13#include <linux/err.h>
14#include <linux/module.h> 14#include <linux/module.h>
@@ -384,7 +384,7 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z)
384#endif 384#endif
385 385
386#ifdef CONFIG_PROC_FS 386#ifdef CONFIG_PROC_FS
387 387#include <linux/proc_fs.h>
388#include <linux/seq_file.h> 388#include <linux/seq_file.h>
389 389
390static char * const migratetype_names[MIGRATE_TYPES] = { 390static char * const migratetype_names[MIGRATE_TYPES] = {
@@ -581,20 +581,44 @@ static int pagetypeinfo_show(struct seq_file *m, void *arg)
581 return 0; 581 return 0;
582} 582}
583 583
584const struct seq_operations fragmentation_op = { 584static const struct seq_operations fragmentation_op = {
585 .start = frag_start, 585 .start = frag_start,
586 .next = frag_next, 586 .next = frag_next,
587 .stop = frag_stop, 587 .stop = frag_stop,
588 .show = frag_show, 588 .show = frag_show,
589}; 589};
590 590
591const struct seq_operations pagetypeinfo_op = { 591static int fragmentation_open(struct inode *inode, struct file *file)
592{
593 return seq_open(file, &fragmentation_op);
594}
595
596static const struct file_operations fragmentation_file_operations = {
597 .open = fragmentation_open,
598 .read = seq_read,
599 .llseek = seq_lseek,
600 .release = seq_release,
601};
602
603static const struct seq_operations pagetypeinfo_op = {
592 .start = frag_start, 604 .start = frag_start,
593 .next = frag_next, 605 .next = frag_next,
594 .stop = frag_stop, 606 .stop = frag_stop,
595 .show = pagetypeinfo_show, 607 .show = pagetypeinfo_show,
596}; 608};
597 609
610static int pagetypeinfo_open(struct inode *inode, struct file *file)
611{
612 return seq_open(file, &pagetypeinfo_op);
613}
614
615static const struct file_operations pagetypeinfo_file_ops = {
616 .open = pagetypeinfo_open,
617 .read = seq_read,
618 .llseek = seq_lseek,
619 .release = seq_release,
620};
621
598#ifdef CONFIG_ZONE_DMA 622#ifdef CONFIG_ZONE_DMA
599#define TEXT_FOR_DMA(xx) xx "_dma", 623#define TEXT_FOR_DMA(xx) xx "_dma",
600#else 624#else
@@ -619,8 +643,14 @@ const struct seq_operations pagetypeinfo_op = {
619static const char * const vmstat_text[] = { 643static const char * const vmstat_text[] = {
620 /* Zoned VM counters */ 644 /* Zoned VM counters */
621 "nr_free_pages", 645 "nr_free_pages",
622 "nr_inactive", 646 "nr_inactive_anon",
623 "nr_active", 647 "nr_active_anon",
648 "nr_inactive_file",
649 "nr_active_file",
650#ifdef CONFIG_UNEVICTABLE_LRU
651 "nr_unevictable",
652 "nr_mlock",
653#endif
624 "nr_anon_pages", 654 "nr_anon_pages",
625 "nr_mapped", 655 "nr_mapped",
626 "nr_file_pages", 656 "nr_file_pages",
@@ -675,6 +705,16 @@ static const char * const vmstat_text[] = {
675 "htlb_buddy_alloc_success", 705 "htlb_buddy_alloc_success",
676 "htlb_buddy_alloc_fail", 706 "htlb_buddy_alloc_fail",
677#endif 707#endif
708#ifdef CONFIG_UNEVICTABLE_LRU
709 "unevictable_pgs_culled",
710 "unevictable_pgs_scanned",
711 "unevictable_pgs_rescued",
712 "unevictable_pgs_mlocked",
713 "unevictable_pgs_munlocked",
714 "unevictable_pgs_cleared",
715 "unevictable_pgs_stranded",
716 "unevictable_pgs_mlockfreed",
717#endif
678#endif 718#endif
679}; 719};
680 720
@@ -688,7 +728,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
688 "\n min %lu" 728 "\n min %lu"
689 "\n low %lu" 729 "\n low %lu"
690 "\n high %lu" 730 "\n high %lu"
691 "\n scanned %lu (a: %lu i: %lu)" 731 "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)"
692 "\n spanned %lu" 732 "\n spanned %lu"
693 "\n present %lu", 733 "\n present %lu",
694 zone_page_state(zone, NR_FREE_PAGES), 734 zone_page_state(zone, NR_FREE_PAGES),
@@ -696,7 +736,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
696 zone->pages_low, 736 zone->pages_low,
697 zone->pages_high, 737 zone->pages_high,
698 zone->pages_scanned, 738 zone->pages_scanned,
699 zone->nr_scan_active, zone->nr_scan_inactive, 739 zone->lru[LRU_ACTIVE_ANON].nr_scan,
740 zone->lru[LRU_INACTIVE_ANON].nr_scan,
741 zone->lru[LRU_ACTIVE_FILE].nr_scan,
742 zone->lru[LRU_INACTIVE_FILE].nr_scan,
700 zone->spanned_pages, 743 zone->spanned_pages,
701 zone->present_pages); 744 zone->present_pages);
702 745
@@ -733,10 +776,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
733 seq_printf(m, 776 seq_printf(m,
734 "\n all_unreclaimable: %u" 777 "\n all_unreclaimable: %u"
735 "\n prev_priority: %i" 778 "\n prev_priority: %i"
736 "\n start_pfn: %lu", 779 "\n start_pfn: %lu"
780 "\n inactive_ratio: %u",
737 zone_is_all_unreclaimable(zone), 781 zone_is_all_unreclaimable(zone),
738 zone->prev_priority, 782 zone->prev_priority,
739 zone->zone_start_pfn); 783 zone->zone_start_pfn,
784 zone->inactive_ratio);
740 seq_putc(m, '\n'); 785 seq_putc(m, '\n');
741} 786}
742 787
@@ -750,7 +795,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg)
750 return 0; 795 return 0;
751} 796}
752 797
753const struct seq_operations zoneinfo_op = { 798static const struct seq_operations zoneinfo_op = {
754 .start = frag_start, /* iterate over all zones. The same as in 799 .start = frag_start, /* iterate over all zones. The same as in
755 * fragmentation. */ 800 * fragmentation. */
756 .next = frag_next, 801 .next = frag_next,
@@ -758,6 +803,18 @@ const struct seq_operations zoneinfo_op = {
758 .show = zoneinfo_show, 803 .show = zoneinfo_show,
759}; 804};
760 805
806static int zoneinfo_open(struct inode *inode, struct file *file)
807{
808 return seq_open(file, &zoneinfo_op);
809}
810
811static const struct file_operations proc_zoneinfo_file_operations = {
812 .open = zoneinfo_open,
813 .read = seq_read,
814 .llseek = seq_lseek,
815 .release = seq_release,
816};
817
761static void *vmstat_start(struct seq_file *m, loff_t *pos) 818static void *vmstat_start(struct seq_file *m, loff_t *pos)
762{ 819{
763 unsigned long *v; 820 unsigned long *v;
@@ -813,13 +870,24 @@ static void vmstat_stop(struct seq_file *m, void *arg)
813 m->private = NULL; 870 m->private = NULL;
814} 871}
815 872
816const struct seq_operations vmstat_op = { 873static const struct seq_operations vmstat_op = {
817 .start = vmstat_start, 874 .start = vmstat_start,
818 .next = vmstat_next, 875 .next = vmstat_next,
819 .stop = vmstat_stop, 876 .stop = vmstat_stop,
820 .show = vmstat_show, 877 .show = vmstat_show,
821}; 878};
822 879
880static int vmstat_open(struct inode *inode, struct file *file)
881{
882 return seq_open(file, &vmstat_op);
883}
884
885static const struct file_operations proc_vmstat_file_operations = {
886 .open = vmstat_open,
887 .read = seq_read,
888 .llseek = seq_lseek,
889 .release = seq_release,
890};
823#endif /* CONFIG_PROC_FS */ 891#endif /* CONFIG_PROC_FS */
824 892
825#ifdef CONFIG_SMP 893#ifdef CONFIG_SMP
@@ -877,9 +945,11 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
877 945
878static struct notifier_block __cpuinitdata vmstat_notifier = 946static struct notifier_block __cpuinitdata vmstat_notifier =
879 { &vmstat_cpuup_callback, NULL, 0 }; 947 { &vmstat_cpuup_callback, NULL, 0 };
948#endif
880 949
881static int __init setup_vmstat(void) 950static int __init setup_vmstat(void)
882{ 951{
952#ifdef CONFIG_SMP
883 int cpu; 953 int cpu;
884 954
885 refresh_zone_stat_thresholds(); 955 refresh_zone_stat_thresholds();
@@ -887,7 +957,13 @@ static int __init setup_vmstat(void)
887 957
888 for_each_online_cpu(cpu) 958 for_each_online_cpu(cpu)
889 start_cpu_timer(cpu); 959 start_cpu_timer(cpu);
960#endif
961#ifdef CONFIG_PROC_FS
962 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
963 proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
964 proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
965 proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
966#endif
890 return 0; 967 return 0;
891} 968}
892module_init(setup_vmstat) 969module_init(setup_vmstat)
893#endif
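The vmstat.c changes stop exporting the seq_operations and instead wire each one up through a private file_operations registered with proc_create(). A hypothetical minimal module (not part of the patch; the proc file name "demo_seq" and its records are invented) that follows the same pattern:

/*
 * Hypothetical module mirroring the seq_open()/proc_create() pattern
 * used for buddyinfo, pagetypeinfo, vmstat and zoneinfo above.
 */
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static const char *const demo_items[] = { "alpha", "beta", "gamma" };

static void *demo_start(struct seq_file *m, loff_t *pos)
{
	return *pos < (loff_t)ARRAY_SIZE(demo_items) ? pos : NULL;
}

static void *demo_next(struct seq_file *m, void *v, loff_t *pos)
{
	++*pos;
	return demo_start(m, pos);
}

static void demo_stop(struct seq_file *m, void *v)
{
}

static int demo_show(struct seq_file *m, void *v)
{
	seq_printf(m, "%s\n", demo_items[*(loff_t *)v]);
	return 0;
}

static const struct seq_operations demo_op = {
	.start	= demo_start,
	.next	= demo_next,
	.stop	= demo_stop,
	.show	= demo_show,
};

static int demo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &demo_op);
}

static const struct file_operations demo_file_operations = {
	.open		= demo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init demo_init(void)
{
	proc_create("demo_seq", S_IRUGO, NULL, &demo_file_operations);
	return 0;
}

static void __exit demo_exit(void)
{
	remove_proc_entry("demo_seq", NULL);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

After loading, reading /proc/demo_seq walks the records through the same start/next/stop/show cycle that /proc/vmstat and /proc/zoneinfo now use.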