path: root/mm
author		James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
committer	James Morris <jmorris@namei.org>	2009-02-05 19:01:45 -0500
commit		cb5629b10d64a8006622ce3a52bc887d91057d69 (patch)
tree		7c06d8f30783115e3384721046258ce615b129c5 /mm
parent		8920d5ad6ba74ae8ab020e90cc4d976980e68701 (diff)
parent		f01d1d546abb2f4028b5299092f529eefb01253a (diff)
Merge branch 'master' into next
Conflicts:
	fs/namei.c

Manually merged per:

diff --cc fs/namei.c
index 734f2b5,bbc15c2..0000000
--- a/fs/namei.c
+++ b/fs/namei.c
@@@ -860,9 -848,8 +849,10 @@@ static int __link_path_walk(const char
 		nd->flags |= LOOKUP_CONTINUE;
 		err = exec_permission_lite(inode);
 		if (err == -EAGAIN)
-			err = vfs_permission(nd, MAY_EXEC);
+			err = inode_permission(nd->path.dentry->d_inode,
+					       MAY_EXEC);
+		if (!err)
+			err = ima_path_check(&nd->path, MAY_EXEC);
 		if (err)
 			break;
@@@ -1525,14 -1506,9 +1509,14 @@@ int may_open(struct path *path, int acc
 			flag &= ~O_TRUNC;
 	}

-	error = vfs_permission(nd, acc_mode);
+	error = inode_permission(inode, acc_mode);
 	if (error)
 		return error;
+
-	error = ima_path_check(&nd->path,
++	error = ima_path_check(path,
+			acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+	if (error)
+		return error;
 	/*
 	 * An append-only file must be opened in append mode for writing.
 	 */

Signed-off-by: James Morris <jmorris@namei.org>
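For readers tracing the conflict resolution above: the merged result runs the ordinary inode permission check first and only then the IMA path check, with acc_mode masked down to MAY_READ | MAY_WRITE | MAY_EXEC, returning on the first error. Below is a minimal standalone C sketch of that ordering; the stub functions and flag values are illustrative stand-ins (not the kernel's real inode_permission()/ima_path_check() implementations), and only the call order and the mask are taken from the hunk above.

#include <stdio.h>

/* Illustrative permission bits; the kernel's MAY_* values differ in detail. */
#define MAY_EXEC  0x01
#define MAY_WRITE 0x02
#define MAY_READ  0x04

/* Hypothetical stub standing in for inode_permission(inode, acc_mode). */
static int inode_permission_stub(int acc_mode)
{
	(void)acc_mode;
	return 0;	/* pretend the filesystem/DAC permission check passed */
}

/* Hypothetical stub standing in for ima_path_check(path, mask). */
static int ima_path_check_stub(int mask)
{
	(void)mask;
	return 0;	/* pretend the IMA measurement check passed */
}

/*
 * Mirrors the ordering the merge resolution produces in may_open():
 * the ordinary permission check runs first, then the IMA check with
 * acc_mode masked to MAY_READ | MAY_WRITE | MAY_EXEC; the first error wins.
 */
static int may_open_sketch(int acc_mode)
{
	int error;

	error = inode_permission_stub(acc_mode);
	if (error)
		return error;

	error = ima_path_check_stub(acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
	if (error)
		return error;

	return 0;
}

int main(void)
{
	printf("may_open_sketch(MAY_READ|MAY_WRITE) = %d\n",
	       may_open_sketch(MAY_READ | MAY_WRITE));
	return 0;
}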
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig          |    6
-rw-r--r--  mm/Makefile         |    4
-rw-r--r--  mm/backing-dev.c    |    8
-rw-r--r--  mm/bootmem.c        |    8
-rw-r--r--  mm/fadvise.c        |   18
-rw-r--r--  mm/filemap.c        |   56
-rw-r--r--  mm/filemap_xip.c    |    2
-rw-r--r--  mm/fremap.c         |    6
-rw-r--r--  mm/hugetlb.c        |   46
-rw-r--r--  mm/internal.h       |    2
-rw-r--r--  mm/madvise.c        |    2
-rw-r--r--  mm/memcontrol.c     | 1898
-rw-r--r--  mm/memory.c         |  234
-rw-r--r--  mm/memory_hotplug.c |   20
-rw-r--r--  mm/mempolicy.c      |   24
-rw-r--r--  mm/migrate.c        |  139
-rw-r--r--  mm/mincore.c        |    4
-rw-r--r--  mm/mlock.c          |   64
-rw-r--r--  mm/mmap.c           |  117
-rw-r--r--  mm/mprotect.c       |   12
-rw-r--r--  mm/mremap.c         |    8
-rw-r--r--  mm/msync.c          |    4
-rw-r--r--  mm/nommu.c          | 1054
-rw-r--r--  mm/oom_kill.c       |  119
-rw-r--r--  mm/page-writeback.c |  254
-rw-r--r--  mm/page_alloc.c     |  143
-rw-r--r--  mm/page_cgroup.c    |  209
-rw-r--r--  mm/page_io.c        |    6
-rw-r--r--  mm/pdflush.c        |   16
-rw-r--r--  mm/rmap.c           |   60
-rw-r--r--  mm/shmem.c          |  104
-rw-r--r--  mm/slab.c           |    2
-rw-r--r--  mm/slub.c           |   24
-rw-r--r--  mm/swap.c           |   77
-rw-r--r--  mm/swap_state.c     |   35
-rw-r--r--  mm/swapfile.c       |  607
-rw-r--r--  mm/tiny-shmem.c     |  134
-rw-r--r--  mm/vmalloc.c        |   57
-rw-r--r--  mm/vmscan.c         |  328
-rw-r--r--  mm/vmstat.c         |    4
40 files changed, 4061 insertions, 1854 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a816..a5b77811fdf2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.

-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT

diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7c..72255be57f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \

 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)

@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e6..8e8587444132 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;

 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);

@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = PROP_FRAC_BASE;

 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
 			goto err;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142a..51a0ccf61e0e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;

+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;

-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;

diff --git a/mm/fadvise.c b/mm/fadvise.c
index a1da969bd980..54a0f8040afa 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -24,7 +24,7 @@
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
+SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice)
 {
 	struct file *file = fget(fd);
 	struct address_space *mapping;
@@ -126,12 +126,26 @@ out:
 	fput(file);
 	return ret;
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice)
+{
+	return SYSC_fadvise64_64((int) fd, offset, len, (int) advice);
+}
+SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64);
+#endif

 #ifdef __ARCH_WANT_SYS_FADVISE64

-asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice)
+SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice)
 {
 	return sys_fadvise64_64(fd, offset, len, advice);
 }
+#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
+asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice)
+{
+	return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice);
+}
+SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64);
+#endif

 #endif
diff --git a/mm/filemap.c b/mm/filemap.c
index f3e5f8944d17..23acefe51808 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
210 int ret; 210 int ret;
211 struct writeback_control wbc = { 211 struct writeback_control wbc = {
212 .sync_mode = sync_mode, 212 .sync_mode = sync_mode,
213 .nr_to_write = mapping->nrpages * 2, 213 .nr_to_write = LONG_MAX,
214 .range_start = start, 214 .range_start = start,
215 .range_end = end, 215 .range_end = end,
216 }; 216 };
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
460 VM_BUG_ON(!PageLocked(page)); 460 VM_BUG_ON(!PageLocked(page));
461 461
462 error = mem_cgroup_cache_charge(page, current->mm, 462 error = mem_cgroup_cache_charge(page, current->mm,
463 gfp_mask & ~__GFP_HIGHMEM); 463 gfp_mask & GFP_RECLAIM_MASK);
464 if (error) 464 if (error)
465 goto out; 465 goto out;
466 466
@@ -741,7 +741,14 @@ repeat:
741 page = __page_cache_alloc(gfp_mask); 741 page = __page_cache_alloc(gfp_mask);
742 if (!page) 742 if (!page)
743 return NULL; 743 return NULL;
744 err = add_to_page_cache_lru(page, mapping, index, gfp_mask); 744 /*
745 * We want a regular kernel memory (not highmem or DMA etc)
746 * allocation for the radix tree nodes, but we need to honour
747 * the context-specific requirements the caller has asked for.
748 * GFP_RECLAIM_MASK collects those requirements.
749 */
750 err = add_to_page_cache_lru(page, mapping, index,
751 (gfp_mask & GFP_RECLAIM_MASK));
745 if (unlikely(err)) { 752 if (unlikely(err)) {
746 page_cache_release(page); 753 page_cache_release(page);
747 page = NULL; 754 page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
950 return NULL; 957 return NULL;
951 } 958 }
952 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); 959 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
953 if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { 960 if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
954 page_cache_release(page); 961 page_cache_release(page);
955 page = NULL; 962 page = NULL;
956 } 963 }
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1317 goto out; /* skip atime */ 1324 goto out; /* skip atime */
1318 size = i_size_read(inode); 1325 size = i_size_read(inode);
1319 if (pos < size) { 1326 if (pos < size) {
1320 retval = filemap_write_and_wait(mapping); 1327 retval = filemap_write_and_wait_range(mapping, pos,
1328 pos + iov_length(iov, nr_segs) - 1);
1321 if (!retval) { 1329 if (!retval) {
1322 retval = mapping->a_ops->direct_IO(READ, iocb, 1330 retval = mapping->a_ops->direct_IO(READ, iocb,
1323 iov, pos, nr_segs); 1331 iov, pos, nr_segs);
@@ -1366,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp,
1366 return 0; 1374 return 0;
1367} 1375}
1368 1376
1369asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) 1377SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count)
1370{ 1378{
1371 ssize_t ret; 1379 ssize_t ret;
1372 struct file *file; 1380 struct file *file;
@@ -1385,6 +1393,13 @@ asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1385 } 1393 }
1386 return ret; 1394 return ret;
1387} 1395}
1396#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
1397asmlinkage long SyS_readahead(long fd, loff_t offset, long count)
1398{
1399 return SYSC_readahead((int) fd, offset, (size_t) count);
1400}
1401SYSCALL_ALIAS(sys_readahead, SyS_readahead);
1402#endif
1388 1403
1389#ifdef CONFIG_MMU 1404#ifdef CONFIG_MMU
1390/** 1405/**
@@ -1530,7 +1545,6 @@ retry_find:
1530 /* 1545 /*
1531 * Found the page and have a reference on it. 1546 * Found the page and have a reference on it.
1532 */ 1547 */
1533 mark_page_accessed(page);
1534 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT; 1548 ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
1535 vmf->page = page; 1549 vmf->page = page;
1536 return ret | VM_FAULT_LOCKED; 1550 return ret | VM_FAULT_LOCKED;
@@ -1766,7 +1780,7 @@ int should_remove_suid(struct dentry *dentry)
1766 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP))) 1780 if (unlikely((mode & S_ISGID) && (mode & S_IXGRP)))
1767 kill |= ATTR_KILL_SGID; 1781 kill |= ATTR_KILL_SGID;
1768 1782
1769 if (unlikely(kill && !capable(CAP_FSETID))) 1783 if (unlikely(kill && !capable(CAP_FSETID) && S_ISREG(mode)))
1770 return kill; 1784 return kill;
1771 1785
1772 return 0; 1786 return 0;
@@ -2060,18 +2074,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
2060 if (count != ocount) 2074 if (count != ocount)
2061 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count); 2075 *nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
2062 2076
2063 /*
2064 * Unmap all mmappings of the file up-front.
2065 *
2066 * This will cause any pte dirty bits to be propagated into the
2067 * pageframes for the subsequent filemap_write_and_wait().
2068 */
2069 write_len = iov_length(iov, *nr_segs); 2077 write_len = iov_length(iov, *nr_segs);
2070 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT; 2078 end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
2071 if (mapping_mapped(mapping))
2072 unmap_mapping_range(mapping, pos, write_len, 0);
2073 2079
2074 written = filemap_write_and_wait(mapping); 2080 written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
2075 if (written) 2081 if (written)
2076 goto out; 2082 goto out;
2077 2083
@@ -2140,19 +2146,24 @@ EXPORT_SYMBOL(generic_file_direct_write);
2140 * Find or create a page at the given pagecache position. Return the locked 2146 * Find or create a page at the given pagecache position. Return the locked
2141 * page. This function is specifically for buffered writes. 2147 * page. This function is specifically for buffered writes.
2142 */ 2148 */
2143struct page *__grab_cache_page(struct address_space *mapping, pgoff_t index) 2149struct page *grab_cache_page_write_begin(struct address_space *mapping,
2150 pgoff_t index, unsigned flags)
2144{ 2151{
2145 int status; 2152 int status;
2146 struct page *page; 2153 struct page *page;
2154 gfp_t gfp_notmask = 0;
2155 if (flags & AOP_FLAG_NOFS)
2156 gfp_notmask = __GFP_FS;
2147repeat: 2157repeat:
2148 page = find_lock_page(mapping, index); 2158 page = find_lock_page(mapping, index);
2149 if (likely(page)) 2159 if (likely(page))
2150 return page; 2160 return page;
2151 2161
2152 page = page_cache_alloc(mapping); 2162 page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask);
2153 if (!page) 2163 if (!page)
2154 return NULL; 2164 return NULL;
2155 status = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); 2165 status = add_to_page_cache_lru(page, mapping, index,
2166 GFP_KERNEL & ~gfp_notmask);
2156 if (unlikely(status)) { 2167 if (unlikely(status)) {
2157 page_cache_release(page); 2168 page_cache_release(page);
2158 if (status == -EEXIST) 2169 if (status == -EEXIST)
@@ -2161,7 +2172,7 @@ repeat:
2161 } 2172 }
2162 return page; 2173 return page;
2163} 2174}
2164EXPORT_SYMBOL(__grab_cache_page); 2175EXPORT_SYMBOL(grab_cache_page_write_begin);
2165 2176
2166static ssize_t generic_perform_write(struct file *file, 2177static ssize_t generic_perform_write(struct file *file,
2167 struct iov_iter *i, loff_t pos) 2178 struct iov_iter *i, loff_t pos)
@@ -2286,7 +2297,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
2286 * the file data here, to try to honour O_DIRECT expectations. 2297 * the file data here, to try to honour O_DIRECT expectations.
2287 */ 2298 */
2288 if (unlikely(file->f_flags & O_DIRECT) && written) 2299 if (unlikely(file->f_flags & O_DIRECT) && written)
2289 status = filemap_write_and_wait(mapping); 2300 status = filemap_write_and_wait_range(mapping,
2301 pos, pos + written - 1);
2290 2302
2291 return written ? written : status; 2303 return written ? written : status;
2292} 2304}
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7b..736ba7f3306a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
 			dec_mm_counter(mm, file_rss);
@@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma,
  * and the vma's default protection is used. Arbitrary protections
  * might be implemented in the future.
  */
-asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
-	unsigned long prot, unsigned long pgoff, unsigned long flags)
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+	unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
 {
 	struct mm_struct *mm = current->mm;
 	struct address_space *mapping;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb89..618e98304080 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }

 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;

-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}

 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);

-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}

 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }

-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 			 * puts them into the mem_map).
 			 */
 			m = addr;
-			if (m)
-				goto found;
+			goto found;
 		}
 		hstate_next_node(h);
 		nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb68..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);

 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE 0x1
 #define GUP_FLAGS_FORCE 0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL 0x8

 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/madvise.c b/mm/madvise.c
index f9349c18a1b5..b9ce574827c8 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev,
  * -EBADF - map exists, but area maps something that isn't a file.
  * -EAGAIN - a kernel resource was temporarily unavailable.
  */
-asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior)
+SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior)
 {
 	unsigned long end, tmp;
 	struct vm_area_struct * vma, *prev;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0c..8e4be9cb2a6a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
21#include <linux/memcontrol.h> 21#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 22#include <linux/cgroup.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/pagemap.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/page-flags.h> 26#include <linux/page-flags.h>
26#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
27#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/mutex.h>
29#include <linux/slab.h> 31#include <linux/slab.h>
30#include <linux/swap.h> 32#include <linux/swap.h>
31#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
34#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
35#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h> 38#include <linux/page_cgroup.h>
39#include "internal.h"
37 40
38#include <asm/uaccess.h> 41#include <asm/uaccess.h>
39 42
40struct cgroup_subsys mem_cgroup_subsys __read_mostly; 43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
41#define MEM_CGROUP_RECLAIM_RETRIES 5 44#define MEM_CGROUP_RECLAIM_RETRIES 5
42 45
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* for remember boot option*/
50#else
51#define do_swap_account (0)
52#endif
53
54static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
55
43/* 56/*
44 * Statistics for memory cgroup. 57 * Statistics for memory cgroup.
45 */ 58 */
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
60} ____cacheline_aligned_in_smp; 73} ____cacheline_aligned_in_smp;
61 74
62struct mem_cgroup_stat { 75struct mem_cgroup_stat {
63 struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; 76 struct mem_cgroup_stat_cpu cpustat[0];
64}; 77};
65 78
66/* 79/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
89 /* 102 /*
90 * spin_lock to protect the per cgroup LRU 103 * spin_lock to protect the per cgroup LRU
91 */ 104 */
92 spinlock_t lru_lock;
93 struct list_head lists[NR_LRU_LISTS]; 105 struct list_head lists[NR_LRU_LISTS];
94 unsigned long count[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS];
107
108 struct zone_reclaim_stat reclaim_stat;
95}; 109};
96/* Macro for accessing counter */ 110/* Macro for accessing counter */
97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 111#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,74 @@ struct mem_cgroup {
122 */ 136 */
123 struct res_counter res; 137 struct res_counter res;
124 /* 138 /*
139 * the counter to account for mem+swap usage.
140 */
141 struct res_counter memsw;
142 /*
125 * Per cgroup active and inactive list, similar to the 143 * Per cgroup active and inactive list, similar to the
126 * per zone LRU lists. 144 * per zone LRU lists.
127 */ 145 */
128 struct mem_cgroup_lru_info info; 146 struct mem_cgroup_lru_info info;
129 147
148 /*
149 protect against reclaim related member.
150 */
151 spinlock_t reclaim_param_lock;
152
130 int prev_priority; /* for recording reclaim priority */ 153 int prev_priority; /* for recording reclaim priority */
154
155 /*
156 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex
158 */
159 struct mem_cgroup *last_scanned_child;
131 /* 160 /*
132 * statistics. 161 * Should the accounting and control be hierarchical, per subtree?
162 */
163 bool use_hierarchy;
164 unsigned long last_oom_jiffies;
165 atomic_t refcnt;
166
167 unsigned int swappiness;
168
169 /*
170 * statistics. This must be placed at the end of memcg.
133 */ 171 */
134 struct mem_cgroup_stat stat; 172 struct mem_cgroup_stat stat;
135}; 173};
136static struct mem_cgroup init_mem_cgroup;
137 174
138enum charge_type { 175enum charge_type {
139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 176 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
140 MEM_CGROUP_CHARGE_TYPE_MAPPED, 177 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 178 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
180 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
143 NR_CHARGE_TYPE, 181 NR_CHARGE_TYPE,
144}; 182};
145 183
146/* only for here (for easy reading.) */ 184/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE) 185#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED) 186#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK) 187#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long 188static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = { 189pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ 190 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ 191 PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 192 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */ 193 0, /* FORCE */
158}; 194};
159 195
160/* 196/* for encoding cft->private value on file */
161 * Always modified under lru lock. Then, not necessary to preempt_disable() 197#define _MEM (0)
162 */ 198#define _MEMSWAP (1)
199#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
200#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
201#define MEMFILE_ATTR(val) ((val) & 0xffff)
202
203static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem);
205static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
206
163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 207static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
164 struct page_cgroup *pc, 208 struct page_cgroup *pc,
165 bool charge) 209 bool charge)
@@ -167,10 +211,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
167 int val = (charge)? 1 : -1; 211 int val = (charge)? 1 : -1;
168 struct mem_cgroup_stat *stat = &mem->stat; 212 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat; 213 struct mem_cgroup_stat_cpu *cpustat;
214 int cpu = get_cpu();
170 215
171 VM_BUG_ON(!irqs_disabled()); 216 cpustat = &stat->cpustat[cpu];
172
173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc)) 217 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 218 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
176 else 219 else
@@ -182,6 +225,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
182 else 225 else
183 __mem_cgroup_stat_add_safe(cpustat, 226 __mem_cgroup_stat_add_safe(cpustat,
184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 227 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
228 put_cpu();
185} 229}
186 230
187static struct mem_cgroup_per_zone * 231static struct mem_cgroup_per_zone *
@@ -197,6 +241,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
197 int nid = page_cgroup_nid(pc); 241 int nid = page_cgroup_nid(pc);
198 int zid = page_cgroup_zid(pc); 242 int zid = page_cgroup_zid(pc);
199 243
244 if (!mem)
245 return NULL;
246
200 return mem_cgroup_zoneinfo(mem, nid, zid); 247 return mem_cgroup_zoneinfo(mem, nid, zid);
201} 248}
202 249
@@ -236,118 +283,169 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
236 struct mem_cgroup, css); 283 struct mem_cgroup, css);
237} 284}
238 285
239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
240 struct page_cgroup *pc)
241{ 287{
242 int lru = LRU_BASE; 288 struct mem_cgroup *mem = NULL;
243 289 /*
244 if (PageCgroupUnevictable(pc)) 290 * Because we have no locks, mm->owner's may be being moved to other
245 lru = LRU_UNEVICTABLE; 291 * cgroup. We use css_tryget() here even if this looks
246 else { 292 * pessimistic (rather than adding locks here).
247 if (PageCgroupActive(pc)) 293 */
248 lru += LRU_ACTIVE; 294 rcu_read_lock();
249 if (PageCgroupFile(pc)) 295 do {
250 lru += LRU_FILE; 296 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
251 } 297 if (unlikely(!mem))
252 298 break;
253 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 299 } while (!css_tryget(&mem->css));
254 300 rcu_read_unlock();
255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); 301 return mem;
256 list_del(&pc->lru);
257} 302}
258 303
259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 304static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
260 struct page_cgroup *pc)
261{ 305{
262 int lru = LRU_BASE; 306 if (!mem)
307 return true;
308 return css_is_removed(&mem->css);
309}
263 310
264 if (PageCgroupUnevictable(pc)) 311/*
265 lru = LRU_UNEVICTABLE; 312 * Following LRU functions are allowed to be used without PCG_LOCK.
266 else { 313 * Operations are called by routine of global LRU independently from memcg.
267 if (PageCgroupActive(pc)) 314 * What we have to take care of here is validness of pc->mem_cgroup.
268 lru += LRU_ACTIVE; 315 *
269 if (PageCgroupFile(pc)) 316 * Changes to pc->mem_cgroup happens when
270 lru += LRU_FILE; 317 * 1. charge
271 } 318 * 2. moving account
319 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
320 * It is added to LRU before charge.
321 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
322 * When moving account, the page is not on LRU. It's isolated.
323 */
272 324
273 MEM_CGROUP_ZSTAT(mz, lru) += 1; 325void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
274 list_add(&pc->lru, &mz->lists[lru]); 326{
327 struct page_cgroup *pc;
328 struct mem_cgroup *mem;
329 struct mem_cgroup_per_zone *mz;
275 330
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); 331 if (mem_cgroup_disabled())
332 return;
333 pc = lookup_page_cgroup(page);
334 /* can happen while we handle swapcache. */
335 if (list_empty(&pc->lru) || !pc->mem_cgroup)
336 return;
337 /*
338 * We don't check PCG_USED bit. It's cleared when the "page" is finally
339 * removed from global LRU.
340 */
341 mz = page_cgroup_zoneinfo(pc);
342 mem = pc->mem_cgroup;
343 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
344 list_del_init(&pc->lru);
345 return;
277} 346}
278 347
279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) 348void mem_cgroup_del_lru(struct page *page)
280{ 349{
281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 350 mem_cgroup_del_lru_list(page, page_lru(page));
282 int active = PageCgroupActive(pc); 351}
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
287 352
288 if (lru == from) 353void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
354{
355 struct mem_cgroup_per_zone *mz;
356 struct page_cgroup *pc;
357
358 if (mem_cgroup_disabled())
289 return; 359 return;
290 360
291 MEM_CGROUP_ZSTAT(mz, from) -= 1; 361 pc = lookup_page_cgroup(page);
292 /* 362 /*
293 * However this is done under mz->lru_lock, another flags, which 363 * Used bit is set without atomic ops but after smp_wmb().
294 * are not related to LRU, will be modified from out-of-lock. 364 * For making pc->mem_cgroup visible, insert smp_rmb() here.
295 * We have to use atomic set/clear flags.
296 */ 365 */
297 if (is_unevictable_lru(lru)) { 366 smp_rmb();
298 ClearPageCgroupActive(pc); 367 /* unused page is not rotated. */
299 SetPageCgroupUnevictable(pc); 368 if (!PageCgroupUsed(pc))
300 } else { 369 return;
301 if (is_active_lru(lru)) 370 mz = page_cgroup_zoneinfo(pc);
302 SetPageCgroupActive(pc);
303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
306 }
307
308 MEM_CGROUP_ZSTAT(mz, lru) += 1;
309 list_move(&pc->lru, &mz->lists[lru]); 371 list_move(&pc->lru, &mz->lists[lru]);
310} 372}
311 373
312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 374void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
313{ 375{
314 int ret; 376 struct page_cgroup *pc;
377 struct mem_cgroup_per_zone *mz;
315 378
316 task_lock(task); 379 if (mem_cgroup_disabled())
317 ret = task->mm && mm_match_cgroup(task->mm, mem); 380 return;
318 task_unlock(task); 381 pc = lookup_page_cgroup(page);
319 return ret; 382 /*
383 * Used bit is set without atomic ops but after smp_wmb().
384 * For making pc->mem_cgroup visible, insert smp_rmb() here.
385 */
386 smp_rmb();
387 if (!PageCgroupUsed(pc))
388 return;
389
390 mz = page_cgroup_zoneinfo(pc);
391 MEM_CGROUP_ZSTAT(mz, lru) += 1;
392 list_add(&pc->lru, &mz->lists[lru]);
320} 393}
321 394
322/* 395/*
323 * This routine assumes that the appropriate zone's lru lock is already held 396 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
397 * lru because the page may.be reused after it's fully uncharged (because of
398 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
399 * it again. This function is only used to charge SwapCache. It's done under
400 * lock_page and expected that zone->lru_lock is never held.
324 */ 401 */
325void mem_cgroup_move_lists(struct page *page, enum lru_list lru) 402static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
326{ 403{
327 struct page_cgroup *pc;
328 struct mem_cgroup_per_zone *mz;
329 unsigned long flags; 404 unsigned long flags;
405 struct zone *zone = page_zone(page);
406 struct page_cgroup *pc = lookup_page_cgroup(page);
330 407
331 if (mem_cgroup_subsys.disabled) 408 spin_lock_irqsave(&zone->lru_lock, flags);
332 return;
333
334 /* 409 /*
335 * We cannot lock_page_cgroup while holding zone's lru_lock, 410 * Forget old LRU when this page_cgroup is *not* used. This Used bit
336 * because other holders of lock_page_cgroup can be interrupted 411 * is guarded by lock_page() because the page is SwapCache.
337 * with an attempt to rotate_reclaimable_page. But we cannot
338 * safely get to page_cgroup without it, so just try_lock it:
339 * mem_cgroup_isolate_pages allows for page left on wrong list.
340 */ 412 */
341 pc = lookup_page_cgroup(page); 413 if (!PageCgroupUsed(pc))
342 if (!trylock_page_cgroup(pc)) 414 mem_cgroup_del_lru_list(page, page_lru(page));
415 spin_unlock_irqrestore(&zone->lru_lock, flags);
416}
417
418static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
419{
420 unsigned long flags;
421 struct zone *zone = page_zone(page);
422 struct page_cgroup *pc = lookup_page_cgroup(page);
423
424 spin_lock_irqsave(&zone->lru_lock, flags);
425 /* link when the page is linked to LRU but page_cgroup isn't */
426 if (PageLRU(page) && list_empty(&pc->lru))
427 mem_cgroup_add_lru_list(page, page_lru(page));
428 spin_unlock_irqrestore(&zone->lru_lock, flags);
429}
430
431
432void mem_cgroup_move_lists(struct page *page,
433 enum lru_list from, enum lru_list to)
434{
435 if (mem_cgroup_disabled())
343 return; 436 return;
344 if (pc && PageCgroupUsed(pc)) { 437 mem_cgroup_del_lru_list(page, from);
345 mz = page_cgroup_zoneinfo(pc); 438 mem_cgroup_add_lru_list(page, to);
346 spin_lock_irqsave(&mz->lru_lock, flags); 439}
347 __mem_cgroup_move_lists(pc, lru); 440
348 spin_unlock_irqrestore(&mz->lru_lock, flags); 441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
349 } 442{
350 unlock_page_cgroup(pc); 443 int ret;
444
445 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem);
447 task_unlock(task);
448 return ret;
351} 449}
352 450
353/* 451/*
@@ -372,39 +470,116 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
372 */ 470 */
373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
374{ 472{
375 return mem->prev_priority; 473 int prev_priority;
474
475 spin_lock(&mem->reclaim_param_lock);
476 prev_priority = mem->prev_priority;
477 spin_unlock(&mem->reclaim_param_lock);
478
479 return prev_priority;
376} 480}
377 481
378void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 482void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
379{ 483{
484 spin_lock(&mem->reclaim_param_lock);
380 if (priority < mem->prev_priority) 485 if (priority < mem->prev_priority)
381 mem->prev_priority = priority; 486 mem->prev_priority = priority;
487 spin_unlock(&mem->reclaim_param_lock);
382} 488}
383 489
384void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 490void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
385{ 491{
492 spin_lock(&mem->reclaim_param_lock);
386 mem->prev_priority = priority; 493 mem->prev_priority = priority;
494 spin_unlock(&mem->reclaim_param_lock);
387} 495}
388 496
389/* 497static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
390 * Calculate # of pages to be scanned in this priority/zone. 498{
391 * See also vmscan.c 499 unsigned long active;
392 * 500 unsigned long inactive;
393 * priority starts from "DEF_PRIORITY" and decremented in each loop. 501 unsigned long gb;
394 * (see include/linux/mmzone.h) 502 unsigned long inactive_ratio;
395 */ 503
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
506
507 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb)
509 inactive_ratio = int_sqrt(10 * gb);
510 else
511 inactive_ratio = 1;
512
513 if (present_pages) {
514 present_pages[0] = inactive;
515 present_pages[1] = active;
516 }
517
518 return inactive_ratio;
519}
396 520
397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, 521int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
398 int priority, enum lru_list lru) 522{
523 unsigned long active;
524 unsigned long inactive;
525 unsigned long present_pages[2];
526 unsigned long inactive_ratio;
527
528 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
529
530 inactive = present_pages[0];
531 active = present_pages[1];
532
533 if (inactive * inactive_ratio < active)
534 return 1;
535
536 return 0;
537}
538
539unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
540 struct zone *zone,
541 enum lru_list lru)
399{ 542{
400 long nr_pages;
401 int nid = zone->zone_pgdat->node_id; 543 int nid = zone->zone_pgdat->node_id;
402 int zid = zone_idx(zone); 544 int zid = zone_idx(zone);
403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 545 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
404 546
405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru); 547 return MEM_CGROUP_ZSTAT(mz, lru);
548}
549
550struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
551 struct zone *zone)
552{
553 int nid = zone->zone_pgdat->node_id;
554 int zid = zone_idx(zone);
555 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
406 556
407 return (nr_pages >> priority); 557 return &mz->reclaim_stat;
558}
559
560struct zone_reclaim_stat *
561mem_cgroup_get_reclaim_stat_from_page(struct page *page)
562{
563 struct page_cgroup *pc;
564 struct mem_cgroup_per_zone *mz;
565
566 if (mem_cgroup_disabled())
567 return NULL;
568
569 pc = lookup_page_cgroup(page);
570 /*
571 * Used bit is set without atomic ops but after smp_wmb().
572 * For making pc->mem_cgroup visible, insert smp_rmb() here.
573 */
574 smp_rmb();
575 if (!PageCgroupUsed(pc))
576 return NULL;
577
578 mz = page_cgroup_zoneinfo(pc);
579 if (!mz)
580 return NULL;
581
582 return &mz->reclaim_stat;
408} 583}
409 584
410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 585unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -429,94 +604,279 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 604 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
430 src = &mz->lists[lru]; 605 src = &mz->lists[lru];
431 606
432 spin_lock(&mz->lru_lock);
433 scan = 0; 607 scan = 0;
434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 608 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
435 if (scan >= nr_to_scan) 609 if (scan >= nr_to_scan)
436 break; 610 break;
611
612 page = pc->page;
437 if (unlikely(!PageCgroupUsed(pc))) 613 if (unlikely(!PageCgroupUsed(pc)))
438 continue; 614 continue;
439 page = pc->page;
440
441 if (unlikely(!PageLRU(page))) 615 if (unlikely(!PageLRU(page)))
442 continue; 616 continue;
443 617
444 /*
445 * TODO: play better with lumpy reclaim, grabbing anything.
446 */
447 if (PageUnevictable(page) ||
448 (PageActive(page) && !active) ||
449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
451 continue;
452 }
453
454 scan++; 618 scan++;
455 list_move(&pc->lru, &pc_list);
456
457 if (__isolate_lru_page(page, mode, file) == 0) { 619 if (__isolate_lru_page(page, mode, file) == 0) {
458 list_move(&page->lru, dst); 620 list_move(&page->lru, dst);
459 nr_taken++; 621 nr_taken++;
460 } 622 }
461 } 623 }
462 624
463 list_splice(&pc_list, src);
464 spin_unlock(&mz->lru_lock);
465
466 *scanned = scan; 625 *scanned = scan;
467 return nr_taken; 626 return nr_taken;
468} 627}
469 628
629#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member)
631
470/* 632/*
471 * Charge the memory controller for page usage. 633 * This routine finds the DFS walk successor. This routine should be
472 * Return 634 * called with hierarchy_mutex held
473 * 0 if the charge was successful
474 * < 0 if the cgroup is over its limit
475 */ 635 */
476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 636static struct mem_cgroup *
477 gfp_t gfp_mask, enum charge_type ctype, 637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
478 struct mem_cgroup *memcg)
479{ 638{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
640
641 curr_cgroup = curr->css.cgroup;
642 root_cgroup = root_mem->css.cgroup;
643
644 if (!list_empty(&curr_cgroup->children)) {
645 /*
646 * Walk down to children
647 */
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653
654visit_parent:
655 if (curr_cgroup == root_cgroup) {
656 /* caller handles NULL case */
657 curr = NULL;
658 goto done;
659 }
660
661 /*
662 * Goto next sibling
663 */
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670
671 /*
672 * Go up to next parent and next parent's sibling if need be
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676
677done:
678 return curr;
679}
680
681/*
682 * Visit the first child (need not be the first child as per the ordering
683 * of the cgroup list, since we track last_scanned_child) of @mem and use
684 * that to reclaim free pages from.
685 */
686static struct mem_cgroup *
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{
689 struct cgroup *cgroup;
690 struct mem_cgroup *orig, *next;
691 bool obsolete;
692
693 /*
694 * Scan all children under the mem_cgroup mem
695 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
697
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700
701 if (list_empty(&root_mem->css.cgroup->children)) {
702 /*
703 * root_mem might have children before and last_scanned_child
704 * may point to one of them. We put it later.
705 */
706 if (orig)
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done;
710 }
711
712 if (!orig || obsolete) {
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718
719done:
720 if (next)
721 mem_cgroup_get(next);
722 root_mem->last_scanned_child = next;
723 if (orig)
724 mem_cgroup_put(orig);
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
726 return (next) ? next : root_mem;
727}
728
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
730{
731 if (do_swap_account) {
732 if (res_counter_check_under_limit(&mem->res) &&
733 res_counter_check_under_limit(&mem->memsw))
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739}
740
741static unsigned int get_swappiness(struct mem_cgroup *memcg)
742{
743 struct cgroup *cgrp = memcg->css.cgroup;
744 unsigned int swappiness;
745
746 /* root ? */
747 if (cgrp->parent == NULL)
748 return vm_swappiness;
749
750 spin_lock(&memcg->reclaim_param_lock);
751 swappiness = memcg->swappiness;
752 spin_unlock(&memcg->reclaim_param_lock);
753
754 return swappiness;
755}
756
757/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the
759 * last child we reclaimed from, so that we don't end up penalizing
760 * one child extensively based on its position in the children list.
761 *
762 * root_mem is the original ancestor that we've been reclaim from.
763 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap)
766{
767 struct mem_cgroup *next_mem;
768 int ret = 0;
769
770 /*
771 * Reclaim unconditionally and don't check for return value.
772 * We need to reclaim in the current group and down the tree.
773 * One might think about checking for children before reclaiming,
774 * but there might be left over accounting, even after children
775 * have left.
776 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue;
790 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
792 get_swappiness(next_mem));
793 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 }
797 return ret;
798}
799
800bool mem_cgroup_oom_called(struct task_struct *task)
801{
802 bool ret = false;
480 struct mem_cgroup *mem; 803 struct mem_cgroup *mem;
481 struct page_cgroup *pc; 804 struct mm_struct *mm;
482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
485 805
486 pc = lookup_page_cgroup(page); 806 rcu_read_lock();
487 /* can happen at boot */ 807 mm = task->mm;
488 if (unlikely(!pc)) 808 if (!mm)
809 mm = &init_mm;
810 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
811 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
812 ret = true;
813 rcu_read_unlock();
814 return ret;
815}
816/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked.
819 */
820static int __mem_cgroup_try_charge(struct mm_struct *mm,
821 gfp_t gfp_mask, struct mem_cgroup **memcg,
822 bool oom)
823{
824 struct mem_cgroup *mem, *mem_over_limit;
825 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
826 struct res_counter *fail_res;
827
828 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
829 /* Don't account this! */
830 *memcg = NULL;
489 return 0; 831 return 0;
490 prefetchw(pc); 832 }
833
491 /* 834 /*
492 * We always charge the cgroup the mm_struct belongs to. 835 * We always charge the cgroup the mm_struct belongs to.
493 * The mm_struct's mem_cgroup changes on task migration if the 836 * The mm_struct's mem_cgroup changes on task migration if the
494 * thread group leader migrates. It's possible that mm is not 837 * thread group leader migrates. It's possible that mm is not
495 * set, if so charge the init_mm (happens for pagecache usage). 838 * set, if so charge the init_mm (happens for pagecache usage).
496 */ 839 */
497 840 mem = *memcg;
498 if (likely(!memcg)) { 841 if (likely(!mem)) {
499 rcu_read_lock(); 842 mem = try_get_mem_cgroup_from_mm(mm);
500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 843 *memcg = mem;
501 if (unlikely(!mem)) {
502 rcu_read_unlock();
503 return 0;
504 }
505 /*
506 * For every charge from the cgroup, increment reference count
507 */
508 css_get(&mem->css);
509 rcu_read_unlock();
510 } else { 844 } else {
511 mem = memcg; 845 css_get(&mem->css);
512 css_get(&memcg->css);
513 } 846 }
847 if (unlikely(!mem))
848 return 0;
849
850 VM_BUG_ON(mem_cgroup_is_obsolete(mem));
851
852 while (1) {
853 int ret;
854 bool noswap = false;
855
856 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
857 if (likely(!ret)) {
858 if (!do_swap_account)
859 break;
860 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
861 &fail_res);
862 if (likely(!ret))
863 break;
864 /* mem+swap counter fails */
865 res_counter_uncharge(&mem->res, PAGE_SIZE);
866 noswap = true;
867 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
868 memsw);
869 } else
870 /* mem counter fails */
871 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
872 res);
514 873
515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
516 if (!(gfp_mask & __GFP_WAIT)) 874 if (!(gfp_mask & __GFP_WAIT))
517 goto out; 875 goto nomem;
518 876
519 if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) 877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 noswap);
879 if (ret)
520 continue; 880 continue;
521 881
522 /* 882 /*
@@ -525,49 +885,221 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
525 * moved to swap cache or just unmapped from the cgroup. 885 * moved to swap cache or just unmapped from the cgroup.
526 * Check the limit again to see if the reclaim reduced the 886 * Check the limit again to see if the reclaim reduced the
527 * current usage of the cgroup before giving up 887 * current usage of the cgroup before giving up
888 *
528 */ 889 */
529 if (res_counter_check_under_limit(&mem->res)) 890 if (mem_cgroup_check_under_limit(mem_over_limit))
530 continue; 891 continue;
531 892
532 if (!nr_retries--) { 893 if (!nr_retries--) {
533 mem_cgroup_out_of_memory(mem, gfp_mask); 894 if (oom) {
534 goto out; 895 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies;
899 }
900 goto nomem;
535 } 901 }
536 } 902 }
903 return 0;
904nomem:
905 css_put(&mem->css);
906 return -ENOMEM;
907}
908
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{
911 struct mem_cgroup *mem;
912 swp_entry_t ent;
913
914 if (!PageSwapCache(page))
915 return NULL;
537 916
917 ent.val = page_private(page);
918 mem = lookup_swap_cgroup(ent);
919 if (!mem)
920 return NULL;
921 if (!css_tryget(&mem->css))
922 return NULL;
923 return mem;
924}
925
926/*
927 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
928 * USED state. If already USED, uncharge and return.
929 */
930
931static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
932 struct page_cgroup *pc,
933 enum charge_type ctype)
934{
935 /* try_charge() can return NULL to *memcg, taking care of it. */
936 if (!mem)
937 return;
538 938
539 lock_page_cgroup(pc); 939 lock_page_cgroup(pc);
540 if (unlikely(PageCgroupUsed(pc))) { 940 if (unlikely(PageCgroupUsed(pc))) {
541 unlock_page_cgroup(pc); 941 unlock_page_cgroup(pc);
542 res_counter_uncharge(&mem->res, PAGE_SIZE); 942 res_counter_uncharge(&mem->res, PAGE_SIZE);
943 if (do_swap_account)
944 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
543 css_put(&mem->css); 945 css_put(&mem->css);
544 946 return;
545 goto done;
546 } 947 }
547 pc->mem_cgroup = mem; 948 pc->mem_cgroup = mem;
548 /* 949 smp_wmb();
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype]; 950 pc->flags = pcg_default_flags[ctype];
553 951
554 mz = page_cgroup_zoneinfo(pc); 952 mem_cgroup_charge_statistics(mem, pc, true);
555 953
556 spin_lock_irqsave(&mz->lru_lock, flags);
557 __mem_cgroup_add_list(mz, pc);
558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc); 954 unlock_page_cgroup(pc);
955}
560 956
561done: 957/**
562 return 0; 958 * mem_cgroup_move_account - move account of the page
959 * @pc: page_cgroup of the page.
960 * @from: mem_cgroup which the page is moved from.
961 * @to: mem_cgroup which the page is moved to. @from != @to.
962 *
 963 * The caller must confirm the following.
 964 * - page is not on LRU (isolate_page() is useful.)
 965 *
 966 * returns 0 on success,
 967 * returns -EBUSY when the lock is busy or "pc" is unstable.
 968 *
 969 * This function does "uncharge" from the old cgroup but doesn't do "charge"
 970 * to the new cgroup; that is left to the caller.
971 */
972
973static int mem_cgroup_move_account(struct page_cgroup *pc,
974 struct mem_cgroup *from, struct mem_cgroup *to)
975{
976 struct mem_cgroup_per_zone *from_mz, *to_mz;
977 int nid, zid;
978 int ret = -EBUSY;
979
980 VM_BUG_ON(from == to);
981 VM_BUG_ON(PageLRU(pc->page));
982
983 nid = page_cgroup_nid(pc);
984 zid = page_cgroup_zid(pc);
985 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
986 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
987
988 if (!trylock_page_cgroup(pc))
989 return ret;
990
991 if (!PageCgroupUsed(pc))
992 goto out;
993
994 if (pc->mem_cgroup != from)
995 goto out;
996
997 res_counter_uncharge(&from->res, PAGE_SIZE);
998 mem_cgroup_charge_statistics(from, pc, false);
999 if (do_swap_account)
1000 res_counter_uncharge(&from->memsw, PAGE_SIZE);
1001 css_put(&from->css);
1002
1003 css_get(&to->css);
1004 pc->mem_cgroup = to;
1005 mem_cgroup_charge_statistics(to, pc, true);
1006 ret = 0;
563out: 1007out:
564 css_put(&mem->css); 1008 unlock_page_cgroup(pc);
565 return -ENOMEM; 1009 return ret;
1010}
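
mem_cgroup_move_account() never blocks on the page_cgroup lock: it uses a trylock and reports -EBUSY so the caller can retry or give up. Below is a toy pthreads illustration of that trylock-or-EBUSY pattern only; the account structure and names are made up, and unlike the kernel function it also credits the target itself.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

struct account {
        pthread_mutex_t lock;
        unsigned long pages;
};

/*
 * Move one page of accounting from @from to @to, but only if @from's
 * lock can be taken without blocking; otherwise report -EBUSY and let
 * the caller decide whether to retry.
 */
static int move_one_page(struct account *from, struct account *to)
{
        int ret = -EBUSY;

        if (pthread_mutex_trylock(&from->lock))
                return ret;

        if (from->pages) {
                from->pages--;  /* "uncharge" the source */
                to->pages++;    /* "charge" the target (done by the caller in the kernel) */
                ret = 0;
        }
        pthread_mutex_unlock(&from->lock);
        return ret;
}

int main(void)
{
        struct account a = { PTHREAD_MUTEX_INITIALIZER, 4 };
        struct account b = { PTHREAD_MUTEX_INITIALIZER, 0 };
        int ret = move_one_page(&a, &b);

        printf("move: %d (a=%lu b=%lu)\n", ret, a.pages, b.pages);
        return 0;
}
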
1011
1012/*
1013 * move charges to its parent.
1014 */
1015
1016static int mem_cgroup_move_parent(struct page_cgroup *pc,
1017 struct mem_cgroup *child,
1018 gfp_t gfp_mask)
1019{
1020 struct page *page = pc->page;
1021 struct cgroup *cg = child->css.cgroup;
1022 struct cgroup *pcg = cg->parent;
1023 struct mem_cgroup *parent;
1024 int ret;
1025
1026 /* Is ROOT ? */
1027 if (!pcg)
1028 return -EINVAL;
1029
1030
1031 parent = mem_cgroup_from_cont(pcg);
1032
1033
1034 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1035 if (ret || !parent)
1036 return ret;
1037
1038 if (!get_page_unless_zero(page)) {
1039 ret = -EBUSY;
1040 goto uncharge;
1041 }
1042
1043 ret = isolate_lru_page(page);
1044
1045 if (ret)
1046 goto cancel;
1047
1048 ret = mem_cgroup_move_account(pc, child, parent);
1049
1050 putback_lru_page(page);
1051 if (!ret) {
1052 put_page(page);
1053 /* drop extra refcnt by try_charge() */
1054 css_put(&parent->css);
1055 return 0;
1056 }
1057
1058cancel:
1059 put_page(page);
1060uncharge:
1061 /* drop extra refcnt by try_charge() */
1062 css_put(&parent->css);
1063 /* uncharge if move fails */
1064 res_counter_uncharge(&parent->res, PAGE_SIZE);
1065 if (do_swap_account)
1066 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1067 return ret;
1068}
1069
1070/*
1071 * Charge the memory controller for page usage.
1072 * Return
1073 * 0 if the charge was successful
1074 * < 0 if the cgroup is over its limit
1075 */
1076static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1077 gfp_t gfp_mask, enum charge_type ctype,
1078 struct mem_cgroup *memcg)
1079{
1080 struct mem_cgroup *mem;
1081 struct page_cgroup *pc;
1082 int ret;
1083
1084 pc = lookup_page_cgroup(page);
1085 /* can happen at boot */
1086 if (unlikely(!pc))
1087 return 0;
1088 prefetchw(pc);
1089
1090 mem = memcg;
1091 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1092 if (ret || !mem)
1093 return ret;
1094
1095 __mem_cgroup_commit_charge(mem, pc, ctype);
1096 return 0;
566} 1097}
567 1098
568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 1099int mem_cgroup_newpage_charge(struct page *page,
1100 struct mm_struct *mm, gfp_t gfp_mask)
569{ 1101{
570 if (mem_cgroup_subsys.disabled) 1102 if (mem_cgroup_disabled())
571 return 0; 1103 return 0;
572 if (PageCompound(page)) 1104 if (PageCompound(page))
573 return 0; 1105 return 0;
@@ -589,7 +1121,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
590 gfp_t gfp_mask) 1122 gfp_t gfp_mask)
591{ 1123{
592 if (mem_cgroup_subsys.disabled) 1124 struct mem_cgroup *mem = NULL;
1125 int ret;
1126
1127 if (mem_cgroup_disabled())
593 return 0; 1128 return 0;
594 if (PageCompound(page)) 1129 if (PageCompound(page))
595 return 0; 1130 return 0;
@@ -601,6 +1136,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
601 * For GFP_NOWAIT case, the page may be pre-charged before calling 1136 * For GFP_NOWAIT case, the page may be pre-charged before calling
602 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1137 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
603 * charge twice. (It works but has to pay a bit larger cost.) 1138 * charge twice. (It works but has to pay a bit larger cost.)
1139 * And when the page is SwapCache, it should take swap information
1140 * into account. This is under lock_page() now.
604 */ 1141 */
605 if (!(gfp_mask & __GFP_WAIT)) { 1142 if (!(gfp_mask & __GFP_WAIT)) {
606 struct page_cgroup *pc; 1143 struct page_cgroup *pc;
@@ -617,58 +1154,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
617 unlock_page_cgroup(pc); 1154 unlock_page_cgroup(pc);
618 } 1155 }
619 1156
620 if (unlikely(!mm)) 1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem))
621 mm = &init_mm; 1168 mm = &init_mm;
622 1169
623 if (page_is_file_cache(page)) 1170 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask, 1171 return mem_cgroup_charge_common(page, mm, gfp_mask,
625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else 1173
627 return mem_cgroup_charge_common(page, mm, gfp_mask, 1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); 1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1176 if (mem)
1177 css_put(&mem->css);
1178 if (PageSwapCache(page))
1179 mem_cgroup_lru_add_after_commit_swapcache(page);
1180
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret;
1191}
1192
1193/*
 1194 * During swap-in (try_charge -> commit or cancel) the page is locked.
 1195 * And when try_charge() successfully returns, one refcnt to memcg without
 1196 * struct page_cgroup is acquired. This refcnt will be consumed by
 1197 * "commit()" or removed by "cancel()".
1198 */
1199int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1200 struct page *page,
1201 gfp_t mask, struct mem_cgroup **ptr)
1202{
1203 struct mem_cgroup *mem;
1204 int ret;
1205
1206 if (mem_cgroup_disabled())
1207 return 0;
1208
1209 if (!do_swap_account)
1210 goto charge_cur_mm;
1211 /*
1212 * A racing thread's fault, or swapoff, may have already updated
1213 * the pte, and even removed page from swap cache: return success
1214 * to go on to do_swap_page()'s pte_same() test, which should fail.
1215 */
1216 if (!PageSwapCache(page))
1217 return 0;
1218 mem = try_get_mem_cgroup_from_swapcache(page);
1219 if (!mem)
1220 goto charge_cur_mm;
1221 *ptr = mem;
1222 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1223 /* drop extra refcnt from tryget */
1224 css_put(&mem->css);
1225 return ret;
1226charge_cur_mm:
1227 if (unlikely(!mm))
1228 mm = &init_mm;
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230}
1231
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1233{
1234 struct page_cgroup *pc;
1235
1236 if (mem_cgroup_disabled())
1237 return;
1238 if (!ptr)
1239 return;
1240 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1243 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /*
 1245 * Now the swapped-in page is in memory, so it may be counted both
 1246 * as mem and swap: a double count.
 1247 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
 1248 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
 1249 * may call delete_from_swap_cache() before we reach here.
1250 */
1251 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)};
1253 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL);
1255 if (memcg) {
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 mem_cgroup_put(memcg);
1258 }
1259
1260 }
1261 /* add this page(page_cgroup) to the LRU we want. */
1262
1263}
1264
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266{
1267 if (mem_cgroup_disabled())
1268 return;
1269 if (!mem)
1270 return;
1271 res_counter_uncharge(&mem->res, PAGE_SIZE);
1272 if (do_swap_account)
1273 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1274 css_put(&mem->css);
629} 1275}
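
The swap-in path above is a two-phase protocol: try_charge reserves budget and takes a reference, and exactly one of commit or cancel later resolves that reservation. A compact sketch of the same shape in plain C; the names and the boolean standing in for the PCG_USED bit are illustrative, not the kernel API.

#include <stdbool.h>
#include <stdio.h>

struct group {
        int refcnt;             /* stands in for the css reference count */
        unsigned long usage;    /* pages charged */
        unsigned long limit;
};

/* Phase 1: reserve one page of budget and take a reference. */
static int try_charge(struct group *g)
{
        if (g->usage + 1 > g->limit)
                return -1;
        g->usage++;
        g->refcnt++;            /* consumed later by commit or cancel */
        return 0;
}

/* Phase 2a: commit. If someone already committed this page, undo our reservation. */
static void commit_charge(struct group *g, bool *page_used)
{
        if (*page_used) {
                g->usage--;
                g->refcnt--;
                return;
        }
        *page_used = true;      /* the reference now belongs to the used page */
}

/* Phase 2b: cancel. Drop both the reservation and the reference. */
static void cancel_charge(struct group *g)
{
        g->usage--;
        g->refcnt--;
}

int main(void)
{
        struct group g = { .refcnt = 0, .usage = 0, .limit = 4 };
        bool page_used = false;

        if (!try_charge(&g))
                commit_charge(&g, &page_used);
        printf("usage=%lu refcnt=%d used=%d\n", g.usage, g.refcnt, page_used);

        if (!try_charge(&g))
                cancel_charge(&g);      /* e.g. the pte changed under us */
        printf("usage=%lu refcnt=%d\n", g.usage, g.refcnt);
        return 0;
}
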
630 1276
1277
631/* 1278/*
632 * uncharge if !page_mapped(page) 1279 * uncharge if !page_mapped(page)
633 */ 1280 */
634static void 1281static struct mem_cgroup *
635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1282__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
636{ 1283{
637 struct page_cgroup *pc; 1284 struct page_cgroup *pc;
638 struct mem_cgroup *mem; 1285 struct mem_cgroup *mem = NULL;
639 struct mem_cgroup_per_zone *mz; 1286 struct mem_cgroup_per_zone *mz;
640 unsigned long flags;
641 1287
642 if (mem_cgroup_subsys.disabled) 1288 if (mem_cgroup_disabled())
643 return; 1289 return NULL;
1290
1291 if (PageSwapCache(page))
1292 return NULL;
644 1293
645 /* 1294 /*
646 * Check if our page_cgroup is valid 1295 * Check if our page_cgroup is valid
647 */ 1296 */
648 pc = lookup_page_cgroup(page); 1297 pc = lookup_page_cgroup(page);
649 if (unlikely(!pc || !PageCgroupUsed(pc))) 1298 if (unlikely(!pc || !PageCgroupUsed(pc)))
650 return; 1299 return NULL;
651 1300
652 lock_page_cgroup(pc); 1301 lock_page_cgroup(pc);
653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) 1302
654 || !PageCgroupUsed(pc)) { 1303 mem = pc->mem_cgroup;
655 /* This happens at race in zap_pte_range() and do_swap_page()*/ 1304
656 unlock_page_cgroup(pc); 1305 if (!PageCgroupUsed(pc))
657 return; 1306 goto unlock_out;
1307
1308 switch (ctype) {
1309 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1310 if (page_mapped(page))
1311 goto unlock_out;
1312 break;
1313 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1314 if (!PageAnon(page)) { /* Shared memory */
1315 if (page->mapping && !page_is_file_cache(page))
1316 goto unlock_out;
1317 } else if (page_mapped(page)) /* Anon */
1318 goto unlock_out;
1319 break;
1320 default:
1321 break;
658 } 1322 }
1323
1324 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false);
659 ClearPageCgroupUsed(pc); 1329 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup; 1330 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's
 1332 * freed from LRU. This is safe because an uncharged page is expected not
 1333 * to be reused (it is freed soon). The exception is SwapCache, handled by
1334 * special functions.
1335 */
661 1336
662 mz = page_cgroup_zoneinfo(pc); 1337 mz = page_cgroup_zoneinfo(pc);
663 spin_lock_irqsave(&mz->lru_lock, flags);
664 __mem_cgroup_remove_list(mz, pc);
665 spin_unlock_irqrestore(&mz->lru_lock, flags);
666 unlock_page_cgroup(pc); 1338 unlock_page_cgroup(pc);
667 1339
668 res_counter_uncharge(&mem->res, PAGE_SIZE); 1340 /* at swapout, this memcg will be accessed to record to swap */
669 css_put(&mem->css); 1341 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1342 css_put(&mem->css);
670 1343
671 return; 1344 return mem;
1345
1346unlock_out:
1347 unlock_page_cgroup(pc);
1348 return NULL;
672} 1349}
673 1350
674void mem_cgroup_uncharge_page(struct page *page) 1351void mem_cgroup_uncharge_page(struct page *page)
@@ -689,16 +1366,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
689} 1366}
690 1367
691/* 1368/*
692 * Before starting migration, account against new page. 1369 * Called from __delete_from_swap_cache() to drop the "page" account.
 1370 * The memcg information is recorded in the swap_cgroup of "ent".
693 */ 1371 */
694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) 1372void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1373{
1374 struct mem_cgroup *memcg;
1375
1376 memcg = __mem_cgroup_uncharge_common(page,
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */
1379 if (do_swap_account && memcg) {
1380 swap_cgroup_record(ent, memcg);
1381 mem_cgroup_get(memcg);
1382 }
1383 if (memcg)
1384 css_put(&memcg->css);
1385}
1386
1387#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1388/*
1389 * called from swap_entry_free(). remove record in swap_cgroup and
1390 * uncharge "memsw" account.
1391 */
1392void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{
1394 struct mem_cgroup *memcg;
1395
1396 if (!do_swap_account)
1397 return;
1398
1399 memcg = swap_cgroup_record(ent, NULL);
1400 if (memcg) {
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 mem_cgroup_put(memcg);
1403 }
1404}
1405#endif
1406
1407/*
1408 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
1409 * page belongs to.
1410 */
1411int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
695{ 1412{
696 struct page_cgroup *pc; 1413 struct page_cgroup *pc;
697 struct mem_cgroup *mem = NULL; 1414 struct mem_cgroup *mem = NULL;
698 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
699 int ret = 0; 1415 int ret = 0;
700 1416
701 if (mem_cgroup_subsys.disabled) 1417 if (mem_cgroup_disabled())
702 return 0; 1418 return 0;
703 1419
704 pc = lookup_page_cgroup(page); 1420 pc = lookup_page_cgroup(page);
@@ -706,41 +1422,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
706 if (PageCgroupUsed(pc)) { 1422 if (PageCgroupUsed(pc)) {
707 mem = pc->mem_cgroup; 1423 mem = pc->mem_cgroup;
708 css_get(&mem->css); 1424 css_get(&mem->css);
709 if (PageCgroupCache(pc)) {
710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
715 } 1425 }
716 unlock_page_cgroup(pc); 1426 unlock_page_cgroup(pc);
1427
717 if (mem) { 1428 if (mem) {
718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, 1429 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
719 ctype, mem);
720 css_put(&mem->css); 1430 css_put(&mem->css);
721 } 1431 }
1432 *ptr = mem;
722 return ret; 1433 return ret;
723} 1434}
724 1435
725/* remove redundant charge if migration failed*/ 1436/* remove redundant charge if migration failed*/
726void mem_cgroup_end_migration(struct page *newpage) 1437void mem_cgroup_end_migration(struct mem_cgroup *mem,
1438 struct page *oldpage, struct page *newpage)
727{ 1439{
1440 struct page *target, *unused;
1441 struct page_cgroup *pc;
1442 enum charge_type ctype;
1443
1444 if (!mem)
1445 return;
1446
1447 /* at migration success, oldpage->mapping is NULL. */
1448 if (oldpage->mapping) {
1449 target = oldpage;
1450 unused = NULL;
1451 } else {
1452 target = newpage;
1453 unused = oldpage;
1454 }
1455
1456 if (PageAnon(target))
1457 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1458 else if (page_is_file_cache(target))
1459 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1460 else
1461 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1462
1463 /* unused page is not on radix-tree now. */
1464 if (unused)
1465 __mem_cgroup_uncharge_common(unused, ctype);
1466
1467 pc = lookup_page_cgroup(target);
728 /* 1468 /*
729 * At success, page->mapping is not NULL. 1469 * __mem_cgroup_commit_charge() checks the PCG_USED bit of the page_cgroup.
730 * special rollback care is necessary when 1470 * So, double-counting is effectively avoided.
731 * 1. at migration failure. (newpage->mapping is cleared in this case)
732 * 2. the newpage was moved but not remapped again because the task
733 * exits and the newpage is obsolete. In this case, the new page
734 * may be a swapcache. So, we just call mem_cgroup_uncharge_page()
735 * always for avoiding mess. The page_cgroup will be removed if
736 * unnecessary. File cache pages is still on radix-tree. Don't
737 * care it.
738 */ 1471 */
739 if (!newpage->mapping) 1472 __mem_cgroup_commit_charge(mem, pc, ctype);
740 __mem_cgroup_uncharge_common(newpage, 1473
741 MEM_CGROUP_CHARGE_TYPE_FORCE); 1474 /*
742 else if (PageAnon(newpage)) 1475 * Both oldpage and newpage are still under lock_page(),
743 mem_cgroup_uncharge_page(newpage); 1476 * so we don't have to care about races in the radix-tree.
 1477 * But we do have to check whether this page is still mapped.
 1478 *
 1479 * There is a case for !page_mapped(): at the start of
 1480 * migration oldpage was mapped, but by now it may have been zapped.
 1481 * Still, we know the *target* page is not freed/reused under us.
1482 * mem_cgroup_uncharge_page() does all necessary checks.
1483 */
1484 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1485 mem_cgroup_uncharge_page(target);
744} 1486}
745 1487
746/* 1488/*
@@ -748,29 +1490,26 @@ void mem_cgroup_end_migration(struct page *newpage)
748 * This is typically used for page reclaiming for shmem for reducing side 1490 * This is typically used for page reclaiming for shmem for reducing side
749 * effect of page allocation from shmem, which is used by some mem_cgroup. 1491 * effect of page allocation from shmem, which is used by some mem_cgroup.
750 */ 1492 */
751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) 1493int mem_cgroup_shrink_usage(struct page *page,
1494 struct mm_struct *mm,
1495 gfp_t gfp_mask)
752{ 1496{
753 struct mem_cgroup *mem; 1497 struct mem_cgroup *mem = NULL;
754 int progress = 0; 1498 int progress = 0;
755 int retry = MEM_CGROUP_RECLAIM_RETRIES; 1499 int retry = MEM_CGROUP_RECLAIM_RETRIES;
756 1500
757 if (mem_cgroup_subsys.disabled) 1501 if (mem_cgroup_disabled())
758 return 0; 1502 return 0;
759 if (!mm) 1503 if (page)
1504 mem = try_get_mem_cgroup_from_swapcache(page);
1505 if (!mem && mm)
1506 mem = try_get_mem_cgroup_from_mm(mm);
1507 if (unlikely(!mem))
760 return 0; 1508 return 0;
761 1509
762 rcu_read_lock();
763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
767 }
768 css_get(&mem->css);
769 rcu_read_unlock();
770
771 do { 1510 do {
772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); 1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
773 progress += res_counter_check_under_limit(&mem->res); 1512 progress += mem_cgroup_check_under_limit(mem);
774 } while (!progress && --retry); 1513 } while (!progress && --retry);
775 1514
776 css_put(&mem->css); 1515 css_put(&mem->css);
@@ -779,116 +1518,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
779 return 0; 1518 return 0;
780} 1519}
781 1520
782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) 1521static DEFINE_MUTEX(set_limit_mutex);
1522
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val)
783{ 1525{
784 1526
785 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
786 int progress; 1528 int progress;
1529 u64 memswlimit;
787 int ret = 0; 1530 int ret = 0;
788 1531
789 while (res_counter_set_limit(&memcg->res, val)) { 1532 while (retry_count) {
790 if (signal_pending(current)) { 1533 if (signal_pending(current)) {
791 ret = -EINTR; 1534 ret = -EINTR;
792 break; 1535 break;
793 } 1536 }
794 if (!retry_count) { 1537 /*
795 ret = -EBUSY; 1538 * Rather than hiding this in a helper, do it open-coded
 1539 * so that what really happens here is obvious.
1540 * We have to guarantee mem->res.limit < mem->memsw.limit.
1541 */
1542 mutex_lock(&set_limit_mutex);
1543 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1544 if (memswlimit < val) {
1545 ret = -EINVAL;
1546 mutex_unlock(&set_limit_mutex);
796 break; 1547 break;
797 } 1548 }
798 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); 1549 ret = res_counter_set_limit(&memcg->res, val);
799 if (!progress) 1550 mutex_unlock(&set_limit_mutex);
800 retry_count--; 1551
1552 if (!ret)
1553 break;
1554
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 false);
1557 if (!progress) retry_count--;
801 } 1558 }
1559
802 return ret; 1560 return ret;
803} 1561}
804 1562
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val)
1565{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1567 u64 memlimit, oldusage, curusage;
1568 int ret;
1569
1570 if (!do_swap_account)
1571 return -EINVAL;
1572
1573 while (retry_count) {
1574 if (signal_pending(current)) {
1575 ret = -EINTR;
1576 break;
1577 }
1578 /*
 1579 * Rather than hiding this in a helper, do it open-coded
 1580 * so that what really happens here is obvious.
1581 * We have to guarantee mem->res.limit < mem->memsw.limit.
1582 */
1583 mutex_lock(&set_limit_mutex);
1584 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1585 if (memlimit > val) {
1586 ret = -EINVAL;
1587 mutex_unlock(&set_limit_mutex);
1588 break;
1589 }
1590 ret = res_counter_set_limit(&memcg->memsw, val);
1591 mutex_unlock(&set_limit_mutex);
1592
1593 if (!ret)
1594 break;
1595
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1599 if (curusage >= oldusage)
1600 retry_count--;
1601 }
1602 return ret;
1603}
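
Both resize paths guard the invariant that the memory limit never exceeds the memory+swap limit by re-checking the other counter under a single mutex before writing. A standalone sketch of that check-under-one-lock idea; the pthreads globals below are illustrative and not the res_counter API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t set_limit_mutex = PTHREAD_MUTEX_INITIALIZER;
static unsigned long long mem_limit   = 1ULL << 30;     /* 1 GiB */
static unsigned long long memsw_limit = 2ULL << 30;     /* 2 GiB */

/* Enforce mem_limit <= memsw_limit even when both can be written concurrently. */
static int set_mem_limit(unsigned long long val)
{
        int ret = 0;

        pthread_mutex_lock(&set_limit_mutex);
        if (val > memsw_limit)
                ret = -1;               /* would break the invariant */
        else
                mem_limit = val;
        pthread_mutex_unlock(&set_limit_mutex);
        return ret;
}

static int set_memsw_limit(unsigned long long val)
{
        int ret = 0;

        pthread_mutex_lock(&set_limit_mutex);
        if (val < mem_limit)
                ret = -1;
        else
                memsw_limit = val;
        pthread_mutex_unlock(&set_limit_mutex);
        return ret;
}

int main(void)
{
        int a = set_mem_limit(3ULL << 30);      /* fails: exceeds memsw limit */
        int b = set_memsw_limit(4ULL << 30);    /* ok */

        printf("%d %d\n", a, b);
        return 0;
}
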
805 1604
806/* 1605/*
807 * This routine traverses the page_cgroups on the given list and drops them all. 1606 * This routine traverses the page_cgroups on the given list and drops them all.
808 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups. 1607 * *And* this routine doesn't reclaim the pages themselves, it just removes the page_cgroups.
809 */ 1608 */
810#define FORCE_UNCHARGE_BATCH (128) 1609static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1610 int node, int zid, enum lru_list lru)
812 struct mem_cgroup_per_zone *mz,
813 enum lru_list lru)
814{ 1611{
815 struct page_cgroup *pc; 1612 struct zone *zone;
816 struct page *page; 1613 struct mem_cgroup_per_zone *mz;
817 int count = FORCE_UNCHARGE_BATCH; 1614 struct page_cgroup *pc, *busy;
818 unsigned long flags; 1615 unsigned long flags, loop;
819 struct list_head *list; 1616 struct list_head *list;
1617 int ret = 0;
820 1618
1619 zone = &NODE_DATA(node)->node_zones[zid];
1620 mz = mem_cgroup_zoneinfo(mem, node, zid);
821 list = &mz->lists[lru]; 1621 list = &mz->lists[lru];
822 1622
823 spin_lock_irqsave(&mz->lru_lock, flags); 1623 loop = MEM_CGROUP_ZSTAT(mz, lru);
824 while (!list_empty(list)) { 1624 /* give some margin against EBUSY etc...*/
825 pc = list_entry(list->prev, struct page_cgroup, lru); 1625 loop += 256;
826 page = pc->page; 1626 busy = NULL;
827 if (!PageCgroupUsed(pc)) 1627 while (loop--) {
828 break; 1628 ret = 0;
829 get_page(page); 1629 spin_lock_irqsave(&zone->lru_lock, flags);
830 spin_unlock_irqrestore(&mz->lru_lock, flags); 1630 if (list_empty(list)) {
831 /* 1631 spin_unlock_irqrestore(&zone->lru_lock, flags);
832 * Check if this page is on LRU. !LRU page can be found
833 * if it's under page migration.
834 */
835 if (PageLRU(page)) {
836 __mem_cgroup_uncharge_common(page,
837 MEM_CGROUP_CHARGE_TYPE_FORCE);
838 put_page(page);
839 if (--count <= 0) {
840 count = FORCE_UNCHARGE_BATCH;
841 cond_resched();
842 }
843 } else {
844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break; 1632 break;
846 } 1633 }
847 spin_lock_irqsave(&mz->lru_lock, flags); 1634 pc = list_entry(list->prev, struct page_cgroup, lru);
1635 if (busy == pc) {
1636 list_move(&pc->lru, list);
 1637 busy = NULL;
1638 spin_unlock_irqrestore(&zone->lru_lock, flags);
1639 continue;
1640 }
1641 spin_unlock_irqrestore(&zone->lru_lock, flags);
1642
1643 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1644 if (ret == -ENOMEM)
1645 break;
1646
1647 if (ret == -EBUSY || ret == -EINVAL) {
1648 /* found lock contention or "pc" is obsolete. */
1649 busy = pc;
1650 cond_resched();
1651 } else
1652 busy = NULL;
848 } 1653 }
849 spin_unlock_irqrestore(&mz->lru_lock, flags); 1654
1655 if (!ret && !list_empty(list))
1656 return -EBUSY;
1657 return ret;
850} 1658}
851 1659
852/* 1660/*
853 * make the mem_cgroup's charge 0 if there is no task. 1661 * make the mem_cgroup's charge 0 if there is no task.
854 * This enables deleting this mem_cgroup. 1662 * This enables deleting this mem_cgroup.
855 */ 1663 */
856static int mem_cgroup_force_empty(struct mem_cgroup *mem) 1664static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
857{ 1665{
858 int ret = -EBUSY; 1666 int ret;
859 int node, zid; 1667 int node, zid, shrink;
1668 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1669 struct cgroup *cgrp = mem->css.cgroup;
860 1670
861 css_get(&mem->css); 1671 css_get(&mem->css);
862 /* 1672
863 * page reclaim code (kswapd etc..) will move pages between 1673 shrink = 0;
864 * active_list <-> inactive_list while we don't take a lock. 1674 /* should free all ? */
865 * So, we have to do loop here until all lists are empty. 1675 if (free_all)
866 */ 1676 goto try_to_free;
1677move_account:
867 while (mem->res.usage > 0) { 1678 while (mem->res.usage > 0) {
868 if (atomic_read(&mem->css.cgroup->count) > 0) 1679 ret = -EBUSY;
1680 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1681 goto out;
1682 ret = -EINTR;
1683 if (signal_pending(current))
869 goto out; 1684 goto out;
870 /* This is for making all *used* pages to be on LRU. */ 1685 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all(); 1686 lru_add_drain_all();
872 for_each_node_state(node, N_POSSIBLE) 1687 ret = 0;
873 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1688 for_each_node_state(node, N_HIGH_MEMORY) {
874 struct mem_cgroup_per_zone *mz; 1689 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
875 enum lru_list l; 1690 enum lru_list l;
876 mz = mem_cgroup_zoneinfo(mem, node, zid); 1691 for_each_lru(l) {
877 for_each_lru(l) 1692 ret = mem_cgroup_force_empty_list(mem,
878 mem_cgroup_force_empty_list(mem, mz, l); 1693 node, zid, l);
1694 if (ret)
1695 break;
1696 }
879 } 1697 }
1698 if (ret)
1699 break;
1700 }
1701 /* it seems parent cgroup doesn't have enough mem */
1702 if (ret == -ENOMEM)
1703 goto try_to_free;
880 cond_resched(); 1704 cond_resched();
881 } 1705 }
882 ret = 0; 1706 ret = 0;
883out: 1707out:
884 css_put(&mem->css); 1708 css_put(&mem->css);
885 return ret; 1709 return ret;
1710
1711try_to_free:
1712 /* returns EBUSY if there is a task or if we come here twice. */
1713 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1714 ret = -EBUSY;
1715 goto out;
1716 }
 1717 /* we call try-to-free pages to make this cgroup empty */
1718 lru_add_drain_all();
1719 /* try to free all pages in this cgroup */
1720 shrink = 1;
1721 while (nr_retries && mem->res.usage > 0) {
1722 int progress;
1723
1724 if (signal_pending(current)) {
1725 ret = -EINTR;
1726 goto out;
1727 }
1728 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1729 false, get_swappiness(mem));
1730 if (!progress) {
1731 nr_retries--;
1732 /* maybe some writeback is necessary */
1733 congestion_wait(WRITE, HZ/10);
1734 }
1735
1736 }
1737 lru_add_drain();
1738 /* try move_account...there may be some *locked* pages. */
1739 if (mem->res.usage)
1740 goto move_account;
1741 ret = 0;
1742 goto out;
1743}
1744
1745int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1746{
1747 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1748}
1749
1750
1751static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1752{
1753 return mem_cgroup_from_cont(cont)->use_hierarchy;
1754}
1755
1756static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1757 u64 val)
1758{
1759 int retval = 0;
1760 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1761 struct cgroup *parent = cont->parent;
1762 struct mem_cgroup *parent_mem = NULL;
1763
1764 if (parent)
1765 parent_mem = mem_cgroup_from_cont(parent);
1766
1767 cgroup_lock();
1768 /*
 1769 * If the parent's use_hierarchy is set, we can't make any modifications
1770 * in the child subtrees. If it is unset, then the change can
1771 * occur, provided the current cgroup has no children.
1772 *
1773 * For the root cgroup, parent_mem is NULL, we allow value to be
1774 * set if there are no children.
1775 */
1776 if ((!parent_mem || !parent_mem->use_hierarchy) &&
1777 (val == 1 || val == 0)) {
1778 if (list_empty(&cont->children))
1779 mem->use_hierarchy = val;
1780 else
1781 retval = -EBUSY;
1782 } else
1783 retval = -EINVAL;
1784 cgroup_unlock();
1785
1786 return retval;
886} 1787}
887 1788
888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 1789static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
889{ 1790{
890 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 1791 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
891 cft->private); 1792 u64 val = 0;
1793 int type, name;
1794
1795 type = MEMFILE_TYPE(cft->private);
1796 name = MEMFILE_ATTR(cft->private);
1797 switch (type) {
1798 case _MEM:
1799 val = res_counter_read_u64(&mem->res, name);
1800 break;
1801 case _MEMSWAP:
1802 if (do_swap_account)
1803 val = res_counter_read_u64(&mem->memsw, name);
1804 break;
1805 default:
1806 BUG();
1807 break;
1808 }
1809 return val;
892} 1810}
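
mem_cgroup_read() decodes cft->private with MEMFILE_TYPE()/MEMFILE_ATTR(), whose definitions do not appear in this hunk. A plausible sketch of that packing, assuming two 16-bit fields in one int (illustrative only):

#include <stdio.h>

#define MEMFILE_PRIVATE(type, attr)     (((type) << 16) | (attr))
#define MEMFILE_TYPE(val)               (((val) >> 16) & 0xffff)
#define MEMFILE_ATTR(val)               ((val) & 0xffff)

enum { _MEM, _MEMSWAP };
enum { RES_USAGE, RES_LIMIT, RES_MAX_USAGE, RES_FAILCNT };

int main(void)
{
        int priv = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT);

        printf("type=%d attr=%d\n", MEMFILE_TYPE(priv), MEMFILE_ATTR(priv));
        return 0;
}
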
893/* 1811/*
894 * The user of this function is... 1812 * The user of this function is...
@@ -898,15 +1816,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
898 const char *buffer) 1816 const char *buffer)
899{ 1817{
900 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 1818 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1819 int type, name;
901 unsigned long long val; 1820 unsigned long long val;
902 int ret; 1821 int ret;
903 1822
904 switch (cft->private) { 1823 type = MEMFILE_TYPE(cft->private);
1824 name = MEMFILE_ATTR(cft->private);
1825 switch (name) {
905 case RES_LIMIT: 1826 case RES_LIMIT:
906 /* This function does all necessary parse...reuse it */ 1827 /* This function does all necessary parse...reuse it */
907 ret = res_counter_memparse_write_strategy(buffer, &val); 1828 ret = res_counter_memparse_write_strategy(buffer, &val);
908 if (!ret) 1829 if (ret)
1830 break;
1831 if (type == _MEM)
909 ret = mem_cgroup_resize_limit(memcg, val); 1832 ret = mem_cgroup_resize_limit(memcg, val);
1833 else
1834 ret = mem_cgroup_resize_memsw_limit(memcg, val);
910 break; 1835 break;
911 default: 1836 default:
912 ret = -EINVAL; /* should be BUG() ? */ 1837 ret = -EINVAL; /* should be BUG() ? */
@@ -915,27 +1840,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
915 return ret; 1840 return ret;
916} 1841}
917 1842
1843static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
1844 unsigned long long *mem_limit, unsigned long long *memsw_limit)
1845{
1846 struct cgroup *cgroup;
1847 unsigned long long min_limit, min_memsw_limit, tmp;
1848
1849 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1850 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1851 cgroup = memcg->css.cgroup;
1852 if (!memcg->use_hierarchy)
1853 goto out;
1854
1855 while (cgroup->parent) {
1856 cgroup = cgroup->parent;
1857 memcg = mem_cgroup_from_cont(cgroup);
1858 if (!memcg->use_hierarchy)
1859 break;
1860 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
1861 min_limit = min(min_limit, tmp);
1862 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1863 min_memsw_limit = min(min_memsw_limit, tmp);
1864 }
1865out:
1866 *mem_limit = min_limit;
1867 *memsw_limit = min_memsw_limit;
1868 return;
1869}
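
memcg_get_hierarchical_limit() walks up the cgroup chain keeping the minimum limit seen, stopping at the first ancestor that has hierarchy disabled. The same walk over a toy parent-linked structure, memory limit only and with illustrative types:

#include <stdio.h>

struct node {
        struct node *parent;
        int use_hierarchy;
        unsigned long long limit;
};

/* Smallest limit along the parent chain, as long as hierarchy stays enabled. */
static unsigned long long hierarchical_limit(struct node *n)
{
        unsigned long long min_limit = n->limit;

        if (!n->use_hierarchy)
                return min_limit;

        while (n->parent) {
                n = n->parent;
                if (!n->use_hierarchy)
                        break;          /* stop at a non-hierarchical ancestor */
                if (n->limit < min_limit)
                        min_limit = n->limit;
        }
        return min_limit;
}

int main(void)
{
        struct node root  = { .parent = NULL,  .use_hierarchy = 1, .limit = 400 };
        struct node child = { .parent = &root, .use_hierarchy = 1, .limit = 900 };

        printf("effective limit: %llu\n", hierarchical_limit(&child));
        return 0;
}
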
1870
918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 1871static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
919{ 1872{
920 struct mem_cgroup *mem; 1873 struct mem_cgroup *mem;
1874 int type, name;
921 1875
922 mem = mem_cgroup_from_cont(cont); 1876 mem = mem_cgroup_from_cont(cont);
923 switch (event) { 1877 type = MEMFILE_TYPE(event);
1878 name = MEMFILE_ATTR(event);
1879 switch (name) {
924 case RES_MAX_USAGE: 1880 case RES_MAX_USAGE:
925 res_counter_reset_max(&mem->res); 1881 if (type == _MEM)
1882 res_counter_reset_max(&mem->res);
1883 else
1884 res_counter_reset_max(&mem->memsw);
926 break; 1885 break;
927 case RES_FAILCNT: 1886 case RES_FAILCNT:
928 res_counter_reset_failcnt(&mem->res); 1887 if (type == _MEM)
1888 res_counter_reset_failcnt(&mem->res);
1889 else
1890 res_counter_reset_failcnt(&mem->memsw);
929 break; 1891 break;
930 } 1892 }
931 return 0; 1893 return 0;
932} 1894}
933 1895
934static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
935{
936 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
937}
938
939static const struct mem_cgroup_stat_desc { 1896static const struct mem_cgroup_stat_desc {
940 const char *msg; 1897 const char *msg;
941 u64 unit; 1898 u64 unit;
@@ -984,42 +1941,170 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); 1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985 1942
986 } 1943 }
1944 {
1945 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
1947 cb->fill(cb, "hierarchical_memory_limit", limit);
1948 if (do_swap_account)
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 }
1951
1952#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954
1955 {
1956 int nid, zid;
1957 struct mem_cgroup_per_zone *mz;
1958 unsigned long recent_rotated[2] = {0, 0};
1959 unsigned long recent_scanned[2] = {0, 0};
1960
1961 for_each_online_node(nid)
1962 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1963 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1964
1965 recent_rotated[0] +=
1966 mz->reclaim_stat.recent_rotated[0];
1967 recent_rotated[1] +=
1968 mz->reclaim_stat.recent_rotated[1];
1969 recent_scanned[0] +=
1970 mz->reclaim_stat.recent_scanned[0];
1971 recent_scanned[1] +=
1972 mz->reclaim_stat.recent_scanned[1];
1973 }
1974 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
1975 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
1976 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
1977 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
1978 }
1979#endif
1980
987 return 0; 1981 return 0;
988} 1982}
989 1983
1984static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
1985{
1986 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1987
1988 return get_swappiness(memcg);
1989}
1990
1991static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1992 u64 val)
1993{
1994 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1995 struct mem_cgroup *parent;
1996
1997 if (val > 100)
1998 return -EINVAL;
1999
2000 if (cgrp->parent == NULL)
2001 return -EINVAL;
2002
2003 parent = mem_cgroup_from_cont(cgrp->parent);
2004
2005 cgroup_lock();
2006
2007 /* If under hierarchy, only empty-root can set this value */
2008 if ((parent->use_hierarchy) ||
2009 (memcg->use_hierarchy && !list_empty(&cgrp->children))) {
2010 cgroup_unlock();
2011 return -EINVAL;
2012 }
2013
2014 spin_lock(&memcg->reclaim_param_lock);
2015 memcg->swappiness = val;
2016 spin_unlock(&memcg->reclaim_param_lock);
2017
2018 cgroup_unlock();
2019
2020 return 0;
2021}
2022
2023
990static struct cftype mem_cgroup_files[] = { 2024static struct cftype mem_cgroup_files[] = {
991 { 2025 {
992 .name = "usage_in_bytes", 2026 .name = "usage_in_bytes",
993 .private = RES_USAGE, 2027 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
994 .read_u64 = mem_cgroup_read, 2028 .read_u64 = mem_cgroup_read,
995 }, 2029 },
996 { 2030 {
997 .name = "max_usage_in_bytes", 2031 .name = "max_usage_in_bytes",
998 .private = RES_MAX_USAGE, 2032 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
999 .trigger = mem_cgroup_reset, 2033 .trigger = mem_cgroup_reset,
1000 .read_u64 = mem_cgroup_read, 2034 .read_u64 = mem_cgroup_read,
1001 }, 2035 },
1002 { 2036 {
1003 .name = "limit_in_bytes", 2037 .name = "limit_in_bytes",
1004 .private = RES_LIMIT, 2038 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1005 .write_string = mem_cgroup_write, 2039 .write_string = mem_cgroup_write,
1006 .read_u64 = mem_cgroup_read, 2040 .read_u64 = mem_cgroup_read,
1007 }, 2041 },
1008 { 2042 {
1009 .name = "failcnt", 2043 .name = "failcnt",
1010 .private = RES_FAILCNT, 2044 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1011 .trigger = mem_cgroup_reset, 2045 .trigger = mem_cgroup_reset,
1012 .read_u64 = mem_cgroup_read, 2046 .read_u64 = mem_cgroup_read,
1013 }, 2047 },
1014 { 2048 {
2049 .name = "stat",
2050 .read_map = mem_control_stat_show,
2051 },
2052 {
1015 .name = "force_empty", 2053 .name = "force_empty",
1016 .trigger = mem_force_empty_write, 2054 .trigger = mem_cgroup_force_empty_write,
1017 }, 2055 },
1018 { 2056 {
1019 .name = "stat", 2057 .name = "use_hierarchy",
1020 .read_map = mem_control_stat_show, 2058 .write_u64 = mem_cgroup_hierarchy_write,
2059 .read_u64 = mem_cgroup_hierarchy_read,
1021 }, 2060 },
2061 {
2062 .name = "swappiness",
2063 .read_u64 = mem_cgroup_swappiness_read,
2064 .write_u64 = mem_cgroup_swappiness_write,
2065 },
2066};
2067
2068#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2069static struct cftype memsw_cgroup_files[] = {
2070 {
2071 .name = "memsw.usage_in_bytes",
2072 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2073 .read_u64 = mem_cgroup_read,
2074 },
2075 {
2076 .name = "memsw.max_usage_in_bytes",
2077 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2078 .trigger = mem_cgroup_reset,
2079 .read_u64 = mem_cgroup_read,
2080 },
2081 {
2082 .name = "memsw.limit_in_bytes",
2083 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2084 .write_string = mem_cgroup_write,
2085 .read_u64 = mem_cgroup_read,
2086 },
2087 {
2088 .name = "memsw.failcnt",
2089 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2090 .trigger = mem_cgroup_reset,
2091 .read_u64 = mem_cgroup_read,
2092 },
2093};
2094
2095static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2096{
2097 if (!do_swap_account)
2098 return 0;
2099 return cgroup_add_files(cont, ss, memsw_cgroup_files,
2100 ARRAY_SIZE(memsw_cgroup_files));
1022}; 2101};
2102#else
2103static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2104{
2105 return 0;
2106}
2107#endif
1023 2108
1024static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2109static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1025{ 2110{
@@ -1046,7 +2131,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1046 2131
1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2132 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1048 mz = &pn->zoneinfo[zone]; 2133 mz = &pn->zoneinfo[zone];
1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l) 2134 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]); 2135 INIT_LIST_HEAD(&mz->lists[l]);
1052 } 2136 }
@@ -1058,55 +2142,133 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1058 kfree(mem->info.nodeinfo[node]); 2142 kfree(mem->info.nodeinfo[node]);
1059} 2143}
1060 2144
2145static int mem_cgroup_size(void)
2146{
2147 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2148 return sizeof(struct mem_cgroup) + cpustat_size;
2149}
2150
1061static struct mem_cgroup *mem_cgroup_alloc(void) 2151static struct mem_cgroup *mem_cgroup_alloc(void)
1062{ 2152{
1063 struct mem_cgroup *mem; 2153 struct mem_cgroup *mem;
2154 int size = mem_cgroup_size();
1064 2155
1065 if (sizeof(*mem) < PAGE_SIZE) 2156 if (size < PAGE_SIZE)
1066 mem = kmalloc(sizeof(*mem), GFP_KERNEL); 2157 mem = kmalloc(size, GFP_KERNEL);
1067 else 2158 else
1068 mem = vmalloc(sizeof(*mem)); 2159 mem = vmalloc(size);
1069 2160
1070 if (mem) 2161 if (mem)
1071 memset(mem, 0, sizeof(*mem)); 2162 memset(mem, 0, size);
1072 return mem; 2163 return mem;
1073} 2164}
1074 2165
1075static void mem_cgroup_free(struct mem_cgroup *mem) 2166/*
 2167 * When destroying a mem_cgroup, references from swap_cgroup can remain.
2168 * (scanning all at force_empty is too costly...)
2169 *
2170 * Instead of clearing all references at force_empty, we remember
 2171 * the number of references from swap_cgroup and free the mem_cgroup when
2172 * it goes down to 0.
2173 *
2174 * Removal of cgroup itself succeeds regardless of refs from swap.
2175 */
2176
2177static void __mem_cgroup_free(struct mem_cgroup *mem)
1076{ 2178{
1077 if (sizeof(*mem) < PAGE_SIZE) 2179 int node;
2180
2181 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node);
2183
2184 if (mem_cgroup_size() < PAGE_SIZE)
1078 kfree(mem); 2185 kfree(mem);
1079 else 2186 else
1080 vfree(mem); 2187 vfree(mem);
1081} 2188}
1082 2189
2190static void mem_cgroup_get(struct mem_cgroup *mem)
2191{
2192 atomic_inc(&mem->refcnt);
2193}
1083 2194
1084static struct cgroup_subsys_state * 2195static void mem_cgroup_put(struct mem_cgroup *mem)
2196{
2197 if (atomic_dec_and_test(&mem->refcnt)) {
2198 struct mem_cgroup *parent = parent_mem_cgroup(mem);
2199 __mem_cgroup_free(mem);
2200 if (parent)
2201 mem_cgroup_put(parent);
2202 }
2203}
2204
2205/*
 2206 * Returns the parent mem_cgroup in the memcg hierarchy, when hierarchy is enabled.
2207 */
2208static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem)
2209{
2210 if (!mem->res.parent)
2211 return NULL;
2212 return mem_cgroup_from_res_counter(mem->res.parent, res);
2213}
2214
2215#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2216static void __init enable_swap_cgroup(void)
2217{
2218 if (!mem_cgroup_disabled() && really_do_swap_account)
2219 do_swap_account = 1;
2220}
2221#else
2222static void __init enable_swap_cgroup(void)
2223{
2224}
2225#endif
2226
2227static struct cgroup_subsys_state * __ref
1085mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1086{ 2229{
1087 struct mem_cgroup *mem; 2230 struct mem_cgroup *mem, *parent;
1088 int node; 2231 int node;
1089 2232
1090 if (unlikely((cont->parent) == NULL)) { 2233 mem = mem_cgroup_alloc();
1091 mem = &init_mem_cgroup; 2234 if (!mem)
1092 } else { 2235 return ERR_PTR(-ENOMEM);
1093 mem = mem_cgroup_alloc();
1094 if (!mem)
1095 return ERR_PTR(-ENOMEM);
1096 }
1097
1098 res_counter_init(&mem->res);
1099 2236
1100 for_each_node_state(node, N_POSSIBLE) 2237 for_each_node_state(node, N_POSSIBLE)
1101 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2238 if (alloc_mem_cgroup_per_zone_info(mem, node))
1102 goto free_out; 2239 goto free_out;
2240 /* root ? */
2241 if (cont->parent == NULL) {
2242 enable_swap_cgroup();
2243 parent = NULL;
2244 } else {
2245 parent = mem_cgroup_from_cont(cont->parent);
2246 mem->use_hierarchy = parent->use_hierarchy;
2247 }
1103 2248
2249 if (parent && parent->use_hierarchy) {
2250 res_counter_init(&mem->res, &parent->res);
2251 res_counter_init(&mem->memsw, &parent->memsw);
2252 /*
2253 * We increment refcnt of the parent to ensure that we can
2254 * safely access it on res_counter_charge/uncharge.
2255 * This refcnt will be decremented when freeing this
 2256 * mem_cgroup (see mem_cgroup_put()).
2257 */
2258 mem_cgroup_get(parent);
2259 } else {
2260 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL);
2262 }
2263 mem->last_scanned_child = NULL;
2264 spin_lock_init(&mem->reclaim_param_lock);
2265
2266 if (parent)
2267 mem->swappiness = get_swappiness(parent);
2268 atomic_set(&mem->refcnt, 1);
1104 return &mem->css; 2269 return &mem->css;
1105free_out: 2270free_out:
1106 for_each_node_state(node, N_POSSIBLE) 2271 __mem_cgroup_free(mem);
1107 free_mem_cgroup_per_zone_info(mem, node);
1108 if (cont->parent != NULL)
1109 mem_cgroup_free(mem);
1110 return ERR_PTR(-ENOMEM); 2272 return ERR_PTR(-ENOMEM);
1111} 2273}
1112 2274
@@ -1114,26 +2276,33 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1114 struct cgroup *cont) 2276 struct cgroup *cont)
1115{ 2277{
1116 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1117 mem_cgroup_force_empty(mem); 2279 mem_cgroup_force_empty(mem, false);
1118} 2280}
1119 2281
1120static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2282static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1121 struct cgroup *cont) 2283 struct cgroup *cont)
1122{ 2284{
1123 int node;
1124 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
1125 2287
1126 for_each_node_state(node, N_POSSIBLE) 2288 if (last_scanned_child) {
1127 free_mem_cgroup_per_zone_info(mem, node); 2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
1128 2290 mem_cgroup_put(last_scanned_child);
1129 mem_cgroup_free(mem_cgroup_from_cont(cont)); 2291 }
2292 mem_cgroup_put(mem);
1130} 2293}
1131 2294
1132static int mem_cgroup_populate(struct cgroup_subsys *ss, 2295static int mem_cgroup_populate(struct cgroup_subsys *ss,
1133 struct cgroup *cont) 2296 struct cgroup *cont)
1134{ 2297{
1135 return cgroup_add_files(cont, ss, mem_cgroup_files, 2298 int ret;
1136 ARRAY_SIZE(mem_cgroup_files)); 2299
2300 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2301 ARRAY_SIZE(mem_cgroup_files));
2302
2303 if (!ret)
2304 ret = register_memsw_files(cont, ss);
2305 return ret;
1137} 2306}
1138 2307
1139static void mem_cgroup_move_task(struct cgroup_subsys *ss, 2308static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -1141,25 +2310,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1141 struct cgroup *old_cont, 2310 struct cgroup *old_cont,
1142 struct task_struct *p) 2311 struct task_struct *p)
1143{ 2312{
1144 struct mm_struct *mm; 2313 mutex_lock(&memcg_tasklist);
1145 struct mem_cgroup *mem, *old_mem;
1146
1147 mm = get_task_mm(p);
1148 if (mm == NULL)
1149 return;
1150
1151 mem = mem_cgroup_from_cont(cont);
1152 old_mem = mem_cgroup_from_cont(old_cont);
1153
1154 /* 2314 /*
1155 * Only thread group leaders are allowed to migrate, the mm_struct is 2315 * FIXME: It's better to move charges of this process from old
1156 * in effect owned by the leader 2316 * memcg to new memcg. But it's just on TODO-List now.
1157 */ 2317 */
1158 if (!thread_group_leader(p)) 2318 mutex_unlock(&memcg_tasklist);
1159 goto out;
1160
1161out:
1162 mmput(mm);
1163} 2319}
1164 2320
1165struct cgroup_subsys mem_cgroup_subsys = { 2321struct cgroup_subsys mem_cgroup_subsys = {
@@ -1172,3 +2328,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
1172 .attach = mem_cgroup_move_task, 2328 .attach = mem_cgroup_move_task,
1173 .early_init = 0, 2329 .early_init = 0,
1174}; 2330};
2331
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2333
2334static int __init disable_swap_account(char *s)
2335{
2336 really_do_swap_account = 0;
2337 return 1;
2338}
2339__setup("noswapaccount", disable_swap_account);
2340#endif
diff --git a/mm/memory.c b/mm/memory.c
index 0a2010a9518c..baa999e87cd2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h> 54#include <linux/mmu_notifier.h>
55#include <linux/kallsyms.h>
56#include <linux/swapops.h>
57#include <linux/elf.h>
55 58
56#include <asm/pgalloc.h> 59#include <asm/pgalloc.h>
57#include <asm/uaccess.h> 60#include <asm/uaccess.h>
@@ -59,9 +62,6 @@
59#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
60#include <asm/pgtable.h> 63#include <asm/pgtable.h>
61 64
62#include <linux/swapops.h>
63#include <linux/elf.h>
64
65#include "internal.h" 65#include "internal.h"
66 66
67#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
375 * 375 *
376 * The calling function must still handle the error. 376 * The calling function must still handle the error.
377 */ 377 */
378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, 378static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
379 unsigned long vaddr) 379 pte_t pte, struct page *page)
380{ 380{
381 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
382 "vm_flags = %lx, vaddr = %lx\n", 382 pud_t *pud = pud_offset(pgd, addr);
383 (long long)pte_val(pte), 383 pmd_t *pmd = pmd_offset(pud, addr);
384 (vma->vm_mm == current->mm ? current->comm : "???"), 384 struct address_space *mapping;
385 vma->vm_flags, vaddr); 385 pgoff_t index;
386 static unsigned long resume;
387 static unsigned long nr_shown;
388 static unsigned long nr_unshown;
389
390 /*
391 * Allow a burst of 60 reports, then keep quiet for that minute;
392 * or allow a steady drip of one report per second.
393 */
394 if (nr_shown == 60) {
395 if (time_before(jiffies, resume)) {
396 nr_unshown++;
397 return;
398 }
399 if (nr_unshown) {
400 printk(KERN_ALERT
401 "BUG: Bad page map: %lu messages suppressed\n",
402 nr_unshown);
403 nr_unshown = 0;
404 }
405 nr_shown = 0;
406 }
407 if (nr_shown++ == 0)
408 resume = jiffies + 60 * HZ;
409
410 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
411 index = linear_page_index(vma, addr);
412
413 printk(KERN_ALERT
414 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
415 current->comm,
416 (long long)pte_val(pte), (long long)pmd_val(*pmd));
417 if (page) {
418 printk(KERN_ALERT
419 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
420 page, (void *)page->flags, page_count(page),
421 page_mapcount(page), page->mapping, page->index);
422 }
423 printk(KERN_ALERT
424 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
425 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
426 /*
427 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
428 */
429 if (vma->vm_ops)
430 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
431 (unsigned long)vma->vm_ops->fault);
432 if (vma->vm_file && vma->vm_file->f_op)
433 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
434 (unsigned long)vma->vm_file->f_op->mmap);
386 dump_stack(); 435 dump_stack();
436 add_taint(TAINT_BAD_PAGE);
387} 437}
388 438
389static inline int is_cow_mapping(unsigned int flags) 439static inline int is_cow_mapping(unsigned int flags)
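
The new print_bad_pte() throttles its output: a burst of up to 60 reports, then silence (with a suppressed-message count) until a minute has passed since the burst began. Below is a self-contained approximation of that rate limiter using wall-clock seconds instead of jiffies; it is illustrative only and not the kernel code.

#include <stdio.h>
#include <time.h>

/* Return 1 if a report should be printed now, 0 if it should be suppressed. */
static int should_report(void)
{
        static time_t resume;
        static unsigned long nr_shown, nr_unshown;
        time_t now = time(NULL);

        if (nr_shown == 60) {
                if (now < resume) {
                        nr_unshown++;
                        return 0;
                }
                if (nr_unshown) {
                        printf("%lu messages suppressed\n", nr_unshown);
                        nr_unshown = 0;
                }
                nr_shown = 0;
        }
        if (nr_shown++ == 0)
                resume = now + 60;      /* quiet period starts with the burst */
        return 1;
}

int main(void)
{
        int i, shown = 0;

        for (i = 0; i < 200; i++)
                shown += should_report();
        printf("shown %d of 200\n", shown);
        return 0;
}
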
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
441struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 491struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
442 pte_t pte) 492 pte_t pte)
443{ 493{
444 unsigned long pfn; 494 unsigned long pfn = pte_pfn(pte);
445 495
446 if (HAVE_PTE_SPECIAL) { 496 if (HAVE_PTE_SPECIAL) {
447 if (likely(!pte_special(pte))) { 497 if (likely(!pte_special(pte)))
448 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 498 goto check_pfn;
449 return pte_page(pte); 499 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
450 } 500 print_bad_pte(vma, addr, pte, NULL);
451 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
452 return NULL; 501 return NULL;
453 } 502 }
454 503
455 /* !HAVE_PTE_SPECIAL case follows: */ 504 /* !HAVE_PTE_SPECIAL case follows: */
456 505
457 pfn = pte_pfn(pte);
458
459 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 506 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
460 if (vma->vm_flags & VM_MIXEDMAP) { 507 if (vma->vm_flags & VM_MIXEDMAP) {
461 if (!pfn_valid(pfn)) 508 if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
471 } 518 }
472 } 519 }
473 520
474 VM_BUG_ON(!pfn_valid(pfn)); 521check_pfn:
522 if (unlikely(pfn > highest_memmap_pfn)) {
523 print_bad_pte(vma, addr, pte, NULL);
524 return NULL;
525 }
475 526
476 /* 527 /*
477 * NOTE! We still have PageReserved() pages in the page tables. 528 * NOTE! We still have PageReserved() pages in the page tables.
478 *
479 * eg. VDSO mappings can cause them to exist. 529 * eg. VDSO mappings can cause them to exist.
480 */ 530 */
481out: 531out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
767 else { 817 else {
768 if (pte_dirty(ptent)) 818 if (pte_dirty(ptent))
769 set_page_dirty(page); 819 set_page_dirty(page);
770 if (pte_young(ptent)) 820 if (pte_young(ptent) &&
771 SetPageReferenced(page); 821 likely(!VM_SequentialReadHint(vma)))
822 mark_page_accessed(page);
772 file_rss--; 823 file_rss--;
773 } 824 }
774 page_remove_rmap(page, vma); 825 page_remove_rmap(page);
826 if (unlikely(page_mapcount(page) < 0))
827 print_bad_pte(vma, addr, ptent, page);
775 tlb_remove_page(tlb, page); 828 tlb_remove_page(tlb, page);
776 continue; 829 continue;
777 } 830 }
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
781 */ 834 */
782 if (unlikely(details)) 835 if (unlikely(details))
783 continue; 836 continue;
784 if (!pte_file(ptent)) 837 if (pte_file(ptent)) {
785 free_swap_and_cache(pte_to_swp_entry(ptent)); 838 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
839 print_bad_pte(vma, addr, ptent, NULL);
840 } else if
841 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
842 print_bad_pte(vma, addr, ptent, NULL);
786 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 843 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
787 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 844 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
788 845
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1153 int write = !!(flags & GUP_FLAGS_WRITE); 1210 int write = !!(flags & GUP_FLAGS_WRITE);
1154 int force = !!(flags & GUP_FLAGS_FORCE); 1211 int force = !!(flags & GUP_FLAGS_FORCE);
1155 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); 1212 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1213 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1156 1214
1157 if (len <= 0) 1215 if (len <= 0)
1158 return 0; 1216 return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1231 struct page *page; 1289 struct page *page;
1232 1290
1233 /* 1291 /*
1234 * If tsk is ooming, cut off its access to large memory 1292 * If we have a pending SIGKILL, don't keep faulting
1235 * allocations. It has a pending SIGKILL, but it can't 1293 * pages and potentially allocating memory, unless
1236 * be processed until returning to user space. 1294 * current is handling munlock--e.g., on exit. In
1295 * that case, we are not allocating memory. Rather,
1296 * we're only unlocking already resident/mapped pages.
1237 */ 1297 */
1238 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) 1298 if (unlikely(!ignore_sigkill &&
1239 return i ? i : -ENOMEM; 1299 fatal_signal_pending(current)))
1300 return i ? i : -ERESTARTSYS;
1240 1301
1241 if (write) 1302 if (write)
1242 foll_flags |= FOLL_WRITE; 1303 foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1263 * do_wp_page has broken COW when necessary, 1324 * do_wp_page has broken COW when necessary,
1264 * even if maybe_mkwrite decided not to set 1325 * even if maybe_mkwrite decided not to set
1265 * pte_write. We can thus safely do subsequent 1326 * pte_write. We can thus safely do subsequent
1266 * page lookups as if they were reads. 1327 * page lookups as if they were reads. But only
1328 * do so when looping for pte_write is futile:
1329 * in some cases userspace may also be wanting
1330 * to write to the gotten user page, which a
1331 * read fault here might prevent (a readonly
1332 * page might get reCOWed by userspace write).
1267 */ 1333 */
1268 if (ret & VM_FAULT_WRITE) 1334 if ((ret & VM_FAULT_WRITE) &&
1335 !(vma->vm_flags & VM_WRITE))
1269 foll_flags &= ~FOLL_WRITE; 1336 foll_flags &= ~FOLL_WRITE;
1270 1337
1271 cond_resched(); 1338 cond_resched();
@@ -1444,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1444 unsigned long pfn) 1511 unsigned long pfn)
1445{ 1512{
1446 int ret; 1513 int ret;
1514 pgprot_t pgprot = vma->vm_page_prot;
1447 /* 1515 /*
1448 * Technically, architectures with pte_special can avoid all these 1516 * Technically, architectures with pte_special can avoid all these
1449 * restrictions (same for remap_pfn_range). However we would like 1517 * restrictions (same for remap_pfn_range). However we would like
@@ -1458,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
1458 1526
1459 if (addr < vma->vm_start || addr >= vma->vm_end) 1527 if (addr < vma->vm_start || addr >= vma->vm_end)
1460 return -EFAULT; 1528 return -EFAULT;
1461 if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) 1529 if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
1462 return -EINVAL; 1530 return -EINVAL;
1463 1531
1464 ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); 1532 ret = insert_pfn(vma, addr, pfn, pgprot);
1465 1533
1466 if (ret) 1534 if (ret)
1467 untrack_pfn_vma(vma, pfn, PAGE_SIZE); 1535 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
@@ -1604,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
1604 1672
1605 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; 1673 vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
1606 1674
1607 err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); 1675 err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
1608 if (err) 1676 if (err) {
1677 /*
1678 * To indicate that track_pfn related cleanup is not
1679 * needed from higher level routine calling unmap_vmas
1680 */
1681 vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
1609 return -EINVAL; 1682 return -EINVAL;
1683 }
1610 1684
1611 BUG_ON(addr >= end); 1685 BUG_ON(addr >= end);
1612 pfn -= addr >> PAGE_SHIFT; 1686 pfn -= addr >> PAGE_SHIFT;
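[editorial sketch] The remap_pfn_range() hunk above passes the pgprot by reference to track_pfn_vma_new() and, if that reservation fails, clears the VM flags it had just set. For context, a minimal sketch of the usual caller, a character driver's ->mmap handler; the driver name and physical base are illustrative, not from this patch:

#include <linux/fs.h>
#include <linux/mm.h>

static const unsigned long mydrv_phys_base = 0xd0000000;	/* illustrative */

static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
{
	unsigned long size = vma->vm_end - vma->vm_start;

	/* if PAT reserve tracking fails, remap_pfn_range() now clears the
	 * VM_IO | VM_RESERVED | VM_PFNMAP flags it set before returning,
	 * so the driver needs no extra cleanup of its own */
	if (remap_pfn_range(vma, vma->vm_start,
			    mydrv_phys_base >> PAGE_SHIFT,
			    size, vma->vm_page_prot))
		return -EAGAIN;
	return 0;
}
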
@@ -1644,6 +1718,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1644 1718
1645 BUG_ON(pmd_huge(*pmd)); 1719 BUG_ON(pmd_huge(*pmd));
1646 1720
1721 arch_enter_lazy_mmu_mode();
1722
1647 token = pmd_pgtable(*pmd); 1723 token = pmd_pgtable(*pmd);
1648 1724
1649 do { 1725 do {
@@ -1652,6 +1728,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1652 break; 1728 break;
1653 } while (pte++, addr += PAGE_SIZE, addr != end); 1729 } while (pte++, addr += PAGE_SIZE, addr != end);
1654 1730
1731 arch_leave_lazy_mmu_mode();
1732
1655 if (mm != &init_mm) 1733 if (mm != &init_mm)
1656 pte_unmap_unlock(pte-1, ptl); 1734 pte_unmap_unlock(pte-1, ptl);
1657 return err; 1735 return err;
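[editorial sketch] The two hunks above bracket the whole PTE walk in apply_to_pte_range() with arch_enter/leave_lazy_mmu_mode(). A minimal sketch of the kind of caller that benefits, assuming the apply_to_page_range()/pte_fn_t interface of this kernel; the write-protect callback is illustrative (a real user would also flush the TLB afterwards):

#include <linux/mm.h>
#include <asm/pgtable.h>

static int wrprotect_pte(pte_t *ptep, pgtable_t token,
			 unsigned long addr, void *data)
{
	struct mm_struct *mm = data;

	/* runs under the PTE lock; with this change the whole loop is also
	 * inside one lazy-MMU section, so paravirt guests can batch updates */
	set_pte_at(mm, addr, ptep, pte_wrprotect(*ptep));
	return 0;
}

static int wrprotect_range(struct mm_struct *mm, unsigned long addr,
			   unsigned long size)
{
	return apply_to_page_range(mm, addr, size, wrprotect_pte, mm);
}
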
@@ -1837,10 +1915,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1837 * not dirty accountable. 1915 * not dirty accountable.
1838 */ 1916 */
1839 if (PageAnon(old_page)) { 1917 if (PageAnon(old_page)) {
1840 if (trylock_page(old_page)) { 1918 if (!trylock_page(old_page)) {
1841 reuse = can_share_swap_page(old_page); 1919 page_cache_get(old_page);
1842 unlock_page(old_page); 1920 pte_unmap_unlock(page_table, ptl);
1921 lock_page(old_page);
1922 page_table = pte_offset_map_lock(mm, pmd, address,
1923 &ptl);
1924 if (!pte_same(*page_table, orig_pte)) {
1925 unlock_page(old_page);
1926 page_cache_release(old_page);
1927 goto unlock;
1928 }
1929 page_cache_release(old_page);
1843 } 1930 }
1931 reuse = reuse_swap_page(old_page);
1932 unlock_page(old_page);
1844 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 1933 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1845 (VM_WRITE|VM_SHARED))) { 1934 (VM_WRITE|VM_SHARED))) {
1846 /* 1935 /*
@@ -1910,7 +1999,7 @@ gotten:
1910 * Don't let another task, with possibly unlocked vma, 1999 * Don't let another task, with possibly unlocked vma,
1911 * keep the mlocked page. 2000 * keep the mlocked page.
1912 */ 2001 */
1913 if (vma->vm_flags & VM_LOCKED) { 2002 if ((vma->vm_flags & VM_LOCKED) && old_page) {
1914 lock_page(old_page); /* for LRU manipulation */ 2003 lock_page(old_page); /* for LRU manipulation */
1915 clear_page_mlock(old_page); 2004 clear_page_mlock(old_page);
1916 unlock_page(old_page); 2005 unlock_page(old_page);
@@ -1918,7 +2007,7 @@ gotten:
1918 cow_user_page(new_page, old_page, address, vma); 2007 cow_user_page(new_page, old_page, address, vma);
1919 __SetPageUptodate(new_page); 2008 __SetPageUptodate(new_page);
1920 2009
1921 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) 2010 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
1922 goto oom_free_new; 2011 goto oom_free_new;
1923 2012
1924 /* 2013 /*
@@ -1943,11 +2032,7 @@ gotten:
1943 * thread doing COW. 2032 * thread doing COW.
1944 */ 2033 */
1945 ptep_clear_flush_notify(vma, address, page_table); 2034 ptep_clear_flush_notify(vma, address, page_table);
1946 SetPageSwapBacked(new_page);
1947 lru_cache_add_active_or_unevictable(new_page, vma);
1948 page_add_new_anon_rmap(new_page, vma, address); 2035 page_add_new_anon_rmap(new_page, vma, address);
1949
1950//TODO: is this safe? do_anonymous_page() does it this way.
1951 set_pte_at(mm, address, page_table, entry); 2036 set_pte_at(mm, address, page_table, entry);
1952 update_mmu_cache(vma, address, entry); 2037 update_mmu_cache(vma, address, entry);
1953 if (old_page) { 2038 if (old_page) {
@@ -1973,7 +2058,7 @@ gotten:
1973 * mapcount is visible. So transitively, TLBs to 2058 * mapcount is visible. So transitively, TLBs to
1974 * old page will be flushed before it can be reused. 2059 * old page will be flushed before it can be reused.
1975 */ 2060 */
1976 page_remove_rmap(old_page, vma); 2061 page_remove_rmap(old_page);
1977 } 2062 }
1978 2063
1979 /* Free the old page.. */ 2064 /* Free the old page.. */
@@ -2266,7 +2351,7 @@ int vmtruncate(struct inode * inode, loff_t offset)
2266 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); 2351 unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
2267 } 2352 }
2268 2353
2269 if (inode->i_op && inode->i_op->truncate) 2354 if (inode->i_op->truncate)
2270 inode->i_op->truncate(inode); 2355 inode->i_op->truncate(inode);
2271 return 0; 2356 return 0;
2272 2357
@@ -2286,7 +2371,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
2286 * a way to truncate a range of blocks (punch a hole) - 2371 * a way to truncate a range of blocks (punch a hole) -
2287 * we should return failure right now. 2372 * we should return failure right now.
2288 */ 2373 */
2289 if (!inode->i_op || !inode->i_op->truncate_range) 2374 if (!inode->i_op->truncate_range)
2290 return -ENOSYS; 2375 return -ENOSYS;
2291 2376
2292 mutex_lock(&inode->i_mutex); 2377 mutex_lock(&inode->i_mutex);
@@ -2314,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2314 struct page *page; 2399 struct page *page;
2315 swp_entry_t entry; 2400 swp_entry_t entry;
2316 pte_t pte; 2401 pte_t pte;
2402 struct mem_cgroup *ptr = NULL;
2317 int ret = 0; 2403 int ret = 0;
2318 2404
2319 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2405 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2352,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2352 lock_page(page); 2438 lock_page(page);
2353 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2439 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2354 2440
2355 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2441 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2356 ret = VM_FAULT_OOM; 2442 ret = VM_FAULT_OOM;
2357 unlock_page(page); 2443 unlock_page(page);
2358 goto out; 2444 goto out;
@@ -2370,22 +2456,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2370 goto out_nomap; 2456 goto out_nomap;
2371 } 2457 }
2372 2458
2373 /* The page isn't present yet, go ahead with the fault. */ 2459 /*
2460 * The page isn't present yet, go ahead with the fault.
2461 *
2462 * Be careful about the sequence of operations here.
2463 * To get its accounting right, reuse_swap_page() must be called
2464 * while the page is counted on swap but not yet in mapcount i.e.
2465 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2466 * must be called after the swap_free(), or it will never succeed.
2467 * Because delete_from_swap_page() may be called by reuse_swap_page(),
2468 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2469 * in page->private. In this case, a record in swap_cgroup is silently
2470 * discarded at swap_free().
2471 */
2374 2472
2375 inc_mm_counter(mm, anon_rss); 2473 inc_mm_counter(mm, anon_rss);
2376 pte = mk_pte(page, vma->vm_page_prot); 2474 pte = mk_pte(page, vma->vm_page_prot);
2377 if (write_access && can_share_swap_page(page)) { 2475 if (write_access && reuse_swap_page(page)) {
2378 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2476 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2379 write_access = 0; 2477 write_access = 0;
2380 } 2478 }
2381
2382 flush_icache_page(vma, page); 2479 flush_icache_page(vma, page);
2383 set_pte_at(mm, address, page_table, pte); 2480 set_pte_at(mm, address, page_table, pte);
2384 page_add_anon_rmap(page, vma, address); 2481 page_add_anon_rmap(page, vma, address);
2482 /* It's better to call commit-charge after rmap is established */
2483 mem_cgroup_commit_charge_swapin(page, ptr);
2385 2484
2386 swap_free(entry); 2485 swap_free(entry);
2387 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2486 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2388 remove_exclusive_swap_page(page); 2487 try_to_free_swap(page);
2389 unlock_page(page); 2488 unlock_page(page);
2390 2489
2391 if (write_access) { 2490 if (write_access) {
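[editorial sketch] The ordering constraint spelled out in the new comment is easy to lose in the diff noise; condensed from the hunk above, the swap-in path now has to sequence its calls like this:

	if (write_access && reuse_swap_page(page))	/* while counted on swap, before rmap */
		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
	set_pte_at(mm, address, page_table, pte);
	page_add_anon_rmap(page, vma, address);
	mem_cgroup_commit_charge_swapin(page, ptr);	/* only once rmap is established */
	swap_free(entry);				/* drops the swap reference ... */
	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
		try_to_free_swap(page);			/* ... which this relies on to succeed */
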
@@ -2402,7 +2501,7 @@ unlock:
2402out: 2501out:
2403 return ret; 2502 return ret;
2404out_nomap: 2503out_nomap:
2405 mem_cgroup_uncharge_page(page); 2504 mem_cgroup_cancel_charge_swapin(ptr);
2406 pte_unmap_unlock(page_table, ptl); 2505 pte_unmap_unlock(page_table, ptl);
2407 unlock_page(page); 2506 unlock_page(page);
2408 page_cache_release(page); 2507 page_cache_release(page);
@@ -2432,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2432 goto oom; 2531 goto oom;
2433 __SetPageUptodate(page); 2532 __SetPageUptodate(page);
2434 2533
2435 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) 2534 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2436 goto oom_free_page; 2535 goto oom_free_page;
2437 2536
2438 entry = mk_pte(page, vma->vm_page_prot); 2537 entry = mk_pte(page, vma->vm_page_prot);
@@ -2442,8 +2541,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2442 if (!pte_none(*page_table)) 2541 if (!pte_none(*page_table))
2443 goto release; 2542 goto release;
2444 inc_mm_counter(mm, anon_rss); 2543 inc_mm_counter(mm, anon_rss);
2445 SetPageSwapBacked(page);
2446 lru_cache_add_active_or_unevictable(page, vma);
2447 page_add_new_anon_rmap(page, vma, address); 2544 page_add_new_anon_rmap(page, vma, address);
2448 set_pte_at(mm, address, page_table, entry); 2545 set_pte_at(mm, address, page_table, entry);
2449 2546
@@ -2525,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2525 ret = VM_FAULT_OOM; 2622 ret = VM_FAULT_OOM;
2526 goto out; 2623 goto out;
2527 } 2624 }
2528 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2625 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
2529 ret = VM_FAULT_OOM; 2626 ret = VM_FAULT_OOM;
2530 page_cache_release(page); 2627 page_cache_release(page);
2531 goto out; 2628 goto out;
@@ -2591,8 +2688,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2591 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2688 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2592 if (anon) { 2689 if (anon) {
2593 inc_mm_counter(mm, anon_rss); 2690 inc_mm_counter(mm, anon_rss);
2594 SetPageSwapBacked(page);
2595 lru_cache_add_active_or_unevictable(page, vma);
2596 page_add_new_anon_rmap(page, vma, address); 2691 page_add_new_anon_rmap(page, vma, address);
2597 } else { 2692 } else {
2598 inc_mm_counter(mm, file_rss); 2693 inc_mm_counter(mm, file_rss);
@@ -2602,7 +2697,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2602 get_page(dirty_page); 2697 get_page(dirty_page);
2603 } 2698 }
2604 } 2699 }
2605//TODO: is this safe? do_anonymous_page() does it this way.
2606 set_pte_at(mm, address, page_table, entry); 2700 set_pte_at(mm, address, page_table, entry);
2607 2701
2608 /* no need to invalidate: a not-present page won't be cached */ 2702 /* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2760,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2666 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2760 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2667 return 0; 2761 return 0;
2668 2762
2669 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || 2763 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2670 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2671 /* 2764 /*
2672 * Page table corrupted: show pte and kill process. 2765 * Page table corrupted: show pte and kill process.
2673 */ 2766 */
2674 print_bad_pte(vma, orig_pte, address); 2767 print_bad_pte(vma, address, orig_pte, NULL);
2675 return VM_FAULT_OOM; 2768 return VM_FAULT_OOM;
2676 } 2769 }
2677 2770
@@ -2953,7 +3046,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2953{ 3046{
2954 resource_size_t phys_addr; 3047 resource_size_t phys_addr;
2955 unsigned long prot = 0; 3048 unsigned long prot = 0;
2956 void *maddr; 3049 void __iomem *maddr;
2957 int offset = addr & (PAGE_SIZE-1); 3050 int offset = addr & (PAGE_SIZE-1);
2958 3051
2959 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3052 if (follow_phys(vma, addr, write, &prot, &phys_addr))
@@ -3079,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
3079#ifdef CONFIG_PROVE_LOCKING 3172#ifdef CONFIG_PROVE_LOCKING
3080void might_fault(void) 3173void might_fault(void)
3081{ 3174{
3175 /*
3176 * Some code (nfs/sunrpc) uses socket ops on kernel memory while
3177 * holding the mmap_sem, this is safe because kernel memory doesn't
3178 * get paged out, therefore we'll never actually fault, and the
3179 * below annotations will generate false positives.
3180 */
3181 if (segment_eq(get_fs(), KERNEL_DS))
3182 return;
3183
3082 might_sleep(); 3184 might_sleep();
3083 /* 3185 /*
3084 * it would be nicer only to annotate paths which are not under 3186 * it would be nicer only to annotate paths which are not under
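[editorial sketch] The new KERNEL_DS test in might_fault() is aimed at callers such as nfs/sunrpc that widen the address limit and then run "user" accessors on kernel buffers while mmap_sem may be held. A rough sketch of that caller pattern, assuming a hypothetical send helper (illustrative, not from this patch):

#include <asm/uaccess.h>

static int send_from_kernel_buffer(struct socket *sock, void *buf, size_t len)
{
	mm_segment_t oldfs = get_fs();
	int err;

	set_fs(KERNEL_DS);	/* "user" accessors now accept kernel addresses */
	err = illustrative_sendmsg(sock, buf, len);	/* hypothetical helper that
							 * ends up in copy_from_user()
							 * and hence in might_fault() */
	set_fs(oldfs);
	return err;
}
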
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17371185468..c083cf5fd6df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
216 return 0; 216 return 0;
217} 217}
218 218
219static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) 219static int __meminit __add_section(int nid, struct zone *zone,
220 unsigned long phys_start_pfn)
220{ 221{
221 int nr_pages = PAGES_PER_SECTION; 222 int nr_pages = PAGES_PER_SECTION;
222 int ret; 223 int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
234 if (ret < 0) 235 if (ret < 0)
235 return ret; 236 return ret;
236 237
237 return register_new_memory(__pfn_to_section(phys_start_pfn)); 238 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
238} 239}
239 240
240#ifdef CONFIG_SPARSEMEM_VMEMMAP 241#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
273 * call this function after deciding the zone to which to 274 * call this function after deciding the zone to which to
274 * add the new pages. 275 * add the new pages.
275 */ 276 */
276int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, 277int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
277 unsigned long nr_pages) 278 unsigned long nr_pages)
278{ 279{
279 unsigned long i; 280 unsigned long i;
280 int err = 0; 281 int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
284 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 285 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
285 286
286 for (i = start_sec; i <= end_sec; i++) { 287 for (i = start_sec; i <= end_sec; i++) {
287 err = __add_section(zone, i << PFN_SECTION_SHIFT); 288 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
288 289
289 /* 290 /*
290 * EEXIST is finally dealt with by ioresource collision 291 * EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
626} 627}
627 628
628static struct page * 629static struct page *
629hotremove_migrate_alloc(struct page *page, 630hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
630 unsigned long private,
631 int **x)
632{ 631{
633 /* This should be improoooooved!! */ 632 /* This should be improooooved!! */
634 return alloc_page(GFP_HIGHUSER_PAGECACHE); 633 return alloc_page(GFP_HIGHUSER_MOVABLE);
635} 634}
636 635
637
638#define NR_OFFLINE_AT_ONCE_PAGES (256) 636#define NR_OFFLINE_AT_ONCE_PAGES (256)
639static int 637static int
640do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 638do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
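[editorial sketch] With __add_pages() and __add_section() now taking a node id, the arch hook is expected to pass it through so the new memory sections can be registered under the right node in sysfs. A sketch of what an arch_add_memory() implementation looks like after this change (zone selection simplified; not taken from this patch):

#include <linux/memory_hotplug.h>
#include <linux/mmzone.h>

int arch_add_memory(int nid, u64 start, u64 size)
{
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
	unsigned long start_pfn = start >> PAGE_SHIFT;
	unsigned long nr_pages = size >> PAGE_SHIFT;

	return __add_pages(nid, zone, start_pfn, nr_pages);
}
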
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e412ffa8e52e..3eb4a6fdc043 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; 1068 return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1069} 1069}
1070 1070
1071asmlinkage long sys_mbind(unsigned long start, unsigned long len, 1071SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1072 unsigned long mode, 1072 unsigned long, mode, unsigned long __user *, nmask,
1073 unsigned long __user *nmask, unsigned long maxnode, 1073 unsigned long, maxnode, unsigned, flags)
1074 unsigned flags)
1075{ 1074{
1076 nodemask_t nodes; 1075 nodemask_t nodes;
1077 int err; 1076 int err;
@@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
1091} 1090}
1092 1091
1093/* Set the process memory policy */ 1092/* Set the process memory policy */
1094asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, 1093SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1095 unsigned long maxnode) 1094 unsigned long, maxnode)
1096{ 1095{
1097 int err; 1096 int err;
1098 nodemask_t nodes; 1097 nodemask_t nodes;
@@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
1110 return do_set_mempolicy(mode, flags, &nodes); 1109 return do_set_mempolicy(mode, flags, &nodes);
1111} 1110}
1112 1111
1113asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, 1112SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1114 const unsigned long __user *old_nodes, 1113 const unsigned long __user *, old_nodes,
1115 const unsigned long __user *new_nodes) 1114 const unsigned long __user *, new_nodes)
1116{ 1115{
1117 const struct cred *cred = current_cred(), *tcred; 1116 const struct cred *cred = current_cred(), *tcred;
1118 struct mm_struct *mm; 1117 struct mm_struct *mm;
@@ -1185,10 +1184,9 @@ out:
1185 1184
1186 1185
1187/* Retrieve NUMA policy */ 1186/* Retrieve NUMA policy */
1188asmlinkage long sys_get_mempolicy(int __user *policy, 1187SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1189 unsigned long __user *nmask, 1188 unsigned long __user *, nmask, unsigned long, maxnode,
1190 unsigned long maxnode, 1189 unsigned long, addr, unsigned long, flags)
1191 unsigned long addr, unsigned long flags)
1192{ 1190{
1193 int err; 1191 int err;
1194 int uninitialized_var(pval); 1192 int uninitialized_var(pval);
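[editorial sketch] The SYSCALL_DEFINEn conversions above only change how the entry points are declared; the user-visible ABI is unchanged. For reference, a small userspace program driving the converted mbind() directly via syscall(2), assuming node 0 is online and taking the syscall number from <sys/syscall.h>:

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_BIND 2	/* value from <numaif.h>, repeated here to avoid libnuma */

int main(void)
{
	size_t len = 16 * 4096;
	unsigned long nodemask = 1UL;		/* node 0 only */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	if (syscall(SYS_mbind, p, len, MPOL_BIND, &nodemask,
		    8 * sizeof(nodemask) + 1, 0))
		perror("mbind");
	memset(p, 0, len);	/* subsequent faults allocate from node 0 */
	return 0;
}
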
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab8c08b..2bb4e1d63520 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
122 goto out; 122 goto out;
123 123
124 /*
125 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
126 * Failure is not an option here: we're now expected to remove every
127 * migration pte, and will cause crashes otherwise. Normally this
128 * is not an issue: mem_cgroup_prepare_migration bumped up the old
129 * page_cgroup count for safety, that's now attached to the new page,
130 * so this charge should just be another incrementation of the count,
131 * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
132 * there's been a force_empty, those reference counts may no longer
133 * be reliable, and this charge can actually fail: oh well, we don't
134 * make the situation any worse by proceeding as if it had succeeded.
135 */
136 mem_cgroup_charge(new, mm, GFP_ATOMIC);
137
138 get_page(new); 124 get_page(new);
139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 125 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
140 if (is_write_migration_entry(entry)) 126 if (is_write_migration_entry(entry))
@@ -300,12 +286,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
300 * Now we know that no one else is looking at the page. 286 * Now we know that no one else is looking at the page.
301 */ 287 */
302 get_page(newpage); /* add cache reference */ 288 get_page(newpage); /* add cache reference */
303#ifdef CONFIG_SWAP
304 if (PageSwapCache(page)) { 289 if (PageSwapCache(page)) {
305 SetPageSwapCache(newpage); 290 SetPageSwapCache(newpage);
306 set_page_private(newpage, page_private(page)); 291 set_page_private(newpage, page_private(page));
307 } 292 }
308#endif
309 293
310 radix_tree_replace_slot(pslot, newpage); 294 radix_tree_replace_slot(pslot, newpage);
311 295
@@ -373,18 +357,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
373 357
374 mlock_migrate_page(newpage, page); 358 mlock_migrate_page(newpage, page);
375 359
376#ifdef CONFIG_SWAP
377 ClearPageSwapCache(page); 360 ClearPageSwapCache(page);
378#endif
379 ClearPagePrivate(page); 361 ClearPagePrivate(page);
380 set_page_private(page, 0); 362 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */ 363 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page); 364 anon = PageAnon(page);
383 page->mapping = NULL; 365 page->mapping = NULL;
384 366
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
388 /* 367 /*
389 * If any waiters have accumulated on the new page then 368 * If any waiters have accumulated on the new page then
390 * wake them up. 369 * wake them up.
@@ -618,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
618 struct page *newpage = get_new_page(page, private, &result); 597 struct page *newpage = get_new_page(page, private, &result);
619 int rcu_locked = 0; 598 int rcu_locked = 0;
620 int charge = 0; 599 int charge = 0;
600 struct mem_cgroup *mem;
621 601
622 if (!newpage) 602 if (!newpage)
623 return -ENOMEM; 603 return -ENOMEM;
@@ -627,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
627 goto move_newpage; 607 goto move_newpage;
628 } 608 }
629 609
630 charge = mem_cgroup_prepare_migration(page, newpage);
631 if (charge == -ENOMEM) {
632 rc = -ENOMEM;
633 goto move_newpage;
634 }
635 /* prepare cgroup just returns 0 or -ENOMEM */ 610 /* prepare cgroup just returns 0 or -ENOMEM */
636 BUG_ON(charge);
637
638 rc = -EAGAIN; 611 rc = -EAGAIN;
612
639 if (!trylock_page(page)) { 613 if (!trylock_page(page)) {
640 if (!force) 614 if (!force)
641 goto move_newpage; 615 goto move_newpage;
642 lock_page(page); 616 lock_page(page);
643 } 617 }
644 618
619 /* charge against new page */
620 charge = mem_cgroup_prepare_migration(page, &mem);
621 if (charge == -ENOMEM) {
622 rc = -ENOMEM;
623 goto unlock;
624 }
625 BUG_ON(charge);
626
645 if (PageWriteback(page)) { 627 if (PageWriteback(page)) {
646 if (!force) 628 if (!force)
647 goto unlock; 629 goto uncharge;
648 wait_on_page_writeback(page); 630 wait_on_page_writeback(page);
649 } 631 }
650 /* 632 /*
@@ -697,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
697rcu_unlock: 679rcu_unlock:
698 if (rcu_locked) 680 if (rcu_locked)
699 rcu_read_unlock(); 681 rcu_read_unlock();
700 682uncharge:
683 if (!charge)
684 mem_cgroup_end_migration(mem, page, newpage);
701unlock: 685unlock:
702 unlock_page(page); 686 unlock_page(page);
703 687
@@ -713,8 +697,6 @@ unlock:
713 } 697 }
714 698
715move_newpage: 699move_newpage:
716 if (!charge)
717 mem_cgroup_end_migration(newpage);
718 700
719 /* 701 /*
720 * Move the new page to the LRU. If migration was not successful 702 * Move the new page to the LRU. If migration was not successful
@@ -848,12 +830,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
848 struct vm_area_struct *vma; 830 struct vm_area_struct *vma;
849 struct page *page; 831 struct page *page;
850 832
851 /*
852 * A valid page pointer that will not match any of the
853 * pages that will be moved.
854 */
855 pp->page = ZERO_PAGE(0);
856
857 err = -EFAULT; 833 err = -EFAULT;
858 vma = find_vma(mm, pp->addr); 834 vma = find_vma(mm, pp->addr);
859 if (!vma || !vma_migratable(vma)) 835 if (!vma || !vma_migratable(vma))
@@ -919,41 +895,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
919 const int __user *nodes, 895 const int __user *nodes,
920 int __user *status, int flags) 896 int __user *status, int flags)
921{ 897{
922 struct page_to_node *pm = NULL; 898 struct page_to_node *pm;
923 nodemask_t task_nodes; 899 nodemask_t task_nodes;
924 int err = 0; 900 unsigned long chunk_nr_pages;
925 int i; 901 unsigned long chunk_start;
902 int err;
926 903
927 task_nodes = cpuset_mems_allowed(task); 904 task_nodes = cpuset_mems_allowed(task);
928 905
929 /* Limit nr_pages so that the multiplication may not overflow */ 906 err = -ENOMEM;
930 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { 907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
931 err = -E2BIG; 908 if (!pm)
932 goto out;
933 }
934
935 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
936 if (!pm) {
937 err = -ENOMEM;
938 goto out; 909 goto out;
939 }
940
941 /* 910 /*
942 * Get parameters from user space and initialize the pm 911 * Store a chunk of page_to_node array in a page,
943 * array. Return various errors if the user did something wrong. 912 * but keep the last one as a marker
944 */ 913 */
945 for (i = 0; i < nr_pages; i++) { 914 chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
946 const void __user *p;
947 915
948 err = -EFAULT; 916 for (chunk_start = 0;
949 if (get_user(p, pages + i)) 917 chunk_start < nr_pages;
950 goto out_pm; 918 chunk_start += chunk_nr_pages) {
919 int j;
920
921 if (chunk_start + chunk_nr_pages > nr_pages)
922 chunk_nr_pages = nr_pages - chunk_start;
951 923
952 pm[i].addr = (unsigned long)p; 924 /* fill the chunk pm with addrs and nodes from user-space */
953 if (nodes) { 925 for (j = 0; j < chunk_nr_pages; j++) {
926 const void __user *p;
954 int node; 927 int node;
955 928
956 if (get_user(node, nodes + i)) 929 err = -EFAULT;
930 if (get_user(p, pages + j + chunk_start))
931 goto out_pm;
932 pm[j].addr = (unsigned long) p;
933
934 if (get_user(node, nodes + j + chunk_start))
957 goto out_pm; 935 goto out_pm;
958 936
959 err = -ENODEV; 937 err = -ENODEV;
@@ -964,22 +942,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
964 if (!node_isset(node, task_nodes)) 942 if (!node_isset(node, task_nodes))
965 goto out_pm; 943 goto out_pm;
966 944
967 pm[i].node = node; 945 pm[j].node = node;
968 } else 946 }
969 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 947
970 } 948 /* End marker for this chunk */
971 /* End marker */ 949 pm[chunk_nr_pages].node = MAX_NUMNODES;
972 pm[nr_pages].node = MAX_NUMNODES; 950
951 /* Migrate this chunk */
952 err = do_move_page_to_node_array(mm, pm,
953 flags & MPOL_MF_MOVE_ALL);
954 if (err < 0)
955 goto out_pm;
973 956
974 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
975 if (err >= 0)
976 /* Return status information */ 957 /* Return status information */
977 for (i = 0; i < nr_pages; i++) 958 for (j = 0; j < chunk_nr_pages; j++)
978 if (put_user(pm[i].status, status + i)) 959 if (put_user(pm[j].status, status + j + chunk_start)) {
979 err = -EFAULT; 960 err = -EFAULT;
961 goto out_pm;
962 }
963 }
964 err = 0;
980 965
981out_pm: 966out_pm:
982 vfree(pm); 967 free_page((unsigned long)pm);
983out: 968out:
984 return err; 969 return err;
985} 970}
@@ -1070,10 +1055,10 @@ out:
1070 * Move a list of pages in the address space of the currently executing 1055 * Move a list of pages in the address space of the currently executing
1071 * process. 1056 * process.
1072 */ 1057 */
1073asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, 1058SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
1074 const void __user * __user *pages, 1059 const void __user * __user *, pages,
1075 const int __user *nodes, 1060 const int __user *, nodes,
1076 int __user *status, int flags) 1061 int __user *, status, int, flags)
1077{ 1062{
1078 const struct cred *cred = current_cred(), *tcred; 1063 const struct cred *cred = current_cred(), *tcred;
1079 struct task_struct *task; 1064 struct task_struct *task;
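[editorial sketch] do_pages_move() now walks the request in page-sized chunks of the page_to_node array instead of one large vmalloc(); again nothing changes for userspace. A small program exercising the converted move_pages() syscall by moving one of its own pages to node 0 (node assumed online):

#include <stdio.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define MPOL_MF_MOVE (1 << 1)	/* value from <numaif.h>, repeated to avoid libnuma */

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	void *page = mmap(NULL, psz, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *pages[1];
	int nodes[1] = { 0 };		/* target node 0 */
	int status[1] = { -1 };

	if (page == MAP_FAILED)
		return 1;
	*(volatile char *)page = 0;	/* fault the page in first */
	pages[0] = page;
	if (syscall(SYS_move_pages, 0, 1UL, pages, nodes, status, MPOL_MF_MOVE))
		perror("move_pages");
	else
		printf("page 0: status %d\n", status[0]);
	return 0;
}
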
diff --git a/mm/mincore.c b/mm/mincore.c
index 5178800bc129..8cb508f84ea4 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -177,8 +177,8 @@ none_mapped:
177 * mapped 177 * mapped
178 * -EAGAIN - A kernel resource was temporarily unavailable. 178 * -EAGAIN - A kernel resource was temporarily unavailable.
179 */ 179 */
180asmlinkage long sys_mincore(unsigned long start, size_t len, 180SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
181 unsigned char __user * vec) 181 unsigned char __user *, vec)
182{ 182{
183 long retval; 183 long retval;
184 unsigned long pages; 184 unsigned long pages;
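[editorial sketch] The mincore() conversion is mechanical; from userspace the call is unchanged. A tiny example of its typical use, probing which pages of an anonymous mapping are resident after touching only the first one:

#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	size_t len = 8 * psz;
	unsigned char vec[8];
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	int i;

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* touch only the first page */
	if (mincore(p, len, vec) == 0)
		for (i = 0; i < 8; i++)
			printf("page %d: %s\n", i,
			       vec[i] & 1 ? "resident" : "not resident");
	return 0;
}
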
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..028ec482fdd4 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
173 (atomic_read(&mm->mm_users) != 0)); 173 (atomic_read(&mm->mm_users) != 0));
174 174
175 /* 175 /*
176 * mlock: don't page populate if page has PROT_NONE permission. 176 * mlock: don't page populate if vma has PROT_NONE permission.
177 * munlock: the pages always do munlock althrough 177 * munlock: always do munlock although the vma has PROT_NONE
178 * its has PROT_NONE permission. 178 * permission, or SIGKILL is pending.
179 */ 179 */
180 if (!mlock) 180 if (!mlock)
181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; 181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
182 GUP_FLAGS_IGNORE_SIGKILL;
182 183
183 if (vma->vm_flags & VM_WRITE) 184 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 185 gup_flags |= GUP_FLAGS_WRITE;
@@ -293,14 +294,10 @@ static inline int __mlock_posix_error_return(long retval)
293 * 294 *
294 * return number of pages [> 0] to be removed from locked_vm on success 295 * return number of pages [> 0] to be removed from locked_vm on success
295 * of "special" vmas. 296 * of "special" vmas.
296 *
297 * return negative error if vma spanning @start-@range disappears while
298 * mmap semaphore is dropped. Unlikely?
299 */ 297 */
300long mlock_vma_pages_range(struct vm_area_struct *vma, 298long mlock_vma_pages_range(struct vm_area_struct *vma,
301 unsigned long start, unsigned long end) 299 unsigned long start, unsigned long end)
302{ 300{
303 struct mm_struct *mm = vma->vm_mm;
304 int nr_pages = (end - start) / PAGE_SIZE; 301 int nr_pages = (end - start) / PAGE_SIZE;
305 BUG_ON(!(vma->vm_flags & VM_LOCKED)); 302 BUG_ON(!(vma->vm_flags & VM_LOCKED));
306 303
@@ -313,20 +310,8 @@ long mlock_vma_pages_range(struct vm_area_struct *vma,
313 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || 310 if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) ||
314 is_vm_hugetlb_page(vma) || 311 is_vm_hugetlb_page(vma) ||
315 vma == get_gate_vma(current))) { 312 vma == get_gate_vma(current))) {
316 long error;
317 downgrade_write(&mm->mmap_sem);
318
319 error = __mlock_vma_pages_range(vma, start, end, 1);
320 313
321 up_read(&mm->mmap_sem); 314 return __mlock_vma_pages_range(vma, start, end, 1);
322 /* vma can change or disappear */
323 down_write(&mm->mmap_sem);
324 vma = find_vma(mm, start);
325 /* non-NULL vma must contain @start, but need to check @end */
326 if (!vma || end > vma->vm_end)
327 return -ENOMEM;
328
329 return 0; /* hide other errors from mmap(), et al */
330 } 315 }
331 316
332 /* 317 /*
@@ -437,41 +422,14 @@ success:
437 vma->vm_flags = newflags; 422 vma->vm_flags = newflags;
438 423
439 if (lock) { 424 if (lock) {
440 /*
441 * mmap_sem is currently held for write. Downgrade the write
442 * lock to a read lock so that other faults, mmap scans, ...
443 * while we fault in all pages.
444 */
445 downgrade_write(&mm->mmap_sem);
446
447 ret = __mlock_vma_pages_range(vma, start, end, 1); 425 ret = __mlock_vma_pages_range(vma, start, end, 1);
448 426
449 /* 427 if (ret > 0) {
450 * Need to reacquire mmap sem in write mode, as our callers
451 * expect this. We have no support for atomically upgrading
452 * a sem to write, so we need to check for ranges while sem
453 * is unlocked.
454 */
455 up_read(&mm->mmap_sem);
456 /* vma can change or disappear */
457 down_write(&mm->mmap_sem);
458 *prev = find_vma(mm, start);
459 /* non-NULL *prev must contain @start, but need to check @end */
460 if (!(*prev) || end > (*prev)->vm_end)
461 ret = -ENOMEM;
462 else if (ret > 0) {
463 mm->locked_vm -= ret; 428 mm->locked_vm -= ret;
464 ret = 0; 429 ret = 0;
465 } else 430 } else
466 ret = __mlock_posix_error_return(ret); /* translate if needed */ 431 ret = __mlock_posix_error_return(ret); /* translate if needed */
467 } else { 432 } else {
468 /*
469 * TODO: for unlocking, pages will already be resident, so
470 * we don't need to wait for allocations/reclaim/pagein, ...
471 * However, unlocking a very large region can still take a
472 * while. Should we downgrade the semaphore for both lock
473 * AND unlock ?
474 */
475 __mlock_vma_pages_range(vma, start, end, 0); 433 __mlock_vma_pages_range(vma, start, end, 0);
476 } 434 }
477 435
@@ -529,7 +487,7 @@ static int do_mlock(unsigned long start, size_t len, int on)
529 return error; 487 return error;
530} 488}
531 489
532asmlinkage long sys_mlock(unsigned long start, size_t len) 490SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
533{ 491{
534 unsigned long locked; 492 unsigned long locked;
535 unsigned long lock_limit; 493 unsigned long lock_limit;
@@ -557,7 +515,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len)
557 return error; 515 return error;
558} 516}
559 517
560asmlinkage long sys_munlock(unsigned long start, size_t len) 518SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
561{ 519{
562 int ret; 520 int ret;
563 521
@@ -594,7 +552,7 @@ out:
594 return 0; 552 return 0;
595} 553}
596 554
597asmlinkage long sys_mlockall(int flags) 555SYSCALL_DEFINE1(mlockall, int, flags)
598{ 556{
599 unsigned long lock_limit; 557 unsigned long lock_limit;
600 int ret = -EINVAL; 558 int ret = -EINVAL;
@@ -622,7 +580,7 @@ out:
622 return ret; 580 return ret;
623} 581}
624 582
625asmlinkage long sys_munlockall(void) 583SYSCALL_DEFINE0(munlockall)
626{ 584{
627 int ret; 585 int ret;
628 586
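[editorial sketch] The mlock.c changes above simplify the kernel side (no more mmap_sem downgrade dance) without touching the user ABI. For reference, the classic mlock()/munlock() usage pattern those entry points serve:

#include <stdio.h>
#include <sys/mman.h>

static char buf[1 << 16];

int main(void)
{
	if (mlock(buf, sizeof(buf))) {
		perror("mlock");	/* commonly RLIMIT_MEMLOCK: ENOMEM or EPERM */
		return 1;
	}
	/* ... work on memory that must never be paged out ... */
	if (munlock(buf, sizeof(buf)))
		perror("munlock");
	return 0;
}
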
diff --git a/mm/mmap.c b/mm/mmap.c
index c3647f3b0621..3b3ed0bb9fdb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Written by obz. 4 * Written by obz.
5 * 5 *
6 * Address space accounting code <alan@redhat.com> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 */ 7 */
8 8
9#include <linux/slab.h> 9#include <linux/slab.h>
@@ -246,7 +246,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
246 return next; 246 return next;
247} 247}
248 248
249asmlinkage unsigned long sys_brk(unsigned long brk) 249SYSCALL_DEFINE1(brk, unsigned long, brk)
250{ 250{
251 unsigned long rlim, retval; 251 unsigned long rlim, retval;
252 unsigned long newbrk, oldbrk; 252 unsigned long newbrk, oldbrk;
@@ -414,7 +414,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
414 414
415static void __vma_link_file(struct vm_area_struct *vma) 415static void __vma_link_file(struct vm_area_struct *vma)
416{ 416{
417 struct file * file; 417 struct file *file;
418 418
419 file = vma->vm_file; 419 file = vma->vm_file;
420 if (file) { 420 if (file) {
@@ -475,11 +475,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
475 * insert vm structure into list and rbtree and anon_vma, 475 * insert vm structure into list and rbtree and anon_vma,
476 * but it has already been inserted into prio_tree earlier. 476 * but it has already been inserted into prio_tree earlier.
477 */ 477 */
478static void 478static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
479__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
480{ 479{
481 struct vm_area_struct * __vma, * prev; 480 struct vm_area_struct *__vma, *prev;
482 struct rb_node ** rb_link, * rb_parent; 481 struct rb_node **rb_link, *rb_parent;
483 482
484 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 483 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
485 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 484 BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -660,6 +659,9 @@ again: remove_next = 1 + (end > next->vm_end);
660 validate_mm(mm); 659 validate_mm(mm);
661} 660}
662 661
662/* Flags that can be inherited from an existing mapping when merging */
663#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR)
664
663/* 665/*
664 * If the vma has a ->close operation then the driver probably needs to release 666 * If the vma has a ->close operation then the driver probably needs to release
665 * per-vma resources, so we don't attempt to merge those. 667 * per-vma resources, so we don't attempt to merge those.
@@ -667,7 +669,7 @@ again: remove_next = 1 + (end > next->vm_end);
667static inline int is_mergeable_vma(struct vm_area_struct *vma, 669static inline int is_mergeable_vma(struct vm_area_struct *vma,
668 struct file *file, unsigned long vm_flags) 670 struct file *file, unsigned long vm_flags)
669{ 671{
670 if (vma->vm_flags != vm_flags) 672 if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS)
671 return 0; 673 return 0;
672 if (vma->vm_file != file) 674 if (vma->vm_file != file)
673 return 0; 675 return 0;
@@ -909,7 +911,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
909 * The caller must hold down_write(current->mm->mmap_sem). 911 * The caller must hold down_write(current->mm->mmap_sem).
910 */ 912 */
911 913
912unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, 914unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
913 unsigned long len, unsigned long prot, 915 unsigned long len, unsigned long prot,
914 unsigned long flags, unsigned long pgoff) 916 unsigned long flags, unsigned long pgoff)
915{ 917{
@@ -1092,6 +1094,15 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
1092 mapping_cap_account_dirty(vma->vm_file->f_mapping); 1094 mapping_cap_account_dirty(vma->vm_file->f_mapping);
1093} 1095}
1094 1096
1097/*
1098 * We account for memory if it's a private writeable mapping,
1099 * and VM_NORESERVE wasn't set.
1100 */
1101static inline int accountable_mapping(unsigned int vm_flags)
1102{
1103 return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
1104}
1105
1095unsigned long mmap_region(struct file *file, unsigned long addr, 1106unsigned long mmap_region(struct file *file, unsigned long addr,
1096 unsigned long len, unsigned long flags, 1107 unsigned long len, unsigned long flags,
1097 unsigned int vm_flags, unsigned long pgoff, 1108 unsigned int vm_flags, unsigned long pgoff,
@@ -1119,36 +1130,32 @@ munmap_back:
1119 if (!may_expand_vm(mm, len >> PAGE_SHIFT)) 1130 if (!may_expand_vm(mm, len >> PAGE_SHIFT))
1120 return -ENOMEM; 1131 return -ENOMEM;
1121 1132
1122 if (flags & MAP_NORESERVE) 1133 /*
1134 * Set 'VM_NORESERVE' if we should not account for the
1135 * memory use of this mapping. We only honor MAP_NORESERVE
1136 * if we're allowed to overcommit memory.
1137 */
1138 if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
1139 vm_flags |= VM_NORESERVE;
1140 if (!accountable)
1123 vm_flags |= VM_NORESERVE; 1141 vm_flags |= VM_NORESERVE;
1124 1142
1125 if (accountable && (!(flags & MAP_NORESERVE) || 1143 /*
1126 sysctl_overcommit_memory == OVERCOMMIT_NEVER)) { 1144 * Private writable mapping: check memory availability
1127 if (vm_flags & VM_SHARED) { 1145 */
1128 /* Check memory availability in shmem_file_setup? */ 1146 if (accountable_mapping(vm_flags)) {
1129 vm_flags |= VM_ACCOUNT; 1147 charged = len >> PAGE_SHIFT;
1130 } else if (vm_flags & VM_WRITE) { 1148 if (security_vm_enough_memory(charged))
1131 /* 1149 return -ENOMEM;
1132 * Private writable mapping: check memory availability 1150 vm_flags |= VM_ACCOUNT;
1133 */
1134 charged = len >> PAGE_SHIFT;
1135 if (security_vm_enough_memory(charged))
1136 return -ENOMEM;
1137 vm_flags |= VM_ACCOUNT;
1138 }
1139 } 1151 }
1140 1152
1141 /* 1153 /*
1142 * Can we just expand an old private anonymous mapping? 1154 * Can we just expand an old mapping?
1143 * The VM_SHARED test is necessary because shmem_zero_setup
1144 * will create the file object for a shared anonymous map below.
1145 */ 1155 */
1146 if (!file && !(vm_flags & VM_SHARED)) { 1156 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, NULL, file, pgoff, NULL);
1147 vma = vma_merge(mm, prev, addr, addr + len, vm_flags, 1157 if (vma)
1148 NULL, NULL, pgoff, NULL); 1158 goto out;
1149 if (vma)
1150 goto out;
1151 }
1152 1159
1153 /* 1160 /*
1154 * Determine the object being mapped and call the appropriate 1161 * Determine the object being mapped and call the appropriate
@@ -1191,14 +1198,6 @@ munmap_back:
1191 goto free_vma; 1198 goto free_vma;
1192 } 1199 }
1193 1200
1194 /* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
1195 * shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
1196 * that memory reservation must be checked; but that reservation
1197 * belongs to shared memory object, not to vma: so now clear it.
1198 */
1199 if ((vm_flags & (VM_SHARED|VM_ACCOUNT)) == (VM_SHARED|VM_ACCOUNT))
1200 vma->vm_flags &= ~VM_ACCOUNT;
1201
1202 /* Can addr have changed?? 1201 /* Can addr have changed??
1203 * 1202 *
1204 * Answer: Yes, several device drivers can do it in their 1203 * Answer: Yes, several device drivers can do it in their
@@ -1211,17 +1210,8 @@ munmap_back:
1211 if (vma_wants_writenotify(vma)) 1210 if (vma_wants_writenotify(vma))
1212 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); 1211 vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED);
1213 1212
1214 if (file && vma_merge(mm, prev, addr, vma->vm_end, 1213 vma_link(mm, vma, prev, rb_link, rb_parent);
1215 vma->vm_flags, NULL, file, pgoff, vma_policy(vma))) { 1214 file = vma->vm_file;
1216 mpol_put(vma_policy(vma));
1217 kmem_cache_free(vm_area_cachep, vma);
1218 fput(file);
1219 if (vm_flags & VM_EXECUTABLE)
1220 removed_exe_file_vma(mm);
1221 } else {
1222 vma_link(mm, vma, prev, rb_link, rb_parent);
1223 file = vma->vm_file;
1224 }
1225 1215
1226 /* Once vma denies write, undo our temporary denial count */ 1216 /* Once vma denies write, undo our temporary denial count */
1227 if (correct_wcount) 1217 if (correct_wcount)
@@ -1468,7 +1458,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1468EXPORT_SYMBOL(get_unmapped_area); 1458EXPORT_SYMBOL(get_unmapped_area);
1469 1459
1470/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1460/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1471struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) 1461struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1472{ 1462{
1473 struct vm_area_struct *vma = NULL; 1463 struct vm_area_struct *vma = NULL;
1474 1464
@@ -1511,7 +1501,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1511 struct vm_area_struct **pprev) 1501 struct vm_area_struct **pprev)
1512{ 1502{
1513 struct vm_area_struct *vma = NULL, *prev = NULL; 1503 struct vm_area_struct *vma = NULL, *prev = NULL;
1514 struct rb_node * rb_node; 1504 struct rb_node *rb_node;
1515 if (!mm) 1505 if (!mm)
1516 goto out; 1506 goto out;
1517 1507
@@ -1545,7 +1535,7 @@ out:
1545 * update accounting. This is shared with both the 1535 * update accounting. This is shared with both the
1546 * grow-up and grow-down cases. 1536 * grow-up and grow-down cases.
1547 */ 1537 */
1548static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) 1538static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
1549{ 1539{
1550 struct mm_struct *mm = vma->vm_mm; 1540 struct mm_struct *mm = vma->vm_mm;
1551 struct rlimit *rlim = current->signal->rlim; 1541 struct rlimit *rlim = current->signal->rlim;
@@ -1953,7 +1943,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1953 1943
1954EXPORT_SYMBOL(do_munmap); 1944EXPORT_SYMBOL(do_munmap);
1955 1945
1956asmlinkage long sys_munmap(unsigned long addr, size_t len) 1946SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1957{ 1947{
1958 int ret; 1948 int ret;
1959 struct mm_struct *mm = current->mm; 1949 struct mm_struct *mm = current->mm;
@@ -2095,6 +2085,9 @@ void exit_mmap(struct mm_struct *mm)
2095 arch_exit_mmap(mm); 2085 arch_exit_mmap(mm);
2096 mmu_notifier_release(mm); 2086 mmu_notifier_release(mm);
2097 2087
2088 if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */
2089 return;
2090
2098 if (mm->locked_vm) { 2091 if (mm->locked_vm) {
2099 vma = mm->mmap; 2092 vma = mm->mmap;
2100 while (vma) { 2093 while (vma) {
@@ -2107,7 +2100,7 @@ void exit_mmap(struct mm_struct *mm)
2107 lru_add_drain(); 2100 lru_add_drain();
2108 flush_cache_mm(mm); 2101 flush_cache_mm(mm);
2109 tlb = tlb_gather_mmu(mm, 1); 2102 tlb = tlb_gather_mmu(mm, 1);
2110 /* Don't update_hiwater_rss(mm) here, do_exit already did */ 2103 /* update_hiwater_rss(mm) here? but nobody should be looking */
2111 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2104 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2112 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2105 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2113 vm_unacct_memory(nr_accounted); 2106 vm_unacct_memory(nr_accounted);
@@ -2474,3 +2467,13 @@ void mm_drop_all_locks(struct mm_struct *mm)
2474 2467
2475 mutex_unlock(&mm_all_locks_mutex); 2468 mutex_unlock(&mm_all_locks_mutex);
2476} 2469}
2470
2471/*
2472 * initialise the VMA slab
2473 */
2474void __init mmap_init(void)
2475{
2476 vm_area_cachep = kmem_cache_create("vm_area_struct",
2477 sizeof(struct vm_area_struct), 0,
2478 SLAB_PANIC, NULL);
2479}
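[editorial sketch] The accountable_mapping()/VM_NORESERVE rework earlier in this file's diff decides the no-reserve behaviour once, up front: MAP_NORESERVE is honored unless overcommit is set to "never" (vm.overcommit_memory == 2). From userspace that looks like:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 1UL << 30;		/* 1 GiB of address space, no commit charge */
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	/* pages are only charged and allocated as they are first touched */
	munmap(p, len);
	return 0;
}
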
diff --git a/mm/mprotect.c b/mm/mprotect.c
index fded06f923f4..abe2694e13f4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -4,7 +4,7 @@
4 * (C) Copyright 1994 Linus Torvalds 4 * (C) Copyright 1994 Linus Torvalds
5 * (C) Copyright 2002 Christoph Hellwig 5 * (C) Copyright 2002 Christoph Hellwig
6 * 6 *
7 * Address space accounting code <alan@redhat.com> 7 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 8 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9 */ 9 */
10 10
@@ -22,6 +22,7 @@
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
27#include <asm/cacheflush.h> 28#include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
59 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
60 61
61 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
62#ifdef CONFIG_MIGRATION 63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
63 } else if (!pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 72 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 73 swp_entry_to_pte(entry));
74 } 74 }
75#endif
76 } 75 }
77
78 } while (pte++, addr += PAGE_SIZE, addr != end); 76 } while (pte++, addr += PAGE_SIZE, addr != end);
79 arch_leave_lazy_mmu_mode(); 77 arch_leave_lazy_mmu_mode();
80 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
@@ -219,8 +217,8 @@ fail:
219 return error; 217 return error;
220} 218}
221 219
222asmlinkage long 220SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,
223sys_mprotect(unsigned long start, size_t len, unsigned long prot) 221 unsigned long, prot)
224{ 222{
225 unsigned long vm_flags, nstart, end, tmp, reqprot; 223 unsigned long vm_flags, nstart, end, tmp, reqprot;
226 struct vm_area_struct *vma, *prev; 224 struct vm_area_struct *vma, *prev;
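[editorial sketch] The mprotect() conversion above is likewise ABI-neutral. A minimal example of the call it serves, turning the last page of a mapping into a guard page:

#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long psz = sysconf(_SC_PAGESIZE);
	char *p = mmap(NULL, 4 * psz, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* last page becomes a guard page; touching it now raises SIGSEGV */
	return mprotect(p + 3 * psz, psz, PROT_NONE) != 0;
}
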
diff --git a/mm/mremap.c b/mm/mremap.c
index 58a2908f42f5..a39b7b91be46 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * (C) Copyright 1996 Linus Torvalds 4 * (C) Copyright 1996 Linus Torvalds
5 * 5 *
6 * Address space accounting code <alan@redhat.com> 6 * Address space accounting code <alan@lxorguk.ukuu.org.uk>
7 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved 7 * (C) Copyright 2002 Red Hat Inc, All Rights Reserved
8 */ 8 */
9 9
@@ -420,9 +420,9 @@ out_nc:
420 return ret; 420 return ret;
421} 421}
422 422
423asmlinkage unsigned long sys_mremap(unsigned long addr, 423SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
424 unsigned long old_len, unsigned long new_len, 424 unsigned long, new_len, unsigned long, flags,
425 unsigned long flags, unsigned long new_addr) 425 unsigned long, new_addr)
426{ 426{
427 unsigned long ret; 427 unsigned long ret;
428 428
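[editorial sketch] For the converted mremap() above, a minimal userspace example growing an anonymous mapping in place or by relocation:

#define _GNU_SOURCE		/* for mremap() */
#include <sys/mman.h>

int main(void)
{
	size_t old_len = 1UL << 20, new_len = 4UL << 20;
	void *p = mmap(NULL, old_len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* grow the mapping, letting the kernel relocate it if necessary */
	p = mremap(p, old_len, new_len, MREMAP_MAYMOVE);
	return p == MAP_FAILED;
}
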
diff --git a/mm/msync.c b/mm/msync.c
index 144a7570535d..4083209b7f02 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -28,7 +28,7 @@
28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to 28 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
29 * applications. 29 * applications.
30 */ 30 */
31asmlinkage long sys_msync(unsigned long start, size_t len, int flags) 31SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
32{ 32{
33 unsigned long end; 33 unsigned long end;
34 struct mm_struct *mm = current->mm; 34 struct mm_struct *mm = current->mm;
@@ -82,7 +82,7 @@ asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
82 (vma->vm_flags & VM_SHARED)) { 82 (vma->vm_flags & VM_SHARED)) {
83 get_file(file); 83 get_file(file);
84 up_read(&mm->mmap_sem); 84 up_read(&mm->mmap_sem);
85 error = do_fsync(file, 0); 85 error = vfs_fsync(file, file->f_path.dentry, 0);
86 fput(file); 86 fput(file);
87 if (error || start >= end) 87 if (error || start >= end)
88 goto out; 88 goto out;
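[editorial sketch] The msync() hunk above switches the write-back path to vfs_fsync(); from userspace the semantics are the familiar ones. A small example flushing a shared file mapping (file name is illustrative):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	size_t len = 4096;
	int fd = open("/tmp/msync-demo", O_RDWR | O_CREAT, 0600);
	char *p;

	if (fd < 0 || ftruncate(fd, len))
		return 1;
	p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		return 1;
	memcpy(p, "hello", 6);
	/* write back the dirty range and wait, much like fsync() on that range */
	return msync(p, len, MS_SYNC) != 0;
}
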
diff --git a/mm/nommu.c b/mm/nommu.c
index 7695dc850785..2fcf47d449b4 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -6,11 +6,11 @@
6 * 6 *
7 * See Documentation/nommu-mmap.txt 7 * See Documentation/nommu-mmap.txt
8 * 8 *
9 * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> 9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -33,6 +33,28 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include "internal.h"
37
38static inline __attribute__((format(printf, 1, 2)))
39void no_printk(const char *fmt, ...)
40{
41}
42
43#if 0
44#define kenter(FMT, ...) \
45 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
46#define kleave(FMT, ...) \
47 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
48#define kdebug(FMT, ...) \
49 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
50#else
51#define kenter(FMT, ...) \
52 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
53#define kleave(FMT, ...) \
54 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
55#define kdebug(FMT, ...) \
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif
36 58
37#include "internal.h" 59#include "internal.h"
38 60
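[editorial sketch] The kenter/kleave/kdebug macros added above follow the usual nommu tracing idiom: with the #if 0 branch selected they expand to no_printk(), which keeps printf-format checking but emits nothing. Typical use inside this file looks like the following (function body illustrative):

static int do_something_nommu(unsigned long addr, unsigned long len)
{
	kenter("%lx,%lx", addr, len);
	/* ... */
	kleave(" = 0");
	return 0;
}
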
@@ -40,19 +62,22 @@ void *high_memory;
40struct page *mem_map; 62struct page *mem_map;
41unsigned long max_mapnr; 63unsigned long max_mapnr;
42unsigned long num_physpages; 64unsigned long num_physpages;
43unsigned long askedalloc, realalloc;
44atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 65atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
45int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
46int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
47int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
48int heap_stack_gap = 0; 70int heap_stack_gap = 0;
49 71
72atomic_t mmap_pages_allocated;
73
50EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
51EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
52 76
53/* list of shareable VMAs */ 77/* list of mapped, potentially shareable regions */
54struct rb_root nommu_vma_tree = RB_ROOT; 78static struct kmem_cache *vm_region_jar;
55DECLARE_RWSEM(nommu_vma_sem); 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem);
56 81
57struct vm_operations_struct generic_file_vm_ops = { 82struct vm_operations_struct generic_file_vm_ops = {
58}; 83};
@@ -86,7 +111,7 @@ do_expand:
86 i_size_write(inode, offset); 111 i_size_write(inode, offset);
87 112
88out_truncate: 113out_truncate:
89 if (inode->i_op && inode->i_op->truncate) 114 if (inode->i_op->truncate)
90 inode->i_op->truncate(inode); 115 inode->i_op->truncate(inode);
91 return 0; 116 return 0;
92out_sig: 117out_sig:
@@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp)
124 return ksize(objp); 149 return ksize(objp);
125 150
126 /* 151 /*
152 * If it's not a compound page, see if we have a matching VMA
153 * region. This test is intentionally done in reverse order,
154 * so if there's no VMA, we still fall through and hand back
155 * PAGE_SIZE for 0-order pages.
156 */
157 if (!PageCompound(page)) {
158 struct vm_area_struct *vma;
159
160 vma = find_vma(current->mm, (unsigned long)objp);
161 if (vma)
162 return vma->vm_end - vma->vm_start;
163 }
164
165 /*
127 * The ksize() function is only guaranteed to work for pointers 166 * The ksize() function is only guaranteed to work for pointers
128 * returned by kmalloc(). So handle arbitrary pointers here. 167 * returned by kmalloc(). So handle arbitrary pointers here.
129 */ 168 */
@@ -355,6 +394,24 @@ void vunmap(const void *addr)
355} 394}
356EXPORT_SYMBOL(vunmap); 395EXPORT_SYMBOL(vunmap);
357 396
397void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
398{
399 BUG();
400 return NULL;
401}
402EXPORT_SYMBOL(vm_map_ram);
403
404void vm_unmap_ram(const void *mem, unsigned int count)
405{
406 BUG();
407}
408EXPORT_SYMBOL(vm_unmap_ram);
409
410void vm_unmap_aliases(void)
411{
412}
413EXPORT_SYMBOL_GPL(vm_unmap_aliases);
414
358/* 415/*
359 * Implement a stub for vmalloc_sync_all() if the architecture chose not to 416 * Implement a stub for vmalloc_sync_all() if the architecture chose not to
360 * have one. 417 * have one.
@@ -377,7 +434,7 @@ EXPORT_SYMBOL(vm_insert_page);
377 * to a regular file. in this case, the unmapping will need 434 * to a regular file. in this case, the unmapping will need
378 * to invoke file system routines that need the global lock. 435 * to invoke file system routines that need the global lock.
379 */ 436 */
380asmlinkage unsigned long sys_brk(unsigned long brk) 437SYSCALL_DEFINE1(brk, unsigned long, brk)
381{ 438{
382 struct mm_struct *mm = current->mm; 439 struct mm_struct *mm = current->mm;
383 440
@@ -401,129 +458,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
401 return mm->brk = brk; 458 return mm->brk = brk;
402} 459}
403 460
404#ifdef DEBUG 461/*
405static void show_process_blocks(void) 462 * initialise the VMA and region record slabs
463 */
464void __init mmap_init(void)
406{ 465{
407 struct vm_list_struct *vml; 466 vm_region_jar = kmem_cache_create("vm_region_jar",
408 467 sizeof(struct vm_region), 0,
409 printk("Process blocks %d:", current->pid); 468 SLAB_PANIC, NULL);
410 469 vm_area_cachep = kmem_cache_create("vm_area_struct",
411 for (vml = &current->mm->context.vmlist; vml; vml = vml->next) { 470 sizeof(struct vm_area_struct), 0,
412 printk(" %p: %p", vml, vml->vma); 471 SLAB_PANIC, NULL);
413 if (vml->vma)
414 printk(" (%d @%lx #%d)",
415 kobjsize((void *) vml->vma->vm_start),
416 vml->vma->vm_start,
417 atomic_read(&vml->vma->vm_usage));
418 printk(vml->next ? " ->" : ".\n");
419 }
420} 472}
421#endif /* DEBUG */
422 473
423/* 474/*
424 * add a VMA into a process's mm_struct in the appropriate place in the list 475 * validate the region tree
425 * - should be called with mm->mmap_sem held writelocked 476 * - the caller must hold the region lock
426 */ 477 */
427static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) 478#ifdef CONFIG_DEBUG_NOMMU_REGIONS
479static noinline void validate_nommu_regions(void)
428{ 480{
429 struct vm_list_struct **ppv; 481 struct vm_region *region, *last;
430 482 struct rb_node *p, *lastp;
431 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) 483
432 if ((*ppv)->vma->vm_start > vml->vma->vm_start) 484 lastp = rb_first(&nommu_region_tree);
433 break; 485 if (!lastp)
434 486 return;
435 vml->next = *ppv; 487
436 *ppv = vml; 488 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start))
490 BUG();
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493
494 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb);
497
498 if (unlikely(region->vm_end <= region->vm_start))
499 BUG();
500 if (unlikely(region->vm_top < region->vm_end))
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504
505 lastp = p;
506 }
437} 507}
508#else
509#define validate_nommu_regions() do {} while(0)
510#endif
438 511
439/* 512/*
440 * look up the first VMA in which addr resides, NULL if none 513 * add a region into the global tree
441 * - should be called with mm->mmap_sem at least held readlocked
442 */ 514 */
443struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 515static void add_nommu_region(struct vm_region *region)
444{ 516{
445 struct vm_list_struct *loop, *vml; 517 struct vm_region *pregion;
518 struct rb_node **p, *parent;
446 519
447 /* search the vm_start ordered list */ 520 validate_nommu_regions();
448 vml = NULL; 521
449 for (loop = mm->context.vmlist; loop; loop = loop->next) { 522 BUG_ON(region->vm_start & ~PAGE_MASK);
450 if (loop->vma->vm_start > addr) 523
451 break; 524 parent = NULL;
452 vml = loop; 525 p = &nommu_region_tree.rb_node;
526 while (*p) {
527 parent = *p;
528 pregion = rb_entry(parent, struct vm_region, vm_rb);
529 if (region->vm_start < pregion->vm_start)
530 p = &(*p)->rb_left;
531 else if (region->vm_start > pregion->vm_start)
532 p = &(*p)->rb_right;
533 else if (pregion == region)
534 return;
535 else
536 BUG();
453 } 537 }
454 538
455 if (vml && vml->vma->vm_end > addr) 539 rb_link_node(&region->vm_rb, parent, p);
456 return vml->vma; 540 rb_insert_color(&region->vm_rb, &nommu_region_tree);
457 541
458 return NULL; 542 validate_nommu_regions();
459} 543}
460EXPORT_SYMBOL(find_vma);
461 544
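The new add_nommu_region() above keeps every backing region in a single global tree ordered by vm_start, descending left or right at each node and treating a second, distinct region with the same start address as a bug. The following is a minimal userspace sketch of that ordered insert, using a plain unbalanced binary search tree in place of the kernel rbtree; the struct and function names here are illustrative, not the kernel's.

#include <assert.h>
#include <stdio.h>

/* toy stand-in for struct vm_region: only the fields the insert needs */
struct region {
	unsigned long vm_start, vm_end;
	struct region *left, *right;
};

/* insert keyed on vm_start; two distinct regions sharing a start address
 * is treated as a bug, mirroring the BUG() in add_nommu_region() */
static void add_region(struct region **root, struct region *new)
{
	struct region **p = root;

	while (*p) {
		if (new->vm_start < (*p)->vm_start)
			p = &(*p)->left;
		else if (new->vm_start > (*p)->vm_start)
			p = &(*p)->right;
		else if (*p == new)
			return;		/* already in the tree */
		else
			assert(0);	/* duplicate start address */
	}
	*p = new;
}

static void dump(struct region *r)
{
	if (!r)
		return;
	dump(r->left);
	printf("%#lx-%#lx\n", r->vm_start, r->vm_end);
	dump(r->right);
}

int main(void)
{
	struct region a = { 0x30000, 0x34000 }, b = { 0x10000, 0x18000 },
		      c = { 0x20000, 0x21000 };
	struct region *root = NULL;

	add_region(&root, &a);
	add_region(&root, &b);
	add_region(&root, &c);
	dump(root);	/* prints the regions in ascending vm_start order */
	return 0;
}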
462/* 545/*
463 * find a VMA 546 * delete a region from the global tree
464 * - we don't extend stack VMAs under NOMMU conditions
465 */ 547 */
466struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 548static void delete_nommu_region(struct vm_region *region)
467{ 549{
468 return find_vma(mm, addr); 550 BUG_ON(!nommu_region_tree.rb_node);
469}
470 551
471int expand_stack(struct vm_area_struct *vma, unsigned long address) 552 validate_nommu_regions();
472{ 553 rb_erase(&region->vm_rb, &nommu_region_tree);
473 return -ENOMEM; 554 validate_nommu_regions();
474} 555}
475 556
476/* 557/*
477 * look up the first VMA that exactly matches addr 558 * free a contiguous series of pages
478 * - should be called with mm->mmap_sem at least held readlocked
479 */ 559 */
480static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, 560static void free_page_series(unsigned long from, unsigned long to)
481 unsigned long addr)
482{ 561{
483 struct vm_list_struct *vml; 562 for (; from < to; from += PAGE_SIZE) {
484 563 struct page *page = virt_to_page(from);
485 /* search the vm_start ordered list */ 564
486 for (vml = mm->context.vmlist; vml; vml = vml->next) { 565 kdebug("- free %lx", from);
487 if (vml->vma->vm_start == addr) 566 atomic_dec(&mmap_pages_allocated);
488 return vml->vma; 567 if (page_count(page) != 1)
489 if (vml->vma->vm_start > addr) 568 kdebug("free page %p [%d]", page, page_count(page));
490 break; 569 put_page(page);
491 } 570 }
492
493 return NULL;
494} 571}
495 572
496/* 573/*
497 * find a VMA in the global tree 574 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start
498 */ 578 */
499static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 579static void __put_nommu_region(struct vm_region *region)
580 __releases(nommu_region_sem)
500{ 581{
501 struct vm_area_struct *vma; 582 kenter("%p{%d}", region, atomic_read(&region->vm_usage));
502 struct rb_node *n = nommu_vma_tree.rb_node;
503 583
504 while (n) { 584 BUG_ON(!nommu_region_tree.rb_node);
505 vma = rb_entry(n, struct vm_area_struct, vm_rb);
506 585
507 if (start < vma->vm_start) 586 if (atomic_dec_and_test(&region->vm_usage)) {
508 n = n->rb_left; 587 if (region->vm_top > region->vm_start)
509 else if (start > vma->vm_start) 588 delete_nommu_region(region);
510 n = n->rb_right; 589 up_write(&nommu_region_sem);
511 else 590
512 return vma; 591 if (region->vm_file)
592 fput(region->vm_file);
593
594 /* IO memory and memory shared directly out of the pagecache
595 * from ramfs/tmpfs mustn't be released here */
596 if (region->vm_flags & VM_MAPPED_COPY) {
597 kdebug("free series");
598 free_page_series(region->vm_start, region->vm_top);
599 }
600 kmem_cache_free(vm_region_jar, region);
601 } else {
602 up_write(&nommu_region_sem);
513 } 603 }
604}
514 605
515 return NULL; 606/*
607 * release a reference to a region
608 */
609static void put_nommu_region(struct vm_region *region)
610{
611 down_write(&nommu_region_sem);
612 __put_nommu_region(region);
516} 613}
517 614
518/* 615/*
519 * add a VMA in the global tree 616 * add a VMA into a process's mm_struct in the appropriate place in the list
617 * and tree and add to the address space's page tree also if not an anonymous
618 * page
619 * - should be called with mm->mmap_sem held writelocked
520 */ 620 */
521static void add_nommu_vma(struct vm_area_struct *vma) 621static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
522{ 622{
523 struct vm_area_struct *pvma; 623 struct vm_area_struct *pvma, **pp;
524 struct address_space *mapping; 624 struct address_space *mapping;
525 struct rb_node **p = &nommu_vma_tree.rb_node; 625 struct rb_node **p, *parent;
526 struct rb_node *parent = NULL; 626
627 kenter(",%p", vma);
628
629 BUG_ON(!vma->vm_region);
630
631 mm->map_count++;
632 vma->vm_mm = mm;
527 633
528 /* add the VMA to the mapping */ 634 /* add the VMA to the mapping */
529 if (vma->vm_file) { 635 if (vma->vm_file) {
@@ -534,42 +640,62 @@ static void add_nommu_vma(struct vm_area_struct *vma)
534 flush_dcache_mmap_unlock(mapping); 640 flush_dcache_mmap_unlock(mapping);
535 } 641 }
536 642
537 /* add the VMA to the master list */ 643 /* add the VMA to the tree */
644 parent = NULL;
645 p = &mm->mm_rb.rb_node;
538 while (*p) { 646 while (*p) {
539 parent = *p; 647 parent = *p;
540 pvma = rb_entry(parent, struct vm_area_struct, vm_rb); 648 pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
541 649
542 if (vma->vm_start < pvma->vm_start) { 650 /* sort by: start addr, end addr, VMA struct addr in that order
651 * (the latter is necessary as we may get identical VMAs) */
652 if (vma->vm_start < pvma->vm_start)
543 p = &(*p)->rb_left; 653 p = &(*p)->rb_left;
544 } 654 else if (vma->vm_start > pvma->vm_start)
545 else if (vma->vm_start > pvma->vm_start) {
546 p = &(*p)->rb_right; 655 p = &(*p)->rb_right;
547 } 656 else if (vma->vm_end < pvma->vm_end)
548 else { 657 p = &(*p)->rb_left;
549 /* mappings are at the same address - this can only 658 else if (vma->vm_end > pvma->vm_end)
550 * happen for shared-mem chardevs and shared file 659 p = &(*p)->rb_right;
551 * mappings backed by ramfs/tmpfs */ 660 else if (vma < pvma)
552 BUG_ON(!(pvma->vm_flags & VM_SHARED)); 661 p = &(*p)->rb_left;
553 662 else if (vma > pvma)
554 if (vma < pvma) 663 p = &(*p)->rb_right;
555 p = &(*p)->rb_left; 664 else
556 else if (vma > pvma) 665 BUG();
557 p = &(*p)->rb_right;
558 else
559 BUG();
560 }
561 } 666 }
562 667
563 rb_link_node(&vma->vm_rb, parent, p); 668 rb_link_node(&vma->vm_rb, parent, p);
564 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 669 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
670
671 /* add VMA to the VMA list also */
672 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
673 if (pvma->vm_start > vma->vm_start)
674 break;
675 if (pvma->vm_start < vma->vm_start)
676 continue;
677 if (pvma->vm_end < vma->vm_end)
678 break;
679 }
680
681 vma->vm_next = *pp;
682 *pp = vma;
565} 683}
566 684
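add_vma_to_mm() above keys the per-mm tree on a three-level comparison: start address, then end address, then the address of the VMA structure itself, so that two otherwise identical VMAs still get distinct slots. A small userspace sketch of that comparator, driven through qsort() rather than an rbtree; the struct vma here is a toy stand-in, not the kernel's vm_area_struct.

#include <stdio.h>
#include <stdlib.h>

struct vma {				/* toy stand-in for vm_area_struct */
	unsigned long vm_start, vm_end;
};

/* order by start, then end, then by the object's own address, mirroring
 * the descent rules in the new add_vma_to_mm() */
static int vma_cmp(const void *a, const void *b)
{
	const struct vma *x = *(const struct vma * const *)a;
	const struct vma *y = *(const struct vma * const *)b;

	if (x->vm_start != y->vm_start)
		return x->vm_start < y->vm_start ? -1 : 1;
	if (x->vm_end != y->vm_end)
		return x->vm_end < y->vm_end ? -1 : 1;
	if (x != y)
		return x < y ? -1 : 1;
	return 0;
}

int main(void)
{
	struct vma a = { 0x1000, 0x3000 }, b = { 0x1000, 0x2000 };
	struct vma c = { 0x1000, 0x2000 };	/* byte-identical to b */
	struct vma *v[] = { &a, &b, &c };

	qsort(v, 3, sizeof(v[0]), vma_cmp);
	for (int i = 0; i < 3; i++)
		printf("%#lx-%#lx (%p)\n", v[i]->vm_start, v[i]->vm_end,
		       (void *)v[i]);
	return 0;
}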
567/* 685/*
568 * delete a VMA from the global list 686 * delete a VMA from its owning mm_struct and address space
569 */ 687 */
570static void delete_nommu_vma(struct vm_area_struct *vma) 688static void delete_vma_from_mm(struct vm_area_struct *vma)
571{ 689{
690 struct vm_area_struct **pp;
572 struct address_space *mapping; 691 struct address_space *mapping;
692 struct mm_struct *mm = vma->vm_mm;
693
694 kenter("%p", vma);
695
696 mm->map_count--;
697 if (mm->mmap_cache == vma)
698 mm->mmap_cache = NULL;
573 699
574 /* remove the VMA from the mapping */ 700 /* remove the VMA from the mapping */
575 if (vma->vm_file) { 701 if (vma->vm_file) {
@@ -580,8 +706,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma)
580 flush_dcache_mmap_unlock(mapping); 706 flush_dcache_mmap_unlock(mapping);
581 } 707 }
582 708
583 /* remove from the master list */ 709 /* remove from the MM's tree and list */
584 rb_erase(&vma->vm_rb, &nommu_vma_tree); 710 rb_erase(&vma->vm_rb, &mm->mm_rb);
711 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
712 if (*pp == vma) {
713 *pp = vma->vm_next;
714 break;
715 }
716 }
717
718 vma->vm_mm = NULL;
719}
720
721/*
722 * destroy a VMA record
723 */
724static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
725{
726 kenter("%p", vma);
727 if (vma->vm_ops && vma->vm_ops->close)
728 vma->vm_ops->close(vma);
729 if (vma->vm_file) {
730 fput(vma->vm_file);
731 if (vma->vm_flags & VM_EXECUTABLE)
732 removed_exe_file_vma(mm);
733 }
734 put_nommu_region(vma->vm_region);
735 kmem_cache_free(vm_area_cachep, vma);
736}
737
738/*
739 * look up the first VMA in which addr resides, NULL if none
740 * - should be called with mm->mmap_sem at least held readlocked
741 */
742struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
743{
744 struct vm_area_struct *vma;
745 struct rb_node *n = mm->mm_rb.rb_node;
746
747 /* check the cache first */
748 vma = mm->mmap_cache;
749 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
750 return vma;
751
752 /* trawl the tree (there may be multiple mappings in which addr
753 * resides) */
754 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
755 vma = rb_entry(n, struct vm_area_struct, vm_rb);
756 if (vma->vm_start > addr)
757 return NULL;
758 if (vma->vm_end > addr) {
759 mm->mmap_cache = vma;
760 return vma;
761 }
762 }
763
764 return NULL;
765}
766EXPORT_SYMBOL(find_vma);
767
768/*
769 * find a VMA
770 * - we don't extend stack VMAs under NOMMU conditions
771 */
772struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
773{
774 return find_vma(mm, addr);
775}
776
777/*
778 * expand a stack to a given address
779 * - not supported under NOMMU conditions
780 */
781int expand_stack(struct vm_area_struct *vma, unsigned long address)
782{
783 return -ENOMEM;
784}
785
786/*
787 * look up the first VMA that exactly matches addr
788 * - should be called with mm->mmap_sem at least held readlocked
789 */
790static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
791 unsigned long addr,
792 unsigned long len)
793{
794 struct vm_area_struct *vma;
795 struct rb_node *n = mm->mm_rb.rb_node;
796 unsigned long end = addr + len;
797
798 /* check the cache first */
799 vma = mm->mmap_cache;
800 if (vma && vma->vm_start == addr && vma->vm_end == end)
801 return vma;
802
803 /* trawl the tree (there may be multiple mappings in which addr
804 * resides) */
805 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
806 vma = rb_entry(n, struct vm_area_struct, vm_rb);
807 if (vma->vm_start < addr)
808 continue;
809 if (vma->vm_start > addr)
810 return NULL;
811 if (vma->vm_end == end) {
812 mm->mmap_cache = vma;
813 return vma;
814 }
815 }
816
817 return NULL;
585} 818}
586 819
587/* 820/*
@@ -596,7 +829,7 @@ static int validate_mmap_request(struct file *file,
596 unsigned long pgoff, 829 unsigned long pgoff,
597 unsigned long *_capabilities) 830 unsigned long *_capabilities)
598{ 831{
599 unsigned long capabilities; 832 unsigned long capabilities, rlen;
600 unsigned long reqprot = prot; 833 unsigned long reqprot = prot;
601 int ret; 834 int ret;
602 835
@@ -616,12 +849,12 @@ static int validate_mmap_request(struct file *file,
616 return -EINVAL; 849 return -EINVAL;
617 850
618 /* Careful about overflows.. */ 851 /* Careful about overflows.. */
619 len = PAGE_ALIGN(len); 852 rlen = PAGE_ALIGN(len);
620 if (!len || len > TASK_SIZE) 853 if (!rlen || rlen > TASK_SIZE)
621 return -ENOMEM; 854 return -ENOMEM;
622 855
623 /* offset overflow? */ 856 /* offset overflow? */
624 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 857 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
625 return -EOVERFLOW; 858 return -EOVERFLOW;
626 859
627 if (file) { 860 if (file) {
@@ -795,13 +1028,18 @@ static unsigned long determine_vm_flags(struct file *file,
795} 1028}
796 1029
797/* 1030/*
798 * set up a shared mapping on a file 1031 * set up a shared mapping on a file (the driver or filesystem provides and
1032 * pins the storage)
799 */ 1033 */
800static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) 1034static int do_mmap_shared_file(struct vm_area_struct *vma)
801{ 1035{
802 int ret; 1036 int ret;
803 1037
804 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1038 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1039 if (ret == 0) {
1040 vma->vm_region->vm_top = vma->vm_region->vm_end;
1041 return ret;
1042 }
805 if (ret != -ENOSYS) 1043 if (ret != -ENOSYS)
806 return ret; 1044 return ret;
807 1045
@@ -815,10 +1053,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
815/* 1053/*
816 * set up a private mapping or an anonymous shared mapping 1054 * set up a private mapping or an anonymous shared mapping
817 */ 1055 */
818static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) 1056static int do_mmap_private(struct vm_area_struct *vma,
1057 struct vm_region *region,
1058 unsigned long len)
819{ 1059{
1060 struct page *pages;
1061 unsigned long total, point, n, rlen;
820 void *base; 1062 void *base;
821 int ret; 1063 int ret, order;
822 1064
823 /* invoke the file's mapping function so that it can keep track of 1065 /* invoke the file's mapping function so that it can keep track of
824 * shared mappings on devices or memory 1066 * shared mappings on devices or memory
@@ -826,34 +1068,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
826 */ 1068 */
827 if (vma->vm_file) { 1069 if (vma->vm_file) {
828 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1070 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
829 if (ret != -ENOSYS) { 1071 if (ret == 0) {
830 /* shouldn't return success if we're not sharing */ 1072 /* shouldn't return success if we're not sharing */
831 BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); 1073 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
832 return ret; /* success or a real error */ 1074 vma->vm_region->vm_top = vma->vm_region->vm_end;
1075 return ret;
833 } 1076 }
1077 if (ret != -ENOSYS)
1078 return ret;
834 1079
835 /* getting an ENOSYS error indicates that direct mmap isn't 1080 /* getting an ENOSYS error indicates that direct mmap isn't
836 * possible (as opposed to tried but failed) so we'll try to 1081 * possible (as opposed to tried but failed) so we'll try to
837 * make a private copy of the data and map that instead */ 1082 * make a private copy of the data and map that instead */
838 } 1083 }
839 1084
1085 rlen = PAGE_ALIGN(len);
1086
840 /* allocate some memory to hold the mapping 1087 /* allocate some memory to hold the mapping
841 * - note that this may not return a page-aligned address if the object 1088 * - note that this may not return a page-aligned address if the object
842 * we're allocating is smaller than a page 1089 * we're allocating is smaller than a page
843 */ 1090 */
844 base = kmalloc(len, GFP_KERNEL|__GFP_COMP); 1091 order = get_order(rlen);
845 if (!base) 1092 kdebug("alloc order %d for %lx", order, len);
1093
1094 pages = alloc_pages(GFP_KERNEL, order);
1095 if (!pages)
846 goto enomem; 1096 goto enomem;
847 1097
848 vma->vm_start = (unsigned long) base; 1098 total = 1 << order;
849 vma->vm_end = vma->vm_start + len; 1099 atomic_add(total, &mmap_pages_allocated);
850 vma->vm_flags |= VM_MAPPED_COPY; 1100
1101 point = rlen >> PAGE_SHIFT;
1102
1103 /* we allocated a power-of-2 sized page set, so we may want to trim off
1104 * the excess */
1105 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1106 while (total > point) {
1107 order = ilog2(total - point);
1108 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated);
1111 total -= n;
1112 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order);
1114 }
1115 }
851 1116
852#ifdef WARN_ON_SLACK 1117 for (point = 1; point < total; point++)
853 if (len + WARN_ON_SLACK <= kobjsize(result)) 1118 set_page_refcounted(&pages[point]);
854 printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", 1119
855 len, current->pid, kobjsize(result) - len); 1120 base = page_address(pages);
856#endif 1121 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
1122 region->vm_start = (unsigned long) base;
1123 region->vm_end = region->vm_start + rlen;
1124 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1125
1126 vma->vm_start = region->vm_start;
1127 vma->vm_end = region->vm_start + len;
857 1128
858 if (vma->vm_file) { 1129 if (vma->vm_file) {
859 /* read the contents of a file into the copy */ 1130 /* read the contents of a file into the copy */
@@ -865,31 +1136,33 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
865 1136
866 old_fs = get_fs(); 1137 old_fs = get_fs();
867 set_fs(KERNEL_DS); 1138 set_fs(KERNEL_DS);
868 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); 1139 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
869 set_fs(old_fs); 1140 set_fs(old_fs);
870 1141
871 if (ret < 0) 1142 if (ret < 0)
872 goto error_free; 1143 goto error_free;
873 1144
874 /* clear the last little bit */ 1145 /* clear the last little bit */
875 if (ret < len) 1146 if (ret < rlen)
876 memset(base + ret, 0, len - ret); 1147 memset(base + ret, 0, rlen - ret);
877 1148
878 } else { 1149 } else {
879 /* if it's an anonymous mapping, then just clear it */ 1150 /* if it's an anonymous mapping, then just clear it */
880 memset(base, 0, len); 1151 memset(base, 0, rlen);
881 } 1152 }
882 1153
883 return 0; 1154 return 0;
884 1155
885error_free: 1156error_free:
886 kfree(base); 1157 free_page_series(region->vm_start, region->vm_end);
887 vma->vm_start = 0; 1158 region->vm_start = vma->vm_start = 0;
1159 region->vm_end = vma->vm_end = 0;
1160 region->vm_top = 0;
888 return ret; 1161 return ret;
889 1162
890enomem: 1163enomem:
891 printk("Allocation of length %lu from process %d failed\n", 1164 printk("Allocation of length %lu from process %d (%s) failed\n",
892 len, current->pid); 1165 len, current->pid, current->comm);
893 show_free_areas(); 1166 show_free_areas();
894 return -ENOMEM; 1167 return -ENOMEM;
895} 1168}
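do_mmap_private() above now grabs a power-of-2 block of pages with alloc_pages() and, unless prevented by sysctl_nr_trim_pages, gives back the excess beyond the page-aligned length in power-of-2 chunks, largest first. A standalone sketch of just that arithmetic, assuming 4 KiB pages and ignoring the sysctl gate; get_order() and ilog2() are reimplemented locally for illustration.

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_SIZE	(1UL << PAGE_SHIFT)
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

/* smallest order such that (1 << order) pages cover len bytes */
static int get_order(unsigned long len)
{
	int order = 0;

	len = (len - 1) >> PAGE_SHIFT;
	while (len) {
		order++;
		len >>= 1;
	}
	return order;
}

static int ilog2(unsigned long n)	/* floor(log2(n)), n > 0 */
{
	int l = -1;

	while (n) {
		l++;
		n >>= 1;
	}
	return l;
}

int main(void)
{
	unsigned long lens[] = { 3000, 5 * PAGE_SIZE, 9 * PAGE_SIZE + 100 };

	for (int i = 0; i < 3; i++) {
		unsigned long rlen = PAGE_ALIGN(lens[i]);
		unsigned long point = rlen >> PAGE_SHIFT;	/* pages needed */
		unsigned long total = 1UL << get_order(rlen);	/* pages grabbed */

		printf("len=%lu: need %lu page(s), alloc %lu\n",
		       lens[i], point, total);

		/* shave the excess off the tail in power-of-2 sized chunks,
		 * largest chunk first, as do_mmap_private() does */
		while (total > point) {
			unsigned long n = 1UL << ilog2(total - point);

			printf("  trim %lu page(s) at index %lu\n",
			       n, total - n);
			total -= n;
		}
	}
	return 0;
}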
@@ -904,13 +1177,14 @@ unsigned long do_mmap_pgoff(struct file *file,
904 unsigned long flags, 1177 unsigned long flags,
905 unsigned long pgoff) 1178 unsigned long pgoff)
906{ 1179{
907 struct vm_list_struct *vml = NULL; 1180 struct vm_area_struct *vma;
908 struct vm_area_struct *vma = NULL; 1181 struct vm_region *region;
909 struct rb_node *rb; 1182 struct rb_node *rb;
910 unsigned long capabilities, vm_flags; 1183 unsigned long capabilities, vm_flags, result;
911 void *result;
912 int ret; 1184 int ret;
913 1185
1186 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1187
914 if (!(flags & MAP_FIXED)) 1188 if (!(flags & MAP_FIXED))
915 addr = round_hint_to_min(addr); 1189 addr = round_hint_to_min(addr);
916 1190
@@ -918,73 +1192,120 @@ unsigned long do_mmap_pgoff(struct file *file,
918 * mapping */ 1192 * mapping */
919 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1193 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
920 &capabilities); 1194 &capabilities);
921 if (ret < 0) 1195 if (ret < 0) {
1196 kleave(" = %d [val]", ret);
922 return ret; 1197 return ret;
1198 }
923 1199
924 /* we've determined that we can make the mapping, now translate what we 1200 /* we've determined that we can make the mapping, now translate what we
925 * now know into VMA flags */ 1201 * now know into VMA flags */
926 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1202 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
927 1203
928 /* we're going to need to record the mapping if it works */ 1204 /* we're going to need to record the mapping */
929 vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); 1205 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
930 if (!vml) 1206 if (!region)
931 goto error_getting_vml; 1207 goto error_getting_region;
1208
1209 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1210 if (!vma)
1211 goto error_getting_vma;
1212
1213 atomic_set(&region->vm_usage, 1);
1214 region->vm_flags = vm_flags;
1215 region->vm_pgoff = pgoff;
1216
1217 INIT_LIST_HEAD(&vma->anon_vma_node);
1218 vma->vm_flags = vm_flags;
1219 vma->vm_pgoff = pgoff;
1220
1221 if (file) {
1222 region->vm_file = file;
1223 get_file(file);
1224 vma->vm_file = file;
1225 get_file(file);
1226 if (vm_flags & VM_EXECUTABLE) {
1227 added_exe_file_vma(current->mm);
1228 vma->vm_mm = current->mm;
1229 }
1230 }
932 1231
933 down_write(&nommu_vma_sem); 1232 down_write(&nommu_region_sem);
934 1233
935 /* if we want to share, we need to check for VMAs created by other 1234 /* if we want to share, we need to check for regions created by other
936 * mmap() calls that overlap with our proposed mapping 1235 * mmap() calls that overlap with our proposed mapping
937 * - we can only share with an exact match on most regular files 1236 * - we can only share with a superset match on most regular files
938 * - shared mappings on character devices and memory backed files are 1237 * - shared mappings on character devices and memory backed files are
939 * permitted to overlap inexactly as far as we are concerned for in 1238 * permitted to overlap inexactly as far as we are concerned for in
940 * these cases, sharing is handled in the driver or filesystem rather 1239 * these cases, sharing is handled in the driver or filesystem rather
941 * than here 1240 * than here
942 */ 1241 */
943 if (vm_flags & VM_MAYSHARE) { 1242 if (vm_flags & VM_MAYSHARE) {
944 unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1243 struct vm_region *pregion;
945 unsigned long vmpglen; 1244 unsigned long pglen, rpglen, pgend, rpgend, start;
946 1245
947 /* suppress VMA sharing for shared regions */ 1246 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
948 if (vm_flags & VM_SHARED && 1247 pgend = pgoff + pglen;
949 capabilities & BDI_CAP_MAP_DIRECT)
950 goto dont_share_VMAs;
951 1248
952 for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { 1249 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
953 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1250 pregion = rb_entry(rb, struct vm_region, vm_rb);
954 1251
955 if (!(vma->vm_flags & VM_MAYSHARE)) 1252 if (!(pregion->vm_flags & VM_MAYSHARE))
956 continue; 1253 continue;
957 1254
958 /* search for overlapping mappings on the same file */ 1255 /* search for overlapping mappings on the same file */
959 if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) 1256 if (pregion->vm_file->f_path.dentry->d_inode !=
1257 file->f_path.dentry->d_inode)
960 continue; 1258 continue;
961 1259
962 if (vma->vm_pgoff >= pgoff + pglen) 1260 if (pregion->vm_pgoff >= pgend)
963 continue; 1261 continue;
964 1262
965 vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; 1263 rpglen = pregion->vm_end - pregion->vm_start;
966 vmpglen >>= PAGE_SHIFT; 1264 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
967 if (pgoff >= vma->vm_pgoff + vmpglen) 1265 rpgend = pregion->vm_pgoff + rpglen;
1266 if (pgoff >= rpgend)
968 continue; 1267 continue;
969 1268
970 /* handle inexactly overlapping matches between mappings */ 1269 /* handle inexactly overlapping matches between
971 if (vma->vm_pgoff != pgoff || vmpglen != pglen) { 1270 * mappings */
1271 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1272 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1273 /* new mapping is not a subset of the region */
972 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1274 if (!(capabilities & BDI_CAP_MAP_DIRECT))
973 goto sharing_violation; 1275 goto sharing_violation;
974 continue; 1276 continue;
975 } 1277 }
976 1278
977 /* we've found a VMA we can share */ 1279 /* we've found a region we can share */
978 atomic_inc(&vma->vm_usage); 1280 atomic_inc(&pregion->vm_usage);
979 1281 vma->vm_region = pregion;
980 vml->vma = vma; 1282 start = pregion->vm_start;
981 result = (void *) vma->vm_start; 1283 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
982 goto shared; 1284 vma->vm_start = start;
1285 vma->vm_end = start + len;
1286
1287 if (pregion->vm_flags & VM_MAPPED_COPY) {
1288 kdebug("share copy");
1289 vma->vm_flags |= VM_MAPPED_COPY;
1290 } else {
1291 kdebug("share mmap");
1292 ret = do_mmap_shared_file(vma);
1293 if (ret < 0) {
1294 vma->vm_region = NULL;
1295 vma->vm_start = 0;
1296 vma->vm_end = 0;
1297 atomic_dec(&pregion->vm_usage);
1298 pregion = NULL;
1299 goto error_just_free;
1300 }
1301 }
1302 fput(region->vm_file);
1303 kmem_cache_free(vm_region_jar, region);
1304 region = pregion;
1305 result = start;
1306 goto share;
983 } 1307 }
984 1308
985 dont_share_VMAs:
986 vma = NULL;
987
988 /* obtain the address at which to make a shared mapping 1309 /* obtain the address at which to make a shared mapping
989 * - this is the hook for quasi-memory character devices to 1310 * - this is the hook for quasi-memory character devices to
990 * tell us the location of a shared mapping 1311 * tell us the location of a shared mapping
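The VM_MAYSHARE loop above only lets a new shared mapping reuse an existing region when the requested page range equals, or is a subset of, the range that region already covers on the same file; a partial overlap is either left to the driver (BDI_CAP_MAP_DIRECT) or rejected as a sharing violation. Below is a simplified model of just the page-offset test, with the driver escape hatch and the rejection path omitted; all values are in pages and the names are illustrative.

#include <stdbool.h>
#include <stdio.h>

struct req { unsigned long pgoff, pglen; };	/* all units are pages */

/* existing shared region on the same file, covering pages 8..23 */
static const struct req region = { .pgoff = 8, .pglen = 16 };

/* Can a new MAP_SHARED request reuse the region?  Per do_mmap_pgoff(),
 * only when its page range is a subset of (or equal to) the region's. */
static bool can_share(struct req r)
{
	unsigned long pgend  = r.pgoff + r.pglen;
	unsigned long rpgend = region.pgoff + region.pglen;

	if (r.pgoff >= rpgend || pgend <= region.pgoff)
		return false;			/* no overlap at all */

	/* exact match or subset shares; a partial overlap cannot */
	return r.pgoff >= region.pgoff && pgend <= rpgend;
}

int main(void)
{
	struct req tests[] = {
		{  8, 16 },	/* identical range: shareable */
		{ 12,  4 },	/* strict subset:   shareable */
		{  4,  8 },	/* straddles start: not shareable */
		{ 30,  2 },	/* disjoint:        not shareable */
	};

	for (int i = 0; i < 4; i++)
		printf("pgoff=%lu pglen=%lu -> %s\n",
		       tests[i].pgoff, tests[i].pglen,
		       can_share(tests[i]) ? "share" : "no share");
	return 0;
}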
@@ -995,113 +1316,93 @@ unsigned long do_mmap_pgoff(struct file *file,
995 if (IS_ERR((void *) addr)) { 1316 if (IS_ERR((void *) addr)) {
996 ret = addr; 1317 ret = addr;
997 if (ret != (unsigned long) -ENOSYS) 1318 if (ret != (unsigned long) -ENOSYS)
998 goto error; 1319 goto error_just_free;
999 1320
1000 /* the driver refused to tell us where to site 1321 /* the driver refused to tell us where to site
1001 * the mapping so we'll have to attempt to copy 1322 * the mapping so we'll have to attempt to copy
1002 * it */ 1323 * it */
1003 ret = (unsigned long) -ENODEV; 1324 ret = (unsigned long) -ENODEV;
1004 if (!(capabilities & BDI_CAP_MAP_COPY)) 1325 if (!(capabilities & BDI_CAP_MAP_COPY))
1005 goto error; 1326 goto error_just_free;
1006 1327
1007 capabilities &= ~BDI_CAP_MAP_DIRECT; 1328 capabilities &= ~BDI_CAP_MAP_DIRECT;
1329 } else {
1330 vma->vm_start = region->vm_start = addr;
1331 vma->vm_end = region->vm_end = addr + len;
1008 } 1332 }
1009 } 1333 }
1010 } 1334 }
1011 1335
1012 /* we're going to need a VMA struct as well */ 1336 vma->vm_region = region;
1013 vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
1014 if (!vma)
1015 goto error_getting_vma;
1016
1017 INIT_LIST_HEAD(&vma->anon_vma_node);
1018 atomic_set(&vma->vm_usage, 1);
1019 if (file) {
1020 get_file(file);
1021 if (vm_flags & VM_EXECUTABLE) {
1022 added_exe_file_vma(current->mm);
1023 vma->vm_mm = current->mm;
1024 }
1025 }
1026 vma->vm_file = file;
1027 vma->vm_flags = vm_flags;
1028 vma->vm_start = addr;
1029 vma->vm_end = addr + len;
1030 vma->vm_pgoff = pgoff;
1031
1032 vml->vma = vma;
1033 1337
1034 /* set up the mapping */ 1338 /* set up the mapping */
1035 if (file && vma->vm_flags & VM_SHARED) 1339 if (file && vma->vm_flags & VM_SHARED)
1036 ret = do_mmap_shared_file(vma, len); 1340 ret = do_mmap_shared_file(vma);
1037 else 1341 else
1038 ret = do_mmap_private(vma, len); 1342 ret = do_mmap_private(vma, region, len);
1039 if (ret < 0) 1343 if (ret < 0)
1040 goto error; 1344 goto error_put_region;
1041
1042 /* okay... we have a mapping; now we have to register it */
1043 result = (void *) vma->vm_start;
1044 1345
1045 if (vma->vm_flags & VM_MAPPED_COPY) { 1346 add_nommu_region(region);
1046 realalloc += kobjsize(result);
1047 askedalloc += len;
1048 }
1049 1347
1050 realalloc += kobjsize(vma); 1348 /* okay... we have a mapping; now we have to register it */
1051 askedalloc += sizeof(*vma); 1349 result = vma->vm_start;
1052 1350
1053 current->mm->total_vm += len >> PAGE_SHIFT; 1351 current->mm->total_vm += len >> PAGE_SHIFT;
1054 1352
1055 add_nommu_vma(vma); 1353share:
1056 1354 add_vma_to_mm(current->mm, vma);
1057 shared:
1058 realalloc += kobjsize(vml);
1059 askedalloc += sizeof(*vml);
1060 1355
1061 add_vma_to_mm(current->mm, vml); 1356 up_write(&nommu_region_sem);
1062
1063 up_write(&nommu_vma_sem);
1064 1357
1065 if (prot & PROT_EXEC) 1358 if (prot & PROT_EXEC)
1066 flush_icache_range((unsigned long) result, 1359 flush_icache_range(result, result + len);
1067 (unsigned long) result + len);
1068 1360
1069#ifdef DEBUG 1361 kleave(" = %lx", result);
1070 printk("do_mmap:\n"); 1362 return result;
1071 show_process_blocks();
1072#endif
1073 1363
1074 return (unsigned long) result; 1364error_put_region:
1075 1365 __put_nommu_region(region);
1076 error:
1077 up_write(&nommu_vma_sem);
1078 kfree(vml);
1079 if (vma) { 1366 if (vma) {
1080 if (vma->vm_file) { 1367 if (vma->vm_file) {
1081 fput(vma->vm_file); 1368 fput(vma->vm_file);
1082 if (vma->vm_flags & VM_EXECUTABLE) 1369 if (vma->vm_flags & VM_EXECUTABLE)
1083 removed_exe_file_vma(vma->vm_mm); 1370 removed_exe_file_vma(vma->vm_mm);
1084 } 1371 }
1085 kfree(vma); 1372 kmem_cache_free(vm_area_cachep, vma);
1086 } 1373 }
1374 kleave(" = %d [pr]", ret);
1087 return ret; 1375 return ret;
1088 1376
1089 sharing_violation: 1377error_just_free:
1090 up_write(&nommu_vma_sem); 1378 up_write(&nommu_region_sem);
1091 printk("Attempt to share mismatched mappings\n"); 1379error:
1092 kfree(vml); 1380 fput(region->vm_file);
1093 return -EINVAL; 1381 kmem_cache_free(vm_region_jar, region);
1382 fput(vma->vm_file);
1383 if (vma->vm_flags & VM_EXECUTABLE)
1384 removed_exe_file_vma(vma->vm_mm);
1385 kmem_cache_free(vm_area_cachep, vma);
1386 kleave(" = %d", ret);
1387 return ret;
1094 1388
1095 error_getting_vma: 1389sharing_violation:
1096 up_write(&nommu_vma_sem); 1390 up_write(&nommu_region_sem);
1097 kfree(vml); 1391 printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1098 printk("Allocation of vma for %lu byte allocation from process %d failed\n", 1392 ret = -EINVAL;
1393 goto error;
1394
1395error_getting_vma:
1396 kmem_cache_free(vm_region_jar, region);
1397 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1398 " from process %d failed\n",
1099 len, current->pid); 1399 len, current->pid);
1100 show_free_areas(); 1400 show_free_areas();
1101 return -ENOMEM; 1401 return -ENOMEM;
1102 1402
1103 error_getting_vml: 1403error_getting_region:
1104 printk("Allocation of vml for %lu byte allocation from process %d failed\n", 1404 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1405 " from process %d failed\n",
1105 len, current->pid); 1406 len, current->pid);
1106 show_free_areas(); 1407 show_free_areas();
1107 return -ENOMEM; 1408 return -ENOMEM;
@@ -1109,90 +1410,188 @@ unsigned long do_mmap_pgoff(struct file *file,
1109EXPORT_SYMBOL(do_mmap_pgoff); 1410EXPORT_SYMBOL(do_mmap_pgoff);
1110 1411
1111/* 1412/*
1112 * handle mapping disposal for uClinux 1413 * split a vma into two pieces at address 'addr', a new vma is allocated either
1414 * for the first part or the tail.
1113 */ 1415 */
1114static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) 1416int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1417 unsigned long addr, int new_below)
1115{ 1418{
1116 if (vma) { 1419 struct vm_area_struct *new;
1117 down_write(&nommu_vma_sem); 1420 struct vm_region *region;
1421 unsigned long npages;
1118 1422
1119 if (atomic_dec_and_test(&vma->vm_usage)) { 1423 kenter("");
1120 delete_nommu_vma(vma);
1121 1424
1122 if (vma->vm_ops && vma->vm_ops->close) 1425 /* we're only permitted to split anonymous regions that have a single
1123 vma->vm_ops->close(vma); 1426 * owner */
1427 if (vma->vm_file ||
1428 atomic_read(&vma->vm_region->vm_usage) != 1)
1429 return -ENOMEM;
1124 1430
1125 /* IO memory and memory shared directly out of the pagecache from 1431 if (mm->map_count >= sysctl_max_map_count)
1126 * ramfs/tmpfs mustn't be released here */ 1432 return -ENOMEM;
1127 if (vma->vm_flags & VM_MAPPED_COPY) {
1128 realalloc -= kobjsize((void *) vma->vm_start);
1129 askedalloc -= vma->vm_end - vma->vm_start;
1130 kfree((void *) vma->vm_start);
1131 }
1132 1433
1133 realalloc -= kobjsize(vma); 1434 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1134 askedalloc -= sizeof(*vma); 1435 if (!region)
1436 return -ENOMEM;
1135 1437
1136 if (vma->vm_file) { 1438 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1137 fput(vma->vm_file); 1439 if (!new) {
1138 if (vma->vm_flags & VM_EXECUTABLE) 1440 kmem_cache_free(vm_region_jar, region);
1139 removed_exe_file_vma(mm); 1441 return -ENOMEM;
1140 } 1442 }
1141 kfree(vma); 1443
1142 } 1444 /* most fields are the same, copy all, and then fixup */
1445 *new = *vma;
1446 *region = *vma->vm_region;
1447 new->vm_region = region;
1448
1449 npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1143 1450
1144 up_write(&nommu_vma_sem); 1451 if (new_below) {
1452 region->vm_top = region->vm_end = new->vm_end = addr;
1453 } else {
1454 region->vm_start = new->vm_start = addr;
1455 region->vm_pgoff = new->vm_pgoff += npages;
1145 } 1456 }
1457
1458 if (new->vm_ops && new->vm_ops->open)
1459 new->vm_ops->open(new);
1460
1461 delete_vma_from_mm(vma);
1462 down_write(&nommu_region_sem);
1463 delete_nommu_region(vma->vm_region);
1464 if (new_below) {
1465 vma->vm_region->vm_start = vma->vm_start = addr;
1466 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1467 } else {
1468 vma->vm_region->vm_end = vma->vm_end = addr;
1469 vma->vm_region->vm_top = addr;
1470 }
1471 add_nommu_region(vma->vm_region);
1472 add_nommu_region(new->vm_region);
1473 up_write(&nommu_region_sem);
1474 add_vma_to_mm(mm, vma);
1475 add_vma_to_mm(mm, new);
1476 return 0;
1146} 1477}
1147 1478
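split_vma() above cuts an anonymous, single-owner VMA (and its cloned region) in two at addr, and whichever piece starts at addr advances its page offset by the number of pages in front of it. A small worked example of that offset arithmetic, assuming 4 KiB pages; the tree bookkeeping and the region cloning are left out.

#include <stdio.h>

#define PAGE_SHIFT 12

struct part {			/* toy VMA/region pair after a split */
	unsigned long start, end, pgoff;
};

/* split [start,end) at addr; the piece that begins at addr advances its
 * page offset by the number of pages preceding it, as split_vma() does */
static void split(struct part v, unsigned long addr,
		  struct part *head, struct part *tail)
{
	unsigned long npages = (addr - v.start) >> PAGE_SHIFT;

	*head = v;
	head->end = addr;

	*tail = v;
	tail->start = addr;
	tail->pgoff += npages;
}

int main(void)
{
	/* anonymous mapping of 8 pages starting at 0x100000, pgoff 0 */
	struct part vma = { 0x100000, 0x108000, 0 }, head, tail;

	split(vma, 0x103000, &head, &tail);
	printf("head: %#lx-%#lx pgoff=%lu\n", head.start, head.end, head.pgoff);
	printf("tail: %#lx-%#lx pgoff=%lu\n", tail.start, tail.end, tail.pgoff);
	return 0;
}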
1148/* 1479/*
1149 * release a mapping 1480 * shrink a VMA by removing the specified chunk from either the beginning or
1150 * - under NOMMU conditions the parameters must match exactly to the mapping to 1481 * the end
1151 * be removed
1152 */ 1482 */
1153int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1483static int shrink_vma(struct mm_struct *mm,
1484 struct vm_area_struct *vma,
1485 unsigned long from, unsigned long to)
1154{ 1486{
1155 struct vm_list_struct *vml, **parent; 1487 struct vm_region *region;
1156 unsigned long end = addr + len;
1157 1488
1158#ifdef DEBUG 1489 kenter("");
1159 printk("do_munmap:\n");
1160#endif
1161 1490
1162 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { 1491 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1163 if ((*parent)->vma->vm_start > addr) 1492 * and list */
1164 break; 1493 delete_vma_from_mm(vma);
1165 if ((*parent)->vma->vm_start == addr && 1494 if (from > vma->vm_start)
1166 ((len == 0) || ((*parent)->vma->vm_end == end))) 1495 vma->vm_end = from;
1167 goto found; 1496 else
1497 vma->vm_start = to;
1498 add_vma_to_mm(mm, vma);
1499
1500 /* cut the backing region down to size */
1501 region = vma->vm_region;
1502 BUG_ON(atomic_read(&region->vm_usage) != 1);
1503
1504 down_write(&nommu_region_sem);
1505 delete_nommu_region(region);
1506 if (from > region->vm_start) {
1507 to = region->vm_top;
1508 region->vm_top = region->vm_end = from;
1509 } else {
1510 region->vm_start = to;
1168 } 1511 }
1512 add_nommu_region(region);
1513 up_write(&nommu_region_sem);
1169 1514
1170 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1515 free_page_series(from, to);
1171 current->pid, current->comm, (void *) addr); 1516 return 0;
1172 return -EINVAL; 1517}
1173 1518
1174 found: 1519/*
1175 vml = *parent; 1520 * release a mapping
1521 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1522 * VMA, though it need not cover the whole VMA
1523 */
1524int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1525{
1526 struct vm_area_struct *vma;
1527 struct rb_node *rb;
1528 unsigned long end = start + len;
1529 int ret;
1176 1530
1177 put_vma(mm, vml->vma); 1531 kenter(",%lx,%zx", start, len);
1178 1532
1179 *parent = vml->next; 1533 if (len == 0)
1180 realalloc -= kobjsize(vml); 1534 return -EINVAL;
1181 askedalloc -= sizeof(*vml);
1182 kfree(vml);
1183 1535
1184 update_hiwater_vm(mm); 1536 /* find the first potentially overlapping VMA */
1185 mm->total_vm -= len >> PAGE_SHIFT; 1537 vma = find_vma(mm, start);
1538 if (!vma) {
1539 printk(KERN_WARNING
1540 "munmap of memory not mmapped by process %d (%s):"
1541 " 0x%lx-0x%lx\n",
1542 current->pid, current->comm, start, start + len - 1);
1543 return -EINVAL;
1544 }
1186 1545
1187#ifdef DEBUG 1546 /* we're allowed to split an anonymous VMA but not a file-backed one */
1188 show_process_blocks(); 1547 if (vma->vm_file) {
1189#endif 1548 do {
1549 if (start > vma->vm_start) {
1550 kleave(" = -EINVAL [miss]");
1551 return -EINVAL;
1552 }
1553 if (end == vma->vm_end)
1554 goto erase_whole_vma;
1555 rb = rb_next(&vma->vm_rb);
1556 vma = rb_entry(rb, struct vm_area_struct, vm_rb);
1557 } while (rb);
1558 kleave(" = -EINVAL [split file]");
1559 return -EINVAL;
1560 } else {
1561 /* the chunk must be a subset of the VMA found */
1562 if (start == vma->vm_start && end == vma->vm_end)
1563 goto erase_whole_vma;
1564 if (start < vma->vm_start || end > vma->vm_end) {
1565 kleave(" = -EINVAL [superset]");
1566 return -EINVAL;
1567 }
1568 if (start & ~PAGE_MASK) {
1569 kleave(" = -EINVAL [unaligned start]");
1570 return -EINVAL;
1571 }
1572 if (end != vma->vm_end && end & ~PAGE_MASK) {
1573 kleave(" = -EINVAL [unaligned split]");
1574 return -EINVAL;
1575 }
1576 if (start != vma->vm_start && end != vma->vm_end) {
1577 ret = split_vma(mm, vma, start, 1);
1578 if (ret < 0) {
1579 kleave(" = %d [split]", ret);
1580 return ret;
1581 }
1582 }
1583 return shrink_vma(mm, vma, start, end);
1584 }
1190 1585
1586erase_whole_vma:
1587 delete_vma_from_mm(vma);
1588 delete_vma(mm, vma);
1589 kleave(" = 0");
1191 return 0; 1590 return 0;
1192} 1591}
1193EXPORT_SYMBOL(do_munmap); 1592EXPORT_SYMBOL(do_munmap);
1194 1593
1195asmlinkage long sys_munmap(unsigned long addr, size_t len) 1594SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
1196{ 1595{
1197 int ret; 1596 int ret;
1198 struct mm_struct *mm = current->mm; 1597 struct mm_struct *mm = current->mm;
@@ -1204,32 +1603,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
1204} 1603}
1205 1604
1206/* 1605/*
1207 * Release all mappings 1606 * release all the mappings made in a process's VM space
1208 */ 1607 */
1209void exit_mmap(struct mm_struct * mm) 1608void exit_mmap(struct mm_struct *mm)
1210{ 1609{
1211 struct vm_list_struct *tmp; 1610 struct vm_area_struct *vma;
1212
1213 if (mm) {
1214#ifdef DEBUG
1215 printk("Exit_mmap:\n");
1216#endif
1217 1611
1218 mm->total_vm = 0; 1612 if (!mm)
1613 return;
1219 1614
1220 while ((tmp = mm->context.vmlist)) { 1615 kenter("");
1221 mm->context.vmlist = tmp->next;
1222 put_vma(mm, tmp->vma);
1223 1616
1224 realalloc -= kobjsize(tmp); 1617 mm->total_vm = 0;
1225 askedalloc -= sizeof(*tmp);
1226 kfree(tmp);
1227 }
1228 1618
1229#ifdef DEBUG 1619 while ((vma = mm->mmap)) {
1230 show_process_blocks(); 1620 mm->mmap = vma->vm_next;
1231#endif 1621 delete_vma_from_mm(vma);
1622 delete_vma(mm, vma);
1232 } 1623 }
1624
1625 kleave("");
1233} 1626}
1234 1627
1235unsigned long do_brk(unsigned long addr, unsigned long len) 1628unsigned long do_brk(unsigned long addr, unsigned long len)
@@ -1242,8 +1635,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1242 * time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1635 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1243 * 1636 *
1244 * under NOMMU conditions, we only permit changing a mapping's size, and only 1637 * under NOMMU conditions, we only permit changing a mapping's size, and only
1245 * as long as it stays within the hole allocated by the kmalloc() call in 1638 * as long as it stays within the region allocated by do_mmap_private() and the
1246 * do_mmap_pgoff() and the block is not shareable 1639 * block is not shareable
1247 * 1640 *
1248 * MREMAP_FIXED is not supported under NOMMU conditions 1641 * MREMAP_FIXED is not supported under NOMMU conditions
1249 */ 1642 */
@@ -1254,13 +1647,16 @@ unsigned long do_mremap(unsigned long addr,
1254 struct vm_area_struct *vma; 1647 struct vm_area_struct *vma;
1255 1648
1256 /* insanity checks first */ 1649 /* insanity checks first */
1257 if (new_len == 0) 1650 if (old_len == 0 || new_len == 0)
1258 return (unsigned long) -EINVAL; 1651 return (unsigned long) -EINVAL;
1259 1652
1653 if (addr & ~PAGE_MASK)
1654 return -EINVAL;
1655
1260 if (flags & MREMAP_FIXED && new_addr != addr) 1656 if (flags & MREMAP_FIXED && new_addr != addr)
1261 return (unsigned long) -EINVAL; 1657 return (unsigned long) -EINVAL;
1262 1658
1263 vma = find_vma_exact(current->mm, addr); 1659 vma = find_vma_exact(current->mm, addr, old_len);
1264 if (!vma) 1660 if (!vma)
1265 return (unsigned long) -EINVAL; 1661 return (unsigned long) -EINVAL;
1266 1662
@@ -1270,22 +1666,18 @@ unsigned long do_mremap(unsigned long addr,
1270 if (vma->vm_flags & VM_MAYSHARE) 1666 if (vma->vm_flags & VM_MAYSHARE)
1271 return (unsigned long) -EPERM; 1667 return (unsigned long) -EPERM;
1272 1668
1273 if (new_len > kobjsize((void *) addr)) 1669 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1274 return (unsigned long) -ENOMEM; 1670 return (unsigned long) -ENOMEM;
1275 1671
1276 /* all checks complete - do it */ 1672 /* all checks complete - do it */
1277 vma->vm_end = vma->vm_start + new_len; 1673 vma->vm_end = vma->vm_start + new_len;
1278
1279 askedalloc -= old_len;
1280 askedalloc += new_len;
1281
1282 return vma->vm_start; 1674 return vma->vm_start;
1283} 1675}
1284EXPORT_SYMBOL(do_mremap); 1676EXPORT_SYMBOL(do_mremap);
1285 1677
1286asmlinkage unsigned long sys_mremap(unsigned long addr, 1678SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
1287 unsigned long old_len, unsigned long new_len, 1679 unsigned long, new_len, unsigned long, flags,
1288 unsigned long flags, unsigned long new_addr) 1680 unsigned long, new_addr)
1289{ 1681{
1290 unsigned long ret; 1682 unsigned long ret;
1291 1683
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9afe6e4e..40ba05061a4f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
31int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
32int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
33int sysctl_oom_dump_tasks; 33int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_mutex); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/** 37/**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
392 printk(KERN_WARNING "%s invoked oom-killer: " 392 printk(KERN_WARNING "%s invoked oom-killer: "
393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
394 current->comm, gfp_mask, order, current->oomkilladj); 394 current->comm, gfp_mask, order, current->oomkilladj);
395 task_lock(current);
396 cpuset_print_task_mems_allowed(current);
397 task_unlock(current);
395 dump_stack(); 398 dump_stack();
396 show_mem(); 399 show_mem();
397 if (sysctl_oom_dump_tasks) 400 if (sysctl_oom_dump_tasks)
@@ -426,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
426 unsigned long points = 0; 429 unsigned long points = 0;
427 struct task_struct *p; 430 struct task_struct *p;
428 431
429 cgroup_lock();
430 read_lock(&tasklist_lock); 432 read_lock(&tasklist_lock);
431retry: 433retry:
432 p = select_bad_process(&points, mem); 434 p = select_bad_process(&points, mem);
@@ -441,7 +443,6 @@ retry:
441 goto retry; 443 goto retry;
442out: 444out:
443 read_unlock(&tasklist_lock); 445 read_unlock(&tasklist_lock);
444 cgroup_unlock();
445} 446}
446#endif 447#endif
447 448
@@ -470,7 +471,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
470 struct zone *zone; 471 struct zone *zone;
471 int ret = 1; 472 int ret = 1;
472 473
473 spin_lock(&zone_scan_mutex); 474 spin_lock(&zone_scan_lock);
474 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 475 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
475 if (zone_is_oom_locked(zone)) { 476 if (zone_is_oom_locked(zone)) {
476 ret = 0; 477 ret = 0;
@@ -480,7 +481,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
480 481
481 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 482 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
482 /* 483 /*
483 * Lock each zone in the zonelist under zone_scan_mutex so a 484 * Lock each zone in the zonelist under zone_scan_lock so a
484 * parallel invocation of try_set_zone_oom() doesn't succeed 485 * parallel invocation of try_set_zone_oom() doesn't succeed
485 * when it shouldn't. 486 * when it shouldn't.
486 */ 487 */
@@ -488,7 +489,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
488 } 489 }
489 490
490out: 491out:
491 spin_unlock(&zone_scan_mutex); 492 spin_unlock(&zone_scan_lock);
492 return ret; 493 return ret;
493} 494}
494 495
@@ -502,11 +503,82 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
502 struct zoneref *z; 503 struct zoneref *z;
503 struct zone *zone; 504 struct zone *zone;
504 505
505 spin_lock(&zone_scan_mutex); 506 spin_lock(&zone_scan_lock);
506 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 507 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
507 zone_clear_flag(zone, ZONE_OOM_LOCKED); 508 zone_clear_flag(zone, ZONE_OOM_LOCKED);
508 } 509 }
509 spin_unlock(&zone_scan_mutex); 510 spin_unlock(&zone_scan_lock);
511}
512
513/*
514 * Must be called with tasklist_lock held for read.
515 */
516static void __out_of_memory(gfp_t gfp_mask, int order)
517{
518 if (sysctl_oom_kill_allocating_task) {
519 oom_kill_process(current, gfp_mask, order, 0, NULL,
520 "Out of memory (oom_kill_allocating_task)");
521
522 } else {
523 unsigned long points;
524 struct task_struct *p;
525
526retry:
527 /*
528 * Rambo mode: Shoot down a process and hope it solves whatever
529 * issues we may have.
530 */
531 p = select_bad_process(&points, NULL);
532
533 if (PTR_ERR(p) == -1UL)
534 return;
535
536 /* Found nothing?!?! Either we hang forever, or we panic. */
537 if (!p) {
538 read_unlock(&tasklist_lock);
539 panic("Out of memory and no killable processes...\n");
540 }
541
542 if (oom_kill_process(p, gfp_mask, order, points, NULL,
543 "Out of memory"))
544 goto retry;
545 }
546}
547
548/*
549 * pagefault handler calls into here because it is out of memory but
550 * doesn't know exactly how or why.
551 */
552void pagefault_out_of_memory(void)
553{
554 unsigned long freed = 0;
555
556 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
557 if (freed > 0)
558 /* Got some memory back in the last second. */
559 return;
560
561 /*
562 * If this is from memcg, oom-killer is already invoked.
563 * and not worth to go system-wide-oom.
564 */
565 if (mem_cgroup_oom_called(current))
566 goto rest_and_return;
567
568 if (sysctl_panic_on_oom)
569 panic("out of memory from page fault. panic_on_oom is selected.\n");
570
571 read_lock(&tasklist_lock);
572 __out_of_memory(0, 0); /* unknown gfp_mask and order */
573 read_unlock(&tasklist_lock);
574
575 /*
576 * Give "p" a good chance of killing itself before we
577 * retry to allocate memory.
578 */
579rest_and_return:
580 if (!test_thread_flag(TIF_MEMDIE))
581 schedule_timeout_uninterruptible(1);
510} 582}
511 583
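__out_of_memory() above factors out the existing select-and-retry loop: pick the task with the highest badness score, and if it cannot be killed (or the attempt fails), go round again; if nothing killable remains, the kernel panics. A schematic of that control flow with a toy badness score standing in for select_bad_process(); the threshold used to fake a failed kill is purely illustrative.

#include <stdbool.h>
#include <stdio.h>

struct task {
	const char *comm;
	unsigned long badness;	/* stand-in for the badness() score */
	bool unkillable;	/* already exiting, or a kill attempt failed */
};

/* schematic of the select-and-retry loop in __out_of_memory(): choose the
 * "worst" killable task and, if killing it does not work out, try again */
static struct task *select_victim(struct task *tasks, int n)
{
	struct task *victim;

retry:
	victim = NULL;
	for (int i = 0; i < n; i++) {
		if (tasks[i].unkillable || tasks[i].badness == 0)
			continue;
		if (!victim || tasks[i].badness > victim->badness)
			victim = &tasks[i];
	}
	if (!victim)
		return NULL;	/* the kernel would panic at this point */

	if (victim->badness > 1000) {	/* pretend this kill attempt failed */
		victim->unkillable = true;
		goto retry;
	}
	return victim;
}

int main(void)
{
	struct task tasks[] = {
		{ "init",      0, false },	/* score 0: never selected */
		{ "hog",    5000, false },	/* picked first, "fails" */
		{ "editor",  300, false },	/* picked on the retry */
	};
	struct task *v = select_victim(tasks, 3);

	printf("victim: %s\n", v ? v->comm : "(none: would panic)");
	return 0;
}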
512/** 584/**
@@ -522,8 +594,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
522 */ 594 */
523void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 595void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
524{ 596{
525 struct task_struct *p;
526 unsigned long points = 0;
527 unsigned long freed = 0; 597 unsigned long freed = 0;
528 enum oom_constraint constraint; 598 enum oom_constraint constraint;
529 599
@@ -544,7 +614,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
544 614
545 switch (constraint) { 615 switch (constraint) {
546 case CONSTRAINT_MEMORY_POLICY: 616 case CONSTRAINT_MEMORY_POLICY:
547 oom_kill_process(current, gfp_mask, order, points, NULL, 617 oom_kill_process(current, gfp_mask, order, 0, NULL,
548 "No available memory (MPOL_BIND)"); 618 "No available memory (MPOL_BIND)");
549 break; 619 break;
550 620
@@ -553,35 +623,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
553 panic("out of memory. panic_on_oom is selected\n"); 623 panic("out of memory. panic_on_oom is selected\n");
554 /* Fall-through */ 624 /* Fall-through */
555 case CONSTRAINT_CPUSET: 625 case CONSTRAINT_CPUSET:
556 if (sysctl_oom_kill_allocating_task) { 626 __out_of_memory(gfp_mask, order);
557 oom_kill_process(current, gfp_mask, order, points, NULL,
558 "Out of memory (oom_kill_allocating_task)");
559 break;
560 }
561retry:
562 /*
563 * Rambo mode: Shoot down a process and hope it solves whatever
564 * issues we may have.
565 */
566 p = select_bad_process(&points, NULL);
567
568 if (PTR_ERR(p) == -1UL)
569 goto out;
570
571 /* Found nothing?!?! Either we hang forever, or we panic. */
572 if (!p) {
573 read_unlock(&tasklist_lock);
574 panic("Out of memory and no killable processes...\n");
575 }
576
577 if (oom_kill_process(p, gfp_mask, order, points, NULL,
578 "Out of memory"))
579 goto retry;
580
581 break; 627 break;
582 } 628 }
583 629
584out:
585 read_unlock(&tasklist_lock); 630 read_unlock(&tasklist_lock);
586 631
587 /* 632 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35fd03f..dc32dae01e5f 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
73 * dirty_background_ratio * the amount of dirtyable memory
74 */
75unsigned long dirty_background_bytes;
76
77/*
72 * free highmem will not be subtracted from the total free memory 78 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true 79 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */ 80 */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
80int vm_dirty_ratio = 10; 86int vm_dirty_ratio = 10;
81 87
82/* 88/*
89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
90 * vm_dirty_ratio * the amount of dirtyable memory
91 */
92unsigned long vm_dirty_bytes;
93
94/*
83 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks, in jiffies
84 */ 96 */
85int dirty_writeback_interval = 5 * HZ; 97int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
135{ 147{
136 unsigned long dirty_total; 148 unsigned long dirty_total;
137 149
138 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; 150 if (vm_dirty_bytes)
151 dirty_total = vm_dirty_bytes / PAGE_SIZE;
152 else
153 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
154 100;
139 return 2 + ilog2(dirty_total - 1); 155 return 2 + ilog2(dirty_total - 1);
140} 156}
141 157
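calc_period_shift() above now derives the completion-tracking shift from either vm_dirty_bytes (when set) or vm_dirty_ratio, as 2 + ilog2(dirty_total - 1). A worked example of that computation, assuming 4 KiB pages; the amount of dirtyable memory is a made-up figure and ilog2() is reimplemented locally.

#include <stdio.h>

#define PAGE_SIZE 4096UL

static int ilog2(unsigned long n)	/* floor(log2(n)), n > 0 */
{
	int l = -1;

	while (n) {
		l++;
		n >>= 1;
	}
	return l;
}

/* mirrors the new calc_period_shift(): the dirty total is taken from
 * vm_dirty_bytes when set, otherwise from vm_dirty_ratio */
static int calc_period_shift(unsigned long dirty_bytes, int dirty_ratio,
			     unsigned long dirtyable_pages)
{
	unsigned long dirty_total;

	if (dirty_bytes)
		dirty_total = dirty_bytes / PAGE_SIZE;
	else
		dirty_total = dirty_ratio * dirtyable_pages / 100;
	return 2 + ilog2(dirty_total - 1);
}

int main(void)
{
	unsigned long dirtyable = 2UL << 20;	/* ~2M pages of 4 KiB, i.e. 8 GiB */

	/* ratio-based: 10% of the dirtyable memory */
	printf("shift (ratio 10%%)  = %d\n",
	       calc_period_shift(0, 10, dirtyable));
	/* bytes-based: a hard 256 MiB limit overrides the ratio */
	printf("shift (256 MiB cap) = %d\n",
	       calc_period_shift(256UL << 20, 10, dirtyable));
	return 0;
}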
142/* 158/*
143 * update the period when the dirty ratio changes. 159 * update the period when the dirty threshold changes.
144 */ 160 */
161static void update_completion_period(void)
162{
163 int shift = calc_period_shift();
164 prop_change_shift(&vm_completions, shift);
165 prop_change_shift(&vm_dirties, shift);
166}
167
168int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp,
170 loff_t *ppos)
171{
172 int ret;
173
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
175 if (ret == 0 && write)
176 dirty_background_bytes = 0;
177 return ret;
178}
179
180int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp,
182 loff_t *ppos)
183{
184 int ret;
185
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
187 if (ret == 0 && write)
188 dirty_background_ratio = 0;
189 return ret;
190}
191
145int dirty_ratio_handler(struct ctl_table *table, int write, 192int dirty_ratio_handler(struct ctl_table *table, int write,
146 struct file *filp, void __user *buffer, size_t *lenp, 193 struct file *filp, void __user *buffer, size_t *lenp,
147 loff_t *ppos) 194 loff_t *ppos)
148{ 195{
149 int old_ratio = vm_dirty_ratio; 196 int old_ratio = vm_dirty_ratio;
150 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 197 int ret;
198
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
151 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
152 int shift = calc_period_shift(); 201 update_completion_period();
153 prop_change_shift(&vm_completions, shift); 202 vm_dirty_bytes = 0;
154 prop_change_shift(&vm_dirties, shift); 203 }
204 return ret;
205}
206
207
208int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos)
211{
212 int old_bytes = vm_dirty_bytes;
213 int ret;
214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period();
218 vm_dirty_ratio = 0;
155 } 219 }
156 return ret; 220 return ret;
157} 221}
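The new sysctl handlers above keep the ratio and bytes knobs mutually exclusive: a successful write to vm_dirty_ratio zeroes vm_dirty_bytes and vice versa, and likewise for the background pair. A stripped-down sketch of that invariant with the proc-handler plumbing removed; only the core "last writer wins" behaviour is shown.

#include <stdio.h>

/* exactly one of the pair is authoritative at a time, as with
 * vm_dirty_ratio / vm_dirty_bytes after this change */
static int dirty_ratio = 10;		/* percent */
static unsigned long dirty_bytes;	/* 0 == disabled */

static void set_dirty_ratio(int ratio)
{
	dirty_ratio = ratio;
	dirty_bytes = 0;		/* the ratio now wins */
}

static void set_dirty_bytes(unsigned long bytes)
{
	dirty_bytes = bytes;
	dirty_ratio = 0;		/* the byte count now wins */
}

int main(void)
{
	set_dirty_bytes(64UL << 20);
	printf("ratio=%d bytes=%lu\n", dirty_ratio, dirty_bytes);
	set_dirty_ratio(20);
	printf("ratio=%d bytes=%lu\n", dirty_ratio, dirty_bytes);
	return 0;
}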
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
362} 426}
363 427
364void 428void
365get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, 429get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
366 struct backing_dev_info *bdi) 430 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
367{ 431{
368 int background_ratio; /* Percentages */ 432 unsigned long background;
369 int dirty_ratio; 433 unsigned long dirty;
370 long background;
371 long dirty;
372 unsigned long available_memory = determine_dirtyable_memory(); 434 unsigned long available_memory = determine_dirtyable_memory();
373 struct task_struct *tsk; 435 struct task_struct *tsk;
374 436
375 dirty_ratio = vm_dirty_ratio; 437 if (vm_dirty_bytes)
376 if (dirty_ratio < 5) 438 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
377 dirty_ratio = 5; 439 else {
440 int dirty_ratio;
441
442 dirty_ratio = vm_dirty_ratio;
443 if (dirty_ratio < 5)
444 dirty_ratio = 5;
445 dirty = (dirty_ratio * available_memory) / 100;
446 }
378 447
379 background_ratio = dirty_background_ratio; 448 if (dirty_background_bytes)
380 if (background_ratio >= dirty_ratio) 449 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
381 background_ratio = dirty_ratio / 2; 450 else
451 background = (dirty_background_ratio * available_memory) / 100;
382 452
383 background = (background_ratio * available_memory) / 100; 453 if (background >= dirty)
384 dirty = (dirty_ratio * available_memory) / 100; 454 background = dirty / 2;
385 tsk = current; 455 tsk = current;
386 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 456 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
387 background += background / 4; 457 background += background / 4;
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
423{ 493{
424 long nr_reclaimable, bdi_nr_reclaimable; 494 long nr_reclaimable, bdi_nr_reclaimable;
425 long nr_writeback, bdi_nr_writeback; 495 long nr_writeback, bdi_nr_writeback;
426 long background_thresh; 496 unsigned long background_thresh;
427 long dirty_thresh; 497 unsigned long dirty_thresh;
428 long bdi_thresh; 498 unsigned long bdi_thresh;
429 unsigned long pages_written = 0; 499 unsigned long pages_written = 0;
430 unsigned long write_chunk = sync_writeback_pages(); 500 unsigned long write_chunk = sync_writeback_pages();
431 501
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
580 650
581void throttle_vm_writeout(gfp_t gfp_mask) 651void throttle_vm_writeout(gfp_t gfp_mask)
582{ 652{
583 long background_thresh; 653 unsigned long background_thresh;
584 long dirty_thresh; 654 unsigned long dirty_thresh;
585 655
586 for ( ; ; ) { 656 for ( ; ; ) {
587 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
624 }; 694 };
625 695
626 for ( ; ; ) { 696 for ( ; ; ) {
627 long background_thresh; 697 unsigned long background_thresh;
628 long dirty_thresh; 698 unsigned long dirty_thresh;
629 699
630 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
631 if (global_page_state(NR_FILE_DIRTY) + 701 if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
868 int done = 0; 938 int done = 0;
869 struct pagevec pvec; 939 struct pagevec pvec;
870 int nr_pages; 940 int nr_pages;
941 pgoff_t uninitialized_var(writeback_index);
871 pgoff_t index; 942 pgoff_t index;
872 pgoff_t end; /* Inclusive */ 943 pgoff_t end; /* Inclusive */
873 int scanned = 0; 944 pgoff_t done_index;
945 int cycled;
874 int range_whole = 0; 946 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write; 947 long nr_to_write = wbc->nr_to_write;
876 948
@@ -881,83 +953,143 @@ int write_cache_pages(struct address_space *mapping,
881 953
882 pagevec_init(&pvec, 0); 954 pagevec_init(&pvec, 0);
883 if (wbc->range_cyclic) { 955 if (wbc->range_cyclic) {
884 index = mapping->writeback_index; /* Start from prev offset */ 956 writeback_index = mapping->writeback_index; /* prev offset */
957 index = writeback_index;
958 if (index == 0)
959 cycled = 1;
960 else
961 cycled = 0;
885 end = -1; 962 end = -1;
886 } else { 963 } else {
887 index = wbc->range_start >> PAGE_CACHE_SHIFT; 964 index = wbc->range_start >> PAGE_CACHE_SHIFT;
888 end = wbc->range_end >> PAGE_CACHE_SHIFT; 965 end = wbc->range_end >> PAGE_CACHE_SHIFT;
889 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
890 range_whole = 1; 967 range_whole = 1;
891 scanned = 1; 968 cycled = 1; /* ignore range_cyclic tests */
892 } 969 }
893retry: 970retry:
894 while (!done && (index <= end) && 971 done_index = index;
895 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 972 while (!done && (index <= end)) {
896 PAGECACHE_TAG_DIRTY, 973 int i;
897 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 974
898 unsigned i; 975 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
976 PAGECACHE_TAG_DIRTY,
977 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
978 if (nr_pages == 0)
979 break;
899 980
900 scanned = 1;
901 for (i = 0; i < nr_pages; i++) { 981 for (i = 0; i < nr_pages; i++) {
902 struct page *page = pvec.pages[i]; 982 struct page *page = pvec.pages[i];
903 983
904 /* 984 /*
905 * At this point we hold neither mapping->tree_lock nor 985 * At this point, the page may be truncated or
906 * lock on the page itself: the page may be truncated or 986 * invalidated (changing page->mapping to NULL), or
907 * invalidated (changing page->mapping to NULL), or even 987 * even swizzled back from swapper_space to tmpfs file
908 * swizzled back from swapper_space to tmpfs file 988 * mapping. However, page->index will not change
909 * mapping 989 * because we have a reference on the page.
910 */ 990 */
991 if (page->index > end) {
992 /*
993 * can't be range_cyclic (1st pass) because
994 * end == -1 in that case.
995 */
996 done = 1;
997 break;
998 }
999
1000 done_index = page->index + 1;
1001
911 lock_page(page); 1002 lock_page(page);
912 1003
1004 /*
1005 * Page truncated or invalidated. We can freely skip it
1006 * then, even for data integrity operations: the page
1007 * has disappeared concurrently, so there could be no
 1008 * real expectation of this data integrity operation
1009 * even if there is now a new, dirty page at the same
1010 * pagecache address.
1011 */
913 if (unlikely(page->mapping != mapping)) { 1012 if (unlikely(page->mapping != mapping)) {
1013continue_unlock:
914 unlock_page(page); 1014 unlock_page(page);
915 continue; 1015 continue;
916 } 1016 }
917 1017
918 if (!wbc->range_cyclic && page->index > end) { 1018 if (!PageDirty(page)) {
919 done = 1; 1019 /* someone wrote it for us */
920 unlock_page(page); 1020 goto continue_unlock;
921 continue;
922 } 1021 }
923 1022
924 if (wbc->sync_mode != WB_SYNC_NONE) 1023 if (PageWriteback(page)) {
925 wait_on_page_writeback(page); 1024 if (wbc->sync_mode != WB_SYNC_NONE)
926 1025 wait_on_page_writeback(page);
927 if (PageWriteback(page) || 1026 else
928 !clear_page_dirty_for_io(page)) { 1027 goto continue_unlock;
929 unlock_page(page);
930 continue;
931 } 1028 }
932 1029
933 ret = (*writepage)(page, wbc, data); 1030 BUG_ON(PageWriteback(page));
1031 if (!clear_page_dirty_for_io(page))
1032 goto continue_unlock;
934 1033
935 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 1034 ret = (*writepage)(page, wbc, data);
936 unlock_page(page); 1035 if (unlikely(ret)) {
937 ret = 0; 1036 if (ret == AOP_WRITEPAGE_ACTIVATE) {
938 } 1037 unlock_page(page);
939 if (ret || (--nr_to_write <= 0)) 1038 ret = 0;
1039 } else {
1040 /*
1041 * done_index is set past this page,
1042 * so media errors will not choke
1043 * background writeout for the entire
1044 * file. This has consequences for
1045 * range_cyclic semantics (ie. it may
1046 * not be suitable for data integrity
1047 * writeout).
1048 */
1049 done = 1;
1050 break;
1051 }
1052 }
1053
1054 if (nr_to_write > 0)
1055 nr_to_write--;
1056 else if (wbc->sync_mode == WB_SYNC_NONE) {
1057 /*
1058 * We stop writing back only if we are not
1059 * doing integrity sync. In case of integrity
1060 * sync we have to keep going because someone
1061 * may be concurrently dirtying pages, and we
1062 * might have synced a lot of newly appeared
1063 * dirty pages, but have not synced all of the
1064 * old dirty pages.
1065 */
940 done = 1; 1066 done = 1;
1067 break;
1068 }
1069
941 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1070 if (wbc->nonblocking && bdi_write_congested(bdi)) {
942 wbc->encountered_congestion = 1; 1071 wbc->encountered_congestion = 1;
943 done = 1; 1072 done = 1;
1073 break;
944 } 1074 }
945 } 1075 }
946 pagevec_release(&pvec); 1076 pagevec_release(&pvec);
947 cond_resched(); 1077 cond_resched();
948 } 1078 }
949 if (!scanned && !done) { 1079 if (!cycled) {
950 /* 1080 /*
1081 * range_cyclic:
951 * We hit the last page and there is more work to be done: wrap 1082 * We hit the last page and there is more work to be done: wrap
952 * back to the start of the file 1083 * back to the start of the file
953 */ 1084 */
954 scanned = 1; 1085 cycled = 1;
955 index = 0; 1086 index = 0;
1087 end = writeback_index - 1;
956 goto retry; 1088 goto retry;
957 } 1089 }
958 if (!wbc->no_nrwrite_index_update) { 1090 if (!wbc->no_nrwrite_index_update) {
959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 1091 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index; 1092 mapping->writeback_index = done_index;
961 wbc->nr_to_write = nr_to_write; 1093 wbc->nr_to_write = nr_to_write;
962 } 1094 }
963 1095
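
For reference, a minimal userspace sketch of the threshold arithmetic get_dirty_limits() uses after this change: a byte-based sysctl, when non-zero, overrides the corresponding ratio, and the background threshold is clamped below the dirty threshold. The page size and the sample figures below are assumptions for illustration, not kernel values.

#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed page size for the example */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static void dirty_limits(unsigned long available_memory,	/* in pages */
			 unsigned long vm_dirty_bytes, int vm_dirty_ratio,
			 unsigned long dirty_background_bytes,
			 int dirty_background_ratio,
			 unsigned long *pdirty, unsigned long *pbackground)
{
	unsigned long dirty, background;

	if (vm_dirty_bytes)
		dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
	else {
		int ratio = vm_dirty_ratio < 5 ? 5 : vm_dirty_ratio;
		dirty = ratio * available_memory / 100;
	}

	if (dirty_background_bytes)
		background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
	else
		background = dirty_background_ratio * available_memory / 100;

	if (background >= dirty)	/* background must stay below dirty */
		background = dirty / 2;

	*pdirty = dirty;
	*pbackground = background;
}

int main(void)
{
	unsigned long dirty, background;

	/* 1 GiB of dirtyable memory, dirty limit pinned at 64 MiB by bytes */
	dirty_limits(262144, 64UL << 20, 10, 0, 5, &dirty, &background);
	printf("dirty=%lu pages, background=%lu pages\n", dirty, background);
	return 0;
}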
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac01474563..5675b3073854 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
69 69
70unsigned long totalram_pages __read_mostly; 70unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 71unsigned long totalreserve_pages __read_mostly;
72long nr_swap_pages; 72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 73int percpu_pagelist_fraction;
74 74
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 static unsigned long resume;
227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 static unsigned long nr_shown;
228 current->comm, page, (int)(2*sizeof(unsigned long)), 228 static unsigned long nr_unshown;
229 (unsigned long)page->flags, page->mapping, 229
230 page_mapcount(page), page_count(page)); 230 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second.
233 */
234 if (nr_shown == 60) {
235 if (time_before(jiffies, resume)) {
236 nr_unshown++;
237 goto out;
238 }
239 if (nr_unshown) {
240 printk(KERN_ALERT
241 "BUG: Bad page state: %lu messages suppressed\n",
242 nr_unshown);
243 nr_unshown = 0;
244 }
245 nr_shown = 0;
246 }
247 if (nr_shown++ == 0)
248 resume = jiffies + 60 * HZ;
249
250 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
251 current->comm, page_to_pfn(page));
252 printk(KERN_ALERT
253 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
254 page, (void *)page->flags, page_count(page),
255 page_mapcount(page), page->mapping, page->index);
231 256
232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
233 KERN_EMERG "Backtrace:\n");
234 dump_stack(); 257 dump_stack();
235 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 258out:
236 set_page_count(page, 0); 259 /* Leave bad fields for debug, except PageBuddy could make trouble */
237 reset_page_mapcount(page); 260 __ClearPageBuddy(page);
238 page->mapping = NULL;
239 add_taint(TAINT_BAD_PAGE); 261 add_taint(TAINT_BAD_PAGE);
240} 262}
241 263
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
292} 314}
293#endif 315#endif
294 316
295static void destroy_compound_page(struct page *page, unsigned long order) 317static int destroy_compound_page(struct page *page, unsigned long order)
296{ 318{
297 int i; 319 int i;
298 int nr_pages = 1 << order; 320 int nr_pages = 1 << order;
321 int bad = 0;
299 322
300 if (unlikely(compound_order(page) != order)) 323 if (unlikely(compound_order(page) != order) ||
324 unlikely(!PageHead(page))) {
301 bad_page(page); 325 bad_page(page);
326 bad++;
327 }
302 328
303 if (unlikely(!PageHead(page)))
304 bad_page(page);
305 __ClearPageHead(page); 329 __ClearPageHead(page);
330
306 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
307 struct page *p = page + i; 332 struct page *p = page + i;
308 333
309 if (unlikely(!PageTail(p) | 334 if (unlikely(!PageTail(p) | (p->first_page != page))) {
310 (p->first_page != page)))
311 bad_page(page); 335 bad_page(page);
336 bad++;
337 }
312 __ClearPageTail(p); 338 __ClearPageTail(p);
313 } 339 }
340
341 return bad;
314} 342}
315 343
316static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 344static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
430 int migratetype = get_pageblock_migratetype(page); 458 int migratetype = get_pageblock_migratetype(page);
431 459
432 if (unlikely(PageCompound(page))) 460 if (unlikely(PageCompound(page)))
433 destroy_compound_page(page, order); 461 if (unlikely(destroy_compound_page(page, order)))
462 return;
434 463
435 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
436 465
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
467 if (unlikely(page_mapcount(page) | 496 if (unlikely(page_mapcount(page) |
468 (page->mapping != NULL) | 497 (page->mapping != NULL) |
469 (page_count(page) != 0) | 498 (page_count(page) != 0) |
470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
471 bad_page(page); 500 bad_page(page);
472 if (PageDirty(page)) 501 return 1;
473 __ClearPageDirty(page); 502 }
474 if (PageSwapBacked(page)) 503 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
475 __ClearPageSwapBacked(page); 504 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
476 /* 505 return 0;
477 * For now, we report if PG_reserved was found set, but do not
478 * clear it, and do not free the page. But we shall soon need
479 * to do more, for when the ZERO_PAGE count wraps negative.
480 */
481 return PageReserved(page);
482} 506}
483 507
484/* 508/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
523{ 547{
524 unsigned long flags; 548 unsigned long flags;
525 int i; 549 int i;
526 int reserved = 0; 550 int bad = 0;
527 551
528 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
529 reserved += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
530 if (reserved) 554 if (bad)
531 return; 555 return;
532 556
533 if (!PageHighMem(page)) { 557 if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
612 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
613 (page->mapping != NULL) | 637 (page->mapping != NULL) |
614 (page_count(page) != 0) | 638 (page_count(page) != 0) |
615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
616 bad_page(page); 640 bad_page(page);
617
618 /*
619 * For now, we report if PG_reserved was found set, but do not
620 * clear it, and do not allocate the page: as a safety net.
621 */
622 if (PageReserved(page))
623 return 1; 641 return 1;
642 }
624 643
625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
626 1 << PG_referenced | 1 << PG_arch_1 |
627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
632 set_page_private(page, 0); 644 set_page_private(page, 0);
633 set_page_refcounted(page); 645 set_page_refcounted(page);
634 646
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2609 unsigned long pfn; 2621 unsigned long pfn;
2610 struct zone *z; 2622 struct zone *z;
2611 2623
2624 if (highest_memmap_pfn < end_pfn - 1)
2625 highest_memmap_pfn = end_pfn - 1;
2626
2612 z = &NODE_DATA(nid)->node_zones[zone]; 2627 z = &NODE_DATA(nid)->node_zones[zone];
2613 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2628 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2614 /* 2629 /*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3381{ 3396{
3382 unsigned long usemapsize = usemap_size(zonesize); 3397 unsigned long usemapsize = usemap_size(zonesize);
3383 zone->pageblock_flags = NULL; 3398 zone->pageblock_flags = NULL;
3384 if (usemapsize) { 3399 if (usemapsize)
3385 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3400 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3386 memset(zone->pageblock_flags, 0, usemapsize);
3387 }
3388} 3401}
3389#else 3402#else
3390static void inline setup_usemap(struct pglist_data *pgdat, 3403static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3482 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3470 if (realsize >= memmap_pages) { 3483 if (realsize >= memmap_pages) {
3471 realsize -= memmap_pages; 3484 realsize -= memmap_pages;
3472 printk(KERN_DEBUG 3485 if (memmap_pages)
3473 " %s zone: %lu pages used for memmap\n", 3486 printk(KERN_DEBUG
3474 zone_names[j], memmap_pages); 3487 " %s zone: %lu pages used for memmap\n",
3488 zone_names[j], memmap_pages);
3475 } else 3489 } else
3476 printk(KERN_WARNING 3490 printk(KERN_WARNING
3477 " %s zone: %lu pages exceeds realsize %lu\n", 3491 " %s zone: %lu pages exceeds realsize %lu\n",
@@ -3509,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3509 INIT_LIST_HEAD(&zone->lru[l].list); 3523 INIT_LIST_HEAD(&zone->lru[l].list);
3510 zone->lru[l].nr_scan = 0; 3524 zone->lru[l].nr_scan = 0;
3511 } 3525 }
3512 zone->recent_rotated[0] = 0; 3526 zone->reclaim_stat.recent_rotated[0] = 0;
3513 zone->recent_rotated[1] = 0; 3527 zone->reclaim_stat.recent_rotated[1] = 0;
3514 zone->recent_scanned[0] = 0; 3528 zone->reclaim_stat.recent_scanned[0] = 0;
3515 zone->recent_scanned[1] = 0; 3529 zone->reclaim_stat.recent_scanned[1] = 0;
3516 zap_zone_vm_stats(zone); 3530 zap_zone_vm_stats(zone);
3517 zone->flags = 0; 3531 zone->flags = 0;
3518 if (!size) 3532 if (!size)
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
4316 * 1TB 101 10GB 4330 * 1TB 101 10GB
4317 * 10TB 320 32GB 4331 * 10TB 320 32GB
4318 */ 4332 */
4319void setup_per_zone_inactive_ratio(void) 4333static void setup_per_zone_inactive_ratio(void)
4320{ 4334{
4321 struct zone *zone; 4335 struct zone *zone;
4322 4336
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 return table; 4587 return table;
4574} 4588}
4575 4589
4576#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4577struct page *pfn_to_page(unsigned long pfn)
4578{
4579 return __pfn_to_page(pfn);
4580}
4581unsigned long page_to_pfn(struct page *page)
4582{
4583 return __page_to_pfn(page);
4584}
4585EXPORT_SYMBOL(pfn_to_page);
4586EXPORT_SYMBOL(page_to_pfn);
4587#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4588
4589/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4590/* Return a pointer to the bitmap storing bits affecting a block of pages */
4590static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4591static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4591 unsigned long pfn) 4592 unsigned long pfn)
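
The reworked bad_page() throttles its reports: a burst of up to 60 messages, then silence (with a suppressed-message count) until the minute window expires. Below is a rough userspace sketch of that burst-then-drip pattern, using time() and printf() in place of jiffies and printk().

#include <stdio.h>
#include <time.h>

static void report_bad_page(unsigned long pfn)
{
	static time_t resume;
	static unsigned long nr_shown, nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {		/* still inside the quiet window */
			nr_unshown++;
			return;
		}
		if (nr_unshown) {
			printf("BUG: Bad page state: %lu messages suppressed\n",
			       nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;		/* open a fresh one-minute window */

	printf("BUG: Bad page state, pfn:%05lx\n", pfn);
}

int main(void)
{
	for (unsigned long pfn = 0; pfn < 200; pfn++)
		report_bad_page(pfn);		/* only the first 60 are printed */
	return 0;
}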
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff750519..7006a11350c8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -8,6 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/vmalloc.h> 9#include <linux/vmalloc.h>
10#include <linux/cgroup.h> 10#include <linux/cgroup.h>
11#include <linux/swapops.h>
11 12
12static void __meminit 13static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) 14__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
15 pc->flags = 0; 16 pc->flags = 0;
16 pc->mem_cgroup = NULL; 17 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn); 18 pc->page = pfn_to_page(pfn);
19 INIT_LIST_HEAD(&pc->lru);
18} 20}
19static unsigned long total_usage; 21static unsigned long total_usage;
20 22
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void)
72 74
73 int nid, fail; 75 int nid, fail;
74 76
75 if (mem_cgroup_subsys.disabled) 77 if (mem_cgroup_disabled())
76 return; 78 return;
77 79
78 for_each_online_node(nid) { 80 for_each_online_node(nid) {
@@ -101,15 +103,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
101} 103}
102 104
103/* __alloc_bootmem...() is protected by !slab_available() */ 105/* __alloc_bootmem...() is protected by !slab_available() */
104int __init_refok init_section_page_cgroup(unsigned long pfn) 106static int __init_refok init_section_page_cgroup(unsigned long pfn)
105{ 107{
106 struct mem_section *section; 108 struct mem_section *section = __pfn_to_section(pfn);
107 struct page_cgroup *base, *pc; 109 struct page_cgroup *base, *pc;
108 unsigned long table_size; 110 unsigned long table_size;
109 int nid, index; 111 int nid, index;
110 112
111 section = __pfn_to_section(pfn);
112
113 if (!section->page_cgroup) { 113 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 114 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
@@ -145,7 +145,6 @@ int __init_refok init_section_page_cgroup(unsigned long pfn)
145 __init_page_cgroup(pc, pfn + index); 145 __init_page_cgroup(pc, pfn + index);
146 } 146 }
147 147
148 section = __pfn_to_section(pfn);
149 section->page_cgroup = base - pfn; 148 section->page_cgroup = base - pfn;
150 total_usage += table_size; 149 total_usage += table_size;
151 return 0; 150 return 0;
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void)
248 unsigned long pfn; 247 unsigned long pfn;
249 int fail = 0; 248 int fail = 0;
250 249
251 if (mem_cgroup_subsys.disabled) 250 if (mem_cgroup_disabled())
252 return; 251 return;
253 252
254 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 253 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
273} 272}
274 273
275#endif 274#endif
275
276
277#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
278
279static DEFINE_MUTEX(swap_cgroup_mutex);
280struct swap_cgroup_ctrl {
281 struct page **map;
282 unsigned long length;
283};
284
285struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
286
287/*
 288 * These 8 bytes seem big; maybe we can reduce this once we can use an "id" for
 289 * the cgroup rather than a pointer.
290 */
291struct swap_cgroup {
292 struct mem_cgroup *val;
293};
294#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
295#define SC_POS_MASK (SC_PER_PAGE - 1)
296
297/*
298 * SwapCgroup implements "lookup" and "exchange" operations.
299 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
300 * against SwapCache. At swap_free(), this is accessed directly from swap.
301 *
302 * This means,
303 * - we have no race in "exchange" when we're accessed via SwapCache because
304 * SwapCache(and its swp_entry) is under lock.
305 * - When called via swap_free(), there is no user of this entry and no race.
306 * Then, we don't need lock around "exchange".
307 *
308 * TODO: we can push these buffers out to HIGHMEM.
309 */
310
311/*
312 * allocate buffer for swap_cgroup.
313 */
314static int swap_cgroup_prepare(int type)
315{
316 struct page *page;
317 struct swap_cgroup_ctrl *ctrl;
318 unsigned long idx, max;
319
320 if (!do_swap_account)
321 return 0;
322 ctrl = &swap_cgroup_ctrl[type];
323
324 for (idx = 0; idx < ctrl->length; idx++) {
325 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
326 if (!page)
327 goto not_enough_page;
328 ctrl->map[idx] = page;
329 }
330 return 0;
331not_enough_page:
332 max = idx;
333 for (idx = 0; idx < max; idx++)
334 __free_page(ctrl->map[idx]);
335
336 return -ENOMEM;
337}
338
339/**
340 * swap_cgroup_record - record mem_cgroup for this swp_entry.
341 * @ent: swap entry to be recorded into
342 * @mem: mem_cgroup to be recorded
343 *
 344 * Returns the old value on success, NULL on failure.
345 * (Of course, old value can be NULL.)
346 */
347struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
348{
349 int type = swp_type(ent);
350 unsigned long offset = swp_offset(ent);
351 unsigned long idx = offset / SC_PER_PAGE;
352 unsigned long pos = offset & SC_POS_MASK;
353 struct swap_cgroup_ctrl *ctrl;
354 struct page *mappage;
355 struct swap_cgroup *sc;
356 struct mem_cgroup *old;
357
358 if (!do_swap_account)
359 return NULL;
360
361 ctrl = &swap_cgroup_ctrl[type];
362
363 mappage = ctrl->map[idx];
364 sc = page_address(mappage);
365 sc += pos;
366 old = sc->val;
367 sc->val = mem;
368
369 return old;
370}
371
372/**
373 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
374 * @ent: swap entry to be looked up.
375 *
 376 * Returns a pointer to the mem_cgroup on success, NULL on failure.
377 */
378struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
379{
380 int type = swp_type(ent);
381 unsigned long offset = swp_offset(ent);
382 unsigned long idx = offset / SC_PER_PAGE;
383 unsigned long pos = offset & SC_POS_MASK;
384 struct swap_cgroup_ctrl *ctrl;
385 struct page *mappage;
386 struct swap_cgroup *sc;
387 struct mem_cgroup *ret;
388
389 if (!do_swap_account)
390 return NULL;
391
392 ctrl = &swap_cgroup_ctrl[type];
393 mappage = ctrl->map[idx];
394 sc = page_address(mappage);
395 sc += pos;
396 ret = sc->val;
397 return ret;
398}
399
400int swap_cgroup_swapon(int type, unsigned long max_pages)
401{
402 void *array;
403 unsigned long array_size;
404 unsigned long length;
405 struct swap_cgroup_ctrl *ctrl;
406
407 if (!do_swap_account)
408 return 0;
409
410 length = ((max_pages/SC_PER_PAGE) + 1);
411 array_size = length * sizeof(void *);
412
413 array = vmalloc(array_size);
414 if (!array)
415 goto nomem;
416
417 memset(array, 0, array_size);
418 ctrl = &swap_cgroup_ctrl[type];
419 mutex_lock(&swap_cgroup_mutex);
420 ctrl->length = length;
421 ctrl->map = array;
422 if (swap_cgroup_prepare(type)) {
423 /* memory shortage */
424 ctrl->map = NULL;
425 ctrl->length = 0;
426 vfree(array);
427 mutex_unlock(&swap_cgroup_mutex);
428 goto nomem;
429 }
430 mutex_unlock(&swap_cgroup_mutex);
431
432 printk(KERN_INFO
433 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
434 " and %ld bytes to hold mem_cgroup pointers on swap\n",
435 array_size, length * PAGE_SIZE);
436 printk(KERN_INFO
437 "swap_cgroup can be disabled by noswapaccount boot option.\n");
438
439 return 0;
440nomem:
441 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
442 printk(KERN_INFO
443 "swap_cgroup can be disabled by noswapaccount boot option\n");
444 return -ENOMEM;
445}
446
447void swap_cgroup_swapoff(int type)
448{
449 int i;
450 struct swap_cgroup_ctrl *ctrl;
451
452 if (!do_swap_account)
453 return;
454
455 mutex_lock(&swap_cgroup_mutex);
456 ctrl = &swap_cgroup_ctrl[type];
457 if (ctrl->map) {
458 for (i = 0; i < ctrl->length; i++) {
459 struct page *page = ctrl->map[i];
460 if (page)
461 __free_page(page);
462 }
463 vfree(ctrl->map);
464 ctrl->map = NULL;
465 ctrl->length = 0;
466 }
467 mutex_unlock(&swap_cgroup_mutex);
468}
469
470#endif
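
The new swap_cgroup map is a simple two-level structure: an array of pages per swap type, each page holding a fixed number of per-entry slots. A small sketch of the index arithmetic follows, assuming a 4096-byte page; the mask form of the slot lookup only works because SC_PER_PAGE is a power of two.

#include <stdio.h>

struct swap_cgroup {
	void *val;			/* stands in for the mem_cgroup pointer */
};

#define PAGE_SIZE	4096UL
#define SC_PER_PAGE	(PAGE_SIZE / sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

int main(void)
{
	unsigned long offset = 123456;			/* swp_offset() of some entry */
	unsigned long idx = offset / SC_PER_PAGE;	/* which page in ctrl->map[] */
	unsigned long pos = offset & SC_POS_MASK;	/* slot within that page */

	printf("offset %lu -> map page %lu, slot %lu of %lu\n",
	       offset, idx, pos, (unsigned long)SC_PER_PAGE);
	return 0;
}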
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c4480eaf0..dc6ce0afbded 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (try_to_free_swap(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/pdflush.c b/mm/pdflush.c
index a0a14c4d5072..15de509b68fd 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -172,7 +172,16 @@ static int __pdflush(struct pdflush_work *my_work)
172static int pdflush(void *dummy) 172static int pdflush(void *dummy)
173{ 173{
174 struct pdflush_work my_work; 174 struct pdflush_work my_work;
175 cpumask_t cpus_allowed; 175 cpumask_var_t cpus_allowed;
176
177 /*
178 * Since the caller doesn't even check kthread_run() worked, let's not
179 * freak out too much if this fails.
180 */
181 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
182 printk(KERN_WARNING "pdflush failed to allocate cpumask\n");
183 return 0;
184 }
176 185
177 /* 186 /*
178 * pdflush can spend a lot of time doing encryption via dm-crypt. We 187 * pdflush can spend a lot of time doing encryption via dm-crypt. We
@@ -187,8 +196,9 @@ static int pdflush(void *dummy)
187 * This is needed as pdflush's are dynamically created and destroyed. 196 * This is needed as pdflush's are dynamically created and destroyed.
188 * The boottime pdflush's are easily placed w/o these 2 lines. 197 * The boottime pdflush's are easily placed w/o these 2 lines.
189 */ 198 */
190 cpuset_cpus_allowed(current, &cpus_allowed); 199 cpuset_cpus_allowed(current, cpus_allowed);
191 set_cpus_allowed_ptr(current, &cpus_allowed); 200 set_cpus_allowed_ptr(current, cpus_allowed);
201 free_cpumask_var(cpus_allowed);
192 202
193 return __pdflush(&my_work); 203 return __pdflush(&my_work);
194} 204}
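
The pdflush change moves the cpumask off the thread's stack onto the heap and tolerates allocation failure. Here is a hedged userspace analogue of that allocate-check-use-free pattern; NR_CPUS, MASK_BYTES and apply_affinity() are stand-ins, not kernel API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NR_CPUS 4096
#define MASK_BYTES (NR_CPUS / 8)

static void apply_affinity(const unsigned char *mask)
{
	/* placeholder for a set_cpus_allowed_ptr()-style consumer of the mask */
	(void)mask;
}

int main(void)
{
	unsigned char *cpus_allowed = calloc(1, MASK_BYTES);

	if (!cpus_allowed) {
		fprintf(stderr, "failed to allocate cpumask\n");
		return 0;	/* degrade gracefully instead of crashing */
	}

	memset(cpus_allowed, 0xff, MASK_BYTES);	/* pretend all CPUs are allowed */
	apply_affinity(cpus_allowed);
	free(cpus_allowed);			/* mask is not needed past this point */
	return 0;
}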
diff --git a/mm/rmap.c b/mm/rmap.c
index 10993942d6c9..ac4af8cffbf9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 50#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h> 51#include <linux/mmu_notifier.h>
52#include <linux/migrate.h>
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 193 */
194struct anon_vma *page_lock_anon_vma(struct page *page) 194static struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 195{
196 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
211 return NULL; 211 return NULL;
212} 212}
213 213
214void page_unlock_anon_vma(struct anon_vma *anon_vma) 214static void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 215{
216 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
359 goto out_unmap; 359 goto out_unmap;
360 } 360 }
361 361
362 if (ptep_clear_flush_young_notify(vma, address, pte)) 362 if (ptep_clear_flush_young_notify(vma, address, pte)) {
363 referenced++; 363 /*
364 * Don't treat a reference through a sequentially read
365 * mapping as such. If the page has been used in
366 * another mapping, we will catch it; if this other
367 * mapping is already gone, the unmap path will have
368 * set PG_referenced or activated the page.
369 */
370 if (likely(!VM_SequentialReadHint(vma)))
371 referenced++;
372 }
364 373
365 /* Pretend the page is referenced if the task has the 374 /* Pretend the page is referenced if the task has the
366 swap token and is in the middle of a page fault. */ 375 swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
661void page_add_new_anon_rmap(struct page *page, 670void page_add_new_anon_rmap(struct page *page,
662 struct vm_area_struct *vma, unsigned long address) 671 struct vm_area_struct *vma, unsigned long address)
663{ 672{
664 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 673 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
665 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 674 SetPageSwapBacked(page);
675 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
666 __page_set_anon_rmap(page, vma, address); 676 __page_set_anon_rmap(page, vma, address);
677 if (page_evictable(page, vma))
678 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
679 else
680 add_page_to_unevictable_list(page);
667} 681}
668 682
669/** 683/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
693 */ 707 */
694void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) 708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
695{ 709{
696 BUG_ON(page_mapcount(page) == 0);
697 if (PageAnon(page)) 710 if (PageAnon(page))
698 __page_check_anon_rmap(page, vma, address); 711 __page_check_anon_rmap(page, vma, address);
699 atomic_inc(&page->_mapcount); 712 atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
703/** 716/**
704 * page_remove_rmap - take down pte mapping from a page 717 * page_remove_rmap - take down pte mapping from a page
705 * @page: page to remove mapping from 718 * @page: page to remove mapping from
706 * @vma: the vm area in which the mapping is removed
707 * 719 *
708 * The caller needs to hold the pte lock. 720 * The caller needs to hold the pte lock.
709 */ 721 */
710void page_remove_rmap(struct page *page, struct vm_area_struct *vma) 722void page_remove_rmap(struct page *page)
711{ 723{
712 if (atomic_add_negative(-1, &page->_mapcount)) { 724 if (atomic_add_negative(-1, &page->_mapcount)) {
713 if (unlikely(page_mapcount(page) < 0)) {
714 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
715 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
716 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
717 printk (KERN_EMERG " page->count = %x\n", page_count(page));
718 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
719 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
720 if (vma->vm_ops) {
721 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
722 }
723 if (vma->vm_file && vma->vm_file->f_op)
724 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
725 BUG();
726 }
727
728 /* 725 /*
729 * Now that the last pte has gone, s390 must transfer dirty 726 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip 727 * flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 spin_unlock(&mmlist_lock); 815 spin_unlock(&mmlist_lock);
819 } 816 }
820 dec_mm_counter(mm, anon_rss); 817 dec_mm_counter(mm, anon_rss);
821#ifdef CONFIG_MIGRATION 818 } else if (PAGE_MIGRATION) {
822 } else {
823 /* 819 /*
824 * Store the pfn of the page in a special migration 820 * Store the pfn of the page in a special migration
825 * pte. do_swap_page() will wait until the migration 821 * pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
827 */ 823 */
828 BUG_ON(!migration); 824 BUG_ON(!migration);
829 entry = make_migration_entry(page, pte_write(pteval)); 825 entry = make_migration_entry(page, pte_write(pteval));
830#endif
831 } 826 }
832 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
833 BUG_ON(pte_file(*pte)); 828 BUG_ON(pte_file(*pte));
834 } else 829 } else if (PAGE_MIGRATION && migration) {
835#ifdef CONFIG_MIGRATION
836 if (migration) {
837 /* Establish migration entry for a file page */ 830 /* Establish migration entry for a file page */
838 swp_entry_t entry; 831 swp_entry_t entry;
839 entry = make_migration_entry(page, pte_write(pteval)); 832 entry = make_migration_entry(page, pte_write(pteval));
840 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 833 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
841 } else 834 } else
842#endif
843 dec_mm_counter(mm, file_rss); 835 dec_mm_counter(mm, file_rss);
844 836
845 837
846 page_remove_rmap(page, vma); 838 page_remove_rmap(page);
847 page_cache_release(page); 839 page_cache_release(page);
848 840
849out_unmap: 841out_unmap:
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
958 if (pte_dirty(pteval)) 950 if (pte_dirty(pteval))
959 set_page_dirty(page); 951 set_page_dirty(page);
960 952
961 page_remove_rmap(page, vma); 953 page_remove_rmap(page);
962 page_cache_release(page); 954 page_cache_release(page);
963 dec_mm_counter(mm, file_rss); 955 dec_mm_counter(mm, file_rss);
964 (*mapcount)--; 956 (*mapcount)--;
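
In try_to_unmap_one() the #ifdef CONFIG_MIGRATION blocks are replaced by tests on the compile-time constant PAGE_MIGRATION, so the dead branch is still type-checked and then discarded by the compiler. A tiny sketch of the pattern; ENABLE_FEATURE is an illustrative stand-in for such a config-derived constant.

#include <stdio.h>

#define ENABLE_FEATURE 0	/* would come from the build configuration */

static void handle(int migration)
{
	if (ENABLE_FEATURE && migration)
		printf("install migration entry\n");	/* compiled out when 0 */
	else
		printf("drop mapping normally\n");
}

int main(void)
{
	handle(1);
	return 0;
}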
diff --git a/mm/shmem.c b/mm/shmem.c
index dd5588f5d939..75199888a6bd 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 * 16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
17 * This file is released under the GPL. 20 * This file is released under the GPL.
18 */ 21 */
19 22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/file.h>
28#include <linux/mm.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32static struct vfsmount *shm_mnt;
33
34#ifdef CONFIG_SHMEM
20/* 35/*
21 * This virtual memory filesystem is heavily based on the ramfs. It 36 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits 37 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem. 38 * which makes it a completely usable filesystem.
24 */ 39 */
25 40
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h> 41#include <linux/xattr.h>
30#include <linux/exportfs.h> 42#include <linux/exportfs.h>
31#include <linux/generic_acl.h> 43#include <linux/generic_acl.h>
32#include <linux/mm.h>
33#include <linux/mman.h> 44#include <linux/mman.h>
34#include <linux/file.h>
35#include <linux/swap.h>
36#include <linux/pagemap.h> 45#include <linux/pagemap.h>
37#include <linux/string.h> 46#include <linux/string.h>
38#include <linux/slab.h> 47#include <linux/slab.h>
39#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
40#include <linux/shmem_fs.h> 49#include <linux/shmem_fs.h>
41#include <linux/mount.h>
42#include <linux/writeback.h> 50#include <linux/writeback.h>
43#include <linux/vfs.h> 51#include <linux/vfs.h>
44#include <linux/blkdev.h> 52#include <linux/blkdev.h>
@@ -921,7 +929,11 @@ found:
921 error = 1; 929 error = 1;
922 if (!inode) 930 if (!inode)
923 goto out; 931 goto out;
924 /* Precharge page using GFP_KERNEL while we can wait */ 932 /*
933 * Charge page using GFP_KERNEL while we can wait.
 934 * Charged back to the user (not to the caller) when swap accounting is used.
935 * add_to_page_cache() will be called with GFP_NOWAIT.
936 */
925 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 937 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
926 if (error) 938 if (error)
927 goto out; 939 goto out;
@@ -1313,15 +1325,19 @@ repeat:
1313 } else { 1325 } else {
1314 shmem_swp_unmap(entry); 1326 shmem_swp_unmap(entry);
1315 spin_unlock(&info->lock); 1327 spin_unlock(&info->lock);
1316 unlock_page(swappage);
1317 page_cache_release(swappage);
1318 if (error == -ENOMEM) { 1328 if (error == -ENOMEM) {
1319 /* allow reclaim from this memory cgroup */ 1329 /* allow reclaim from this memory cgroup */
1320 error = mem_cgroup_shrink_usage(current->mm, 1330 error = mem_cgroup_shrink_usage(swappage,
1331 current->mm,
1321 gfp); 1332 gfp);
1322 if (error) 1333 if (error) {
1334 unlock_page(swappage);
1335 page_cache_release(swappage);
1323 goto failed; 1336 goto failed;
1337 }
1324 } 1338 }
1339 unlock_page(swappage);
1340 page_cache_release(swappage);
1325 goto repeat; 1341 goto repeat;
1326 } 1342 }
1327 } else if (sgp == SGP_READ && !filepage) { 1343 } else if (sgp == SGP_READ && !filepage) {
@@ -1372,7 +1388,7 @@ repeat:
1372 1388
1373 /* Precharge page while we can wait, compensate after */ 1389 /* Precharge page while we can wait, compensate after */
1374 error = mem_cgroup_cache_charge(filepage, current->mm, 1390 error = mem_cgroup_cache_charge(filepage, current->mm,
1375 gfp & ~__GFP_HIGHMEM); 1391 GFP_KERNEL);
1376 if (error) { 1392 if (error) {
1377 page_cache_release(filepage); 1393 page_cache_release(filepage);
1378 shmem_unacct_blocks(info->flags, 1); 1394 shmem_unacct_blocks(info->flags, 1);
@@ -1445,7 +1461,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1445 if (error) 1461 if (error)
1446 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1462 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1447 1463
1448 mark_page_accessed(vmf->page);
1449 return ret | VM_FAULT_LOCKED; 1464 return ret | VM_FAULT_LOCKED;
1450} 1465}
1451 1466
@@ -2487,7 +2502,6 @@ static struct file_system_type tmpfs_fs_type = {
2487 .get_sb = shmem_get_sb, 2502 .get_sb = shmem_get_sb,
2488 .kill_sb = kill_litter_super, 2503 .kill_sb = kill_litter_super,
2489}; 2504};
2490static struct vfsmount *shm_mnt;
2491 2505
2492static int __init init_tmpfs(void) 2506static int __init init_tmpfs(void)
2493{ 2507{
@@ -2526,7 +2540,51 @@ out4:
2526 shm_mnt = ERR_PTR(error); 2540 shm_mnt = ERR_PTR(error);
2527 return error; 2541 return error;
2528} 2542}
2529module_init(init_tmpfs) 2543
2544#else /* !CONFIG_SHMEM */
2545
2546/*
2547 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2548 *
 2549 * This is intended for small systems where the benefits of the full
2550 * shmem code (swap-backed and resource-limited) are outweighed by
2551 * their complexity. On systems without swap this code should be
2552 * effectively equivalent, but much lighter weight.
2553 */
2554
2555#include <linux/ramfs.h>
2556
2557static struct file_system_type tmpfs_fs_type = {
2558 .name = "tmpfs",
2559 .get_sb = ramfs_get_sb,
2560 .kill_sb = kill_litter_super,
2561};
2562
2563static int __init init_tmpfs(void)
2564{
2565 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2566
2567 shm_mnt = kern_mount(&tmpfs_fs_type);
2568 BUG_ON(IS_ERR(shm_mnt));
2569
2570 return 0;
2571}
2572
2573int shmem_unuse(swp_entry_t entry, struct page *page)
2574{
2575 return 0;
2576}
2577
2578#define shmem_file_operations ramfs_file_operations
2579#define shmem_vm_ops generic_file_vm_ops
2580#define shmem_get_inode ramfs_get_inode
2581#define shmem_acct_size(a, b) 0
2582#define shmem_unacct_size(a, b) do {} while (0)
2583#define SHMEM_MAX_BYTES LLONG_MAX
2584
2585#endif /* CONFIG_SHMEM */
2586
2587/* common code */
2530 2588
2531/** 2589/**
2532 * shmem_file_setup - get an unlinked file living in tmpfs 2590 * shmem_file_setup - get an unlinked file living in tmpfs
@@ -2570,12 +2628,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2570 if (!inode) 2628 if (!inode)
2571 goto close_file; 2629 goto close_file;
2572 2630
2573 SHMEM_I(inode)->flags = flags & VM_ACCOUNT; 2631#ifdef CONFIG_SHMEM
2632 SHMEM_I(inode)->flags = (flags & VM_NORESERVE) ? 0 : VM_ACCOUNT;
2633#endif
2574 d_instantiate(dentry, inode); 2634 d_instantiate(dentry, inode);
2575 inode->i_size = size; 2635 inode->i_size = size;
2576 inode->i_nlink = 0; /* It is unlinked */ 2636 inode->i_nlink = 0; /* It is unlinked */
2577 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, 2637 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2578 &shmem_file_operations); 2638 &shmem_file_operations);
2639
2640#ifndef CONFIG_MMU
2641 error = ramfs_nommu_expand_for_mapping(inode, size);
2642 if (error)
2643 goto close_file;
2644#endif
2579 return file; 2645 return file;
2580 2646
2581close_file: 2647close_file:
@@ -2608,3 +2674,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2608 vma->vm_ops = &shmem_vm_ops; 2674 vma->vm_ops = &shmem_vm_ops;
2609 return 0; 2675 return 0;
2610} 2676}
2677
2678module_init(init_tmpfs)
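
With tiny-shmem folded into shmem.c, the !CONFIG_SHMEM build keeps the same entry points by #defining them to ramfs equivalents, so the common code at the bottom of the file compiles either way. A minimal sketch of that aliasing pattern, with hypothetical names rather than the real kernel symbols:

#include <stdio.h>

#define HAVE_FULL_SHMEM 0	/* flip to 1 to select the full implementation */

#if HAVE_FULL_SHMEM
static int full_setup(long size)  { printf("swap-backed file, %ld bytes\n", size); return 0; }
#define shmem_setup full_setup
#else
static int ramfs_setup(long size) { printf("ramfs-backed file, %ld bytes\n", size); return 0; }
#define shmem_setup ramfs_setup
#endif

/* common code: callers use one name regardless of the configuration */
int main(void)
{
	return shmem_setup(4096);
}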
diff --git a/mm/slab.c b/mm/slab.c
index f97e564bdf11..ddc41f337d58 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2157,7 +2157,7 @@ kmem_cache_create (const char *name, size_t size, size_t align,
2157 2157
2158 /* 2158 /*
2159 * We use cache_chain_mutex to ensure a consistent view of 2159 * We use cache_chain_mutex to ensure a consistent view of
2160 * cpu_online_map as well. Please see cpuup_callback 2160 * cpu_online_mask as well. Please see cpuup_callback
2161 */ 2161 */
2162 get_online_cpus(); 2162 get_online_cpus();
2163 mutex_lock(&cache_chain_mutex); 2163 mutex_lock(&cache_chain_mutex);
diff --git a/mm/slub.c b/mm/slub.c
index 6cb7ad107852..bdc9abb08a23 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1970,7 +1970,7 @@ static DEFINE_PER_CPU(struct kmem_cache_cpu,
1970 kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; 1970 kmem_cache_cpu)[NR_KMEM_CACHE_CPU];
1971 1971
1972static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); 1972static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free);
1973static cpumask_t kmem_cach_cpu_free_init_once = CPU_MASK_NONE; 1973static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS);
1974 1974
1975static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, 1975static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1976 int cpu, gfp_t flags) 1976 int cpu, gfp_t flags)
@@ -1996,7 +1996,7 @@ static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s,
1996static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) 1996static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu)
1997{ 1997{
1998 if (c < per_cpu(kmem_cache_cpu, cpu) || 1998 if (c < per_cpu(kmem_cache_cpu, cpu) ||
1999 c > per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { 1999 c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) {
2000 kfree(c); 2000 kfree(c);
2001 return; 2001 return;
2002 } 2002 }
@@ -2045,13 +2045,13 @@ static void init_alloc_cpu_cpu(int cpu)
2045{ 2045{
2046 int i; 2046 int i;
2047 2047
2048 if (cpu_isset(cpu, kmem_cach_cpu_free_init_once)) 2048 if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)))
2049 return; 2049 return;
2050 2050
2051 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) 2051 for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--)
2052 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); 2052 free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu);
2053 2053
2054 cpu_set(cpu, kmem_cach_cpu_free_init_once); 2054 cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once));
2055} 2055}
2056 2056
2057static void __init init_alloc_cpu(void) 2057static void __init init_alloc_cpu(void)
@@ -2254,7 +2254,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2254 * Add some empty padding so that we can catch 2254 * Add some empty padding so that we can catch
2255 * overwrites from earlier objects rather than let 2255 * overwrites from earlier objects rather than let
2256 * tracking information or the free pointer be 2256 * tracking information or the free pointer be
2257 * corrupted if an user writes before the start 2257 * corrupted if a user writes before the start
2258 * of the object. 2258 * of the object.
2259 */ 2259 */
2260 size += sizeof(void *); 2260 size += sizeof(void *);
@@ -3451,7 +3451,7 @@ struct location {
3451 long max_time; 3451 long max_time;
3452 long min_pid; 3452 long min_pid;
3453 long max_pid; 3453 long max_pid;
3454 cpumask_t cpus; 3454 DECLARE_BITMAP(cpus, NR_CPUS);
3455 nodemask_t nodes; 3455 nodemask_t nodes;
3456}; 3456};
3457 3457
@@ -3526,7 +3526,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3526 if (track->pid > l->max_pid) 3526 if (track->pid > l->max_pid)
3527 l->max_pid = track->pid; 3527 l->max_pid = track->pid;
3528 3528
3529 cpu_set(track->cpu, l->cpus); 3529 cpumask_set_cpu(track->cpu,
3530 to_cpumask(l->cpus));
3530 } 3531 }
3531 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3532 node_set(page_to_nid(virt_to_page(track)), l->nodes);
3532 return 1; 3533 return 1;
@@ -3556,8 +3557,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
3556 l->max_time = age; 3557 l->max_time = age;
3557 l->min_pid = track->pid; 3558 l->min_pid = track->pid;
3558 l->max_pid = track->pid; 3559 l->max_pid = track->pid;
3559 cpus_clear(l->cpus); 3560 cpumask_clear(to_cpumask(l->cpus));
3560 cpu_set(track->cpu, l->cpus); 3561 cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
3561 nodes_clear(l->nodes); 3562 nodes_clear(l->nodes);
3562 node_set(page_to_nid(virt_to_page(track)), l->nodes); 3563 node_set(page_to_nid(virt_to_page(track)), l->nodes);
3563 return 1; 3564 return 1;
@@ -3638,11 +3639,12 @@ static int list_locations(struct kmem_cache *s, char *buf,
3638 len += sprintf(buf + len, " pid=%ld", 3639 len += sprintf(buf + len, " pid=%ld",
3639 l->min_pid); 3640 l->min_pid);
3640 3641
3641 if (num_online_cpus() > 1 && !cpus_empty(l->cpus) && 3642 if (num_online_cpus() > 1 &&
3643 !cpumask_empty(to_cpumask(l->cpus)) &&
3642 len < PAGE_SIZE - 60) { 3644 len < PAGE_SIZE - 60) {
3643 len += sprintf(buf + len, " cpus="); 3645 len += sprintf(buf + len, " cpus=");
3644 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50, 3646 len += cpulist_scnprintf(buf + len, PAGE_SIZE - len - 50,
3645 l->cpus); 3647 to_cpumask(l->cpus));
3646 } 3648 }
3647 3649
3648 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) && 3650 if (num_online_nodes() > 1 && !nodes_empty(l->nodes) &&
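
The slub conversion replaces cpumask_t fields with DECLARE_BITMAP(..., NR_CPUS) plus to_cpumask(). A small userspace sketch of the underlying bitmap layout and bit arithmetic, assuming 64-bit longs; the macro names mirror the kernel's but are re-implemented here for illustration.

#include <stdio.h>
#include <limits.h>

#define NR_CPUS 64
#define BITS_PER_LONG (sizeof(long) * CHAR_BIT)
#define BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)
#define DECLARE_BITMAP(name, bits) unsigned long name[BITS_TO_LONGS(bits)]

static void bitmap_set_bit(unsigned long *map, unsigned int bit)
{
	map[bit / BITS_PER_LONG] |= 1UL << (bit % BITS_PER_LONG);
}

static int bitmap_test_bit(const unsigned long *map, unsigned int bit)
{
	return (map[bit / BITS_PER_LONG] >> (bit % BITS_PER_LONG)) & 1;
}

int main(void)
{
	DECLARE_BITMAP(cpus, NR_CPUS) = { 0 };

	bitmap_set_bit(cpus, 3);
	printf("cpu 3 set: %d, cpu 4 set: %d\n",
	       bitmap_test_bit(cpus, 3), bitmap_test_bit(cpus, 4));
	return 0;
}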
diff --git a/mm/swap.c b/mm/swap.c
index b135ec90cdeb..8adb9feb61e1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -151,6 +151,26 @@ void rotate_reclaimable_page(struct page *page)
151 } 151 }
152} 152}
153 153
154static void update_page_reclaim_stat(struct zone *zone, struct page *page,
155 int file, int rotated)
156{
157 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
158 struct zone_reclaim_stat *memcg_reclaim_stat;
159
160 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
161
162 reclaim_stat->recent_scanned[file]++;
163 if (rotated)
164 reclaim_stat->recent_rotated[file]++;
165
166 if (!memcg_reclaim_stat)
167 return;
168
169 memcg_reclaim_stat->recent_scanned[file]++;
170 if (rotated)
171 memcg_reclaim_stat->recent_rotated[file]++;
172}
173
154/* 174/*
155 * FIXME: speed this up? 175 * FIXME: speed this up?
156 */ 176 */
@@ -168,10 +188,8 @@ void activate_page(struct page *page)
168 lru += LRU_ACTIVE; 188 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
170 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
171 mem_cgroup_move_lists(page, lru);
172 191
173 zone->recent_rotated[!!file]++; 192 update_page_reclaim_stat(zone, page, !!file, 1);
174 zone->recent_scanned[!!file]++;
175 } 193 }
176 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
177} 195}
@@ -246,25 +264,6 @@ void add_page_to_unevictable_list(struct page *page)
246 spin_unlock_irq(&zone->lru_lock); 264 spin_unlock_irq(&zone->lru_lock);
247} 265}
248 266
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
266}
267
268/* 267/*
269 * Drain pages out of the cpu's pagevecs. 268 * Drain pages out of the cpu's pagevecs.
270 * Either "cpu" is the current CPU, and preemption has already been 269 * Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +397,6 @@ void __pagevec_release(struct pagevec *pvec)
398EXPORT_SYMBOL(__pagevec_release); 397EXPORT_SYMBOL(__pagevec_release);
399 398
400/* 399/*
401 * pagevec_release() for pages which are known to not be on the LRU
402 *
403 * This function reinitialises the caller's pagevec.
404 */
405void __pagevec_release_nonlru(struct pagevec *pvec)
406{
407 int i;
408 struct pagevec pages_to_free;
409
410 pagevec_init(&pages_to_free, pvec->cold);
411 for (i = 0; i < pagevec_count(pvec); i++) {
412 struct page *page = pvec->pages[i];
413
414 VM_BUG_ON(PageLRU(page));
415 if (put_page_testzero(page))
416 pagevec_add(&pages_to_free, page);
417 }
418 pagevec_free(&pages_to_free);
419 pagevec_reinit(pvec);
420}
421
422/*
423 * Add the passed pages to the LRU, then drop the caller's refcount 400 * Add the passed pages to the LRU, then drop the caller's refcount
424 * on them. Reinitialises the caller's pagevec. 401 * on them. Reinitialises the caller's pagevec.
425 */ 402 */
@@ -427,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
427{ 404{
428 int i; 405 int i;
429 struct zone *zone = NULL; 406 struct zone *zone = NULL;
407
430 VM_BUG_ON(is_unevictable_lru(lru)); 408 VM_BUG_ON(is_unevictable_lru(lru));
431 409
432 for (i = 0; i < pagevec_count(pvec); i++) { 410 for (i = 0; i < pagevec_count(pvec); i++) {
433 struct page *page = pvec->pages[i]; 411 struct page *page = pvec->pages[i];
434 struct zone *pagezone = page_zone(page); 412 struct zone *pagezone = page_zone(page);
435 int file; 413 int file;
414 int active;
436 415
437 if (pagezone != zone) { 416 if (pagezone != zone) {
438 if (zone) 417 if (zone)
@@ -444,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
444 VM_BUG_ON(PageUnevictable(page)); 423 VM_BUG_ON(PageUnevictable(page));
445 VM_BUG_ON(PageLRU(page)); 424 VM_BUG_ON(PageLRU(page));
446 SetPageLRU(page); 425 SetPageLRU(page);
426 active = is_active_lru(lru);
447 file = is_file_lru(lru); 427 file = is_file_lru(lru);
448 zone->recent_scanned[file]++; 428 if (active)
449 if (is_active_lru(lru)) {
450 SetPageActive(page); 429 SetPageActive(page);
451 zone->recent_rotated[file]++; 430 update_page_reclaim_stat(zone, page, file, active);
452 }
453 add_page_to_lru_list(zone, page, lru); 431 add_page_to_lru_list(zone, page, lru);
454 } 432 }
455 if (zone) 433 if (zone)
@@ -495,8 +473,7 @@ void pagevec_swap_free(struct pagevec *pvec)
495 struct page *page = pvec->pages[i]; 473 struct page *page = pvec->pages[i];
496 474
497 if (PageSwapCache(page) && trylock_page(page)) { 475 if (PageSwapCache(page) && trylock_page(page)) {
498 if (PageSwapCache(page)) 476 try_to_free_swap(page);
499 remove_exclusive_swap_page_ref(page);
500 unlock_page(page); 477 unlock_page(page);
501 } 478 }
502 } 479 }
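
update_page_reclaim_stat() centralises the recent_scanned/recent_rotated bookkeeping and mirrors it into an optional per-memcg copy. A compact sketch of that bookkeeping under assumed structure names (not the kernel's):

#include <stdio.h>

struct reclaim_stat {
	unsigned long recent_scanned[2];	/* [0] = anon, [1] = file */
	unsigned long recent_rotated[2];
};

static void update_reclaim_stat(struct reclaim_stat *zone_stat,
				struct reclaim_stat *memcg_stat,
				int file, int rotated)
{
	zone_stat->recent_scanned[file]++;
	if (rotated)
		zone_stat->recent_rotated[file]++;

	if (!memcg_stat)		/* memcg accounting may be disabled */
		return;

	memcg_stat->recent_scanned[file]++;
	if (rotated)
		memcg_stat->recent_rotated[file]++;
}

int main(void)
{
	struct reclaim_stat zone = { {0, 0}, {0, 0} };

	update_reclaim_stat(&zone, NULL, 1, 1);	/* a rotated file page */
	update_reclaim_stat(&zone, NULL, 1, 0);	/* a scanned-only file page */
	printf("file scanned=%lu rotated=%lu\n",
	       zone.recent_scanned[1], zone.recent_rotated[1]);
	return 0;
}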
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c9029cef..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
17#include <linux/backing-dev.h> 17#include <linux/backing-dev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
20 21
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
@@ -72,10 +73,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72{ 73{
73 int error; 74 int error;
74 75
75 BUG_ON(!PageLocked(page)); 76 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 77 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 78 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 79
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
80 if (!error) { 81 if (!error) {
81 page_cache_get(page); 82 page_cache_get(page);
@@ -108,10 +109,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
108 */ 109 */
109void __delete_from_swap_cache(struct page *page) 110void __delete_from_swap_cache(struct page *page)
110{ 111{
111 BUG_ON(!PageLocked(page)); 112 swp_entry_t ent = {.val = page_private(page)};
112 BUG_ON(!PageSwapCache(page)); 113
113 BUG_ON(PageWriteback(page)); 114 VM_BUG_ON(!PageLocked(page));
114 BUG_ON(PagePrivate(page)); 115 VM_BUG_ON(!PageSwapCache(page));
116 VM_BUG_ON(PageWriteback(page));
115 117
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 118 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 119 set_page_private(page, 0);
@@ -119,6 +121,7 @@ void __delete_from_swap_cache(struct page *page)
119 total_swapcache_pages--; 121 total_swapcache_pages--;
120 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
121 INC_CACHE_INFO(del_total); 123 INC_CACHE_INFO(del_total);
124 mem_cgroup_uncharge_swapcache(page, ent);
122} 125}
123 126
124/** 127/**
@@ -129,13 +132,13 @@ void __delete_from_swap_cache(struct page *page)
129 * Allocate swap space for the page and add the page to the 132 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 133 * swap cache. Caller needs to hold the page lock.
131 */ 134 */
132int add_to_swap(struct page * page, gfp_t gfp_mask) 135int add_to_swap(struct page *page)
133{ 136{
134 swp_entry_t entry; 137 swp_entry_t entry;
135 int err; 138 int err;
136 139
137 BUG_ON(!PageLocked(page)); 140 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 141 VM_BUG_ON(!PageUptodate(page));
139 142
140 for (;;) { 143 for (;;) {
141 entry = get_swap_page(); 144 entry = get_swap_page();
@@ -154,7 +157,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
154 * Add it to the swap cache and mark it dirty 157 * Add it to the swap cache and mark it dirty
155 */ 158 */
156 err = add_to_swap_cache(page, entry, 159 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 160 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 161
159 switch (err) { 162 switch (err) {
160 case 0: /* Success */ 163 case 0: /* Success */
@@ -196,14 +199,14 @@ void delete_from_swap_cache(struct page *page)
196 * If we are the only user, then try to free up the swap cache. 199 * If we are the only user, then try to free up the swap cache.
197 * 200 *
 198 * It's ok to check for PageSwapCache without the page lock 201 * It's ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 202 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 203 * try_to_free_swap() _with_ the lock.
201 * - Marcelo 204 * - Marcelo
202 */ 205 */
203static inline void free_swap_cache(struct page *page) 206static inline void free_swap_cache(struct page *page)
204{ 207{
205 if (PageSwapCache(page) && trylock_page(page)) { 208 if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 209 try_to_free_swap(page);
207 unlock_page(page); 210 unlock_page(page);
208 } 211 }
209} 212}
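
The add_to_swap() change above drops the caller-supplied gfp_mask and fixes the allocation flags internally at __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN; the matching caller update appears in the mm/vmscan.c hunks further down. Before and after, at the shrink_page_list() call site:

/* before this patch: the caller chose the allocation flags */
if (!add_to_swap(page, GFP_ATOMIC))
	goto activate_locked;

/* after this patch: add_to_swap() picks its own flags */
if (!add_to_swap(page))
	goto activate_locked;
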
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..7e6304dfafab 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shm.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h>
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
@@ -32,9 +33,11 @@
32#include <asm/pgtable.h> 33#include <asm/pgtable.h>
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34#include <linux/swapops.h> 35#include <linux/swapops.h>
36#include <linux/page_cgroup.h>
35 37
36static DEFINE_SPINLOCK(swap_lock); 38static DEFINE_SPINLOCK(swap_lock);
37static unsigned int nr_swapfiles; 39static unsigned int nr_swapfiles;
40long nr_swap_pages;
38long total_swap_pages; 41long total_swap_pages;
39static int swap_overflow; 42static int swap_overflow;
40static int least_priority; 43static int least_priority;
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
83 up_read(&swap_unplug_sem); 86 up_read(&swap_unplug_sem);
84} 87}
85 88
89/*
 90 * swapon tells the device that all the old swap contents can be discarded,
91 * to allow the swap device to optimize its wear-levelling.
92 */
93static int discard_swap(struct swap_info_struct *si)
94{
95 struct swap_extent *se;
96 int err = 0;
97
98 list_for_each_entry(se, &si->extent_list, list) {
99 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
100 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
101
102 if (se->start_page == 0) {
103 /* Do not discard the swap header page! */
104 start_block += 1 << (PAGE_SHIFT - 9);
105 nr_blocks -= 1 << (PAGE_SHIFT - 9);
106 if (!nr_blocks)
107 continue;
108 }
109
110 err = blkdev_issue_discard(si->bdev, start_block,
111 nr_blocks, GFP_KERNEL);
112 if (err)
113 break;
114
115 cond_resched();
116 }
117 return err; /* That will often be -EOPNOTSUPP */
118}
119
120/*
 121 * swap allocation tells the device that a cluster of swap can now be discarded,
122 * to allow the swap device to optimize its wear-levelling.
123 */
124static void discard_swap_cluster(struct swap_info_struct *si,
125 pgoff_t start_page, pgoff_t nr_pages)
126{
127 struct swap_extent *se = si->curr_swap_extent;
128 int found_extent = 0;
129
130 while (nr_pages) {
131 struct list_head *lh;
132
133 if (se->start_page <= start_page &&
134 start_page < se->start_page + se->nr_pages) {
135 pgoff_t offset = start_page - se->start_page;
136 sector_t start_block = se->start_block + offset;
137 sector_t nr_blocks = se->nr_pages - offset;
138
139 if (nr_blocks > nr_pages)
140 nr_blocks = nr_pages;
141 start_page += nr_blocks;
142 nr_pages -= nr_blocks;
143
144 if (!found_extent++)
145 si->curr_swap_extent = se;
146
147 start_block <<= PAGE_SHIFT - 9;
148 nr_blocks <<= PAGE_SHIFT - 9;
149 if (blkdev_issue_discard(si->bdev, start_block,
150 nr_blocks, GFP_NOIO))
151 break;
152 }
153
154 lh = se->list.next;
155 if (lh == &si->extent_list)
156 lh = lh->next;
157 se = list_entry(lh, struct swap_extent, list);
158 }
159}
160
161static int wait_for_discard(void *word)
162{
163 schedule();
164 return 0;
165}
166
86#define SWAPFILE_CLUSTER 256 167#define SWAPFILE_CLUSTER 256
87#define LATENCY_LIMIT 256 168#define LATENCY_LIMIT 256
88 169
89static inline unsigned long scan_swap_map(struct swap_info_struct *si) 170static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90{ 171{
91 unsigned long offset, last_in_cluster; 172 unsigned long offset;
173 unsigned long scan_base;
174 unsigned long last_in_cluster = 0;
92 int latency_ration = LATENCY_LIMIT; 175 int latency_ration = LATENCY_LIMIT;
176 int found_free_cluster = 0;
93 177
94 /* 178 /*
95 * We try to cluster swap pages by allocating them sequentially 179 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 180 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 181 * way, however, we resort to first-free allocation, starting
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
99 * all over the entire swap partition, so that we reduce 183 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 184 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 185 * But we do now try to find an empty cluster. -Andrea
186 * And we let swap pages go all over an SSD partition. Hugh
102 */ 187 */
103 188
104 si->flags += SWP_SCANNING; 189 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 190 scan_base = offset = si->cluster_next;
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 191
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 192 if (unlikely(!si->cluster_nr--)) {
108 goto lowest; 193 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
194 si->cluster_nr = SWAPFILE_CLUSTER - 1;
195 goto checks;
196 }
197 if (si->flags & SWP_DISCARDABLE) {
198 /*
199 * Start range check on racing allocations, in case
200 * they overlap the cluster we eventually decide on
201 * (we scan without swap_lock to allow preemption).
202 * It's hardly conceivable that cluster_nr could be
203 * wrapped during our scan, but don't depend on it.
204 */
205 if (si->lowest_alloc)
206 goto checks;
207 si->lowest_alloc = si->max;
208 si->highest_alloc = 0;
209 }
109 spin_unlock(&swap_lock); 210 spin_unlock(&swap_lock);
110 211
111 offset = si->lowest_bit; 212 /*
213 * If seek is expensive, start searching for new cluster from
214 * start of partition, to minimize the span of allocated swap.
215 * But if seek is cheap, search from our current position, so
216 * that swap is allocated from all over the partition: if the
217 * Flash Translation Layer only remaps within limited zones,
218 * we don't want to wear out the first zone too quickly.
219 */
220 if (!(si->flags & SWP_SOLIDSTATE))
221 scan_base = offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 222 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 223
114 /* Locate the first empty (unaligned) cluster */ 224 /* Locate the first empty (unaligned) cluster */
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 227 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 228 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 229 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 230 offset -= SWAPFILE_CLUSTER - 1;
121 goto cluster; 231 si->cluster_next = offset;
232 si->cluster_nr = SWAPFILE_CLUSTER - 1;
233 found_free_cluster = 1;
234 goto checks;
122 } 235 }
123 if (unlikely(--latency_ration < 0)) { 236 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 237 cond_resched();
125 latency_ration = LATENCY_LIMIT; 238 latency_ration = LATENCY_LIMIT;
126 } 239 }
127 } 240 }
241
242 offset = si->lowest_bit;
243 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
244
245 /* Locate the first empty (unaligned) cluster */
246 for (; last_in_cluster < scan_base; offset++) {
247 if (si->swap_map[offset])
248 last_in_cluster = offset + SWAPFILE_CLUSTER;
249 else if (offset == last_in_cluster) {
250 spin_lock(&swap_lock);
251 offset -= SWAPFILE_CLUSTER - 1;
252 si->cluster_next = offset;
253 si->cluster_nr = SWAPFILE_CLUSTER - 1;
254 found_free_cluster = 1;
255 goto checks;
256 }
257 if (unlikely(--latency_ration < 0)) {
258 cond_resched();
259 latency_ration = LATENCY_LIMIT;
260 }
261 }
262
263 offset = scan_base;
128 spin_lock(&swap_lock); 264 spin_lock(&swap_lock);
129 goto lowest; 265 si->cluster_nr = SWAPFILE_CLUSTER - 1;
266 si->lowest_alloc = 0;
130 } 267 }
131 268
132 si->cluster_nr--; 269checks:
133cluster: 270 if (!(si->flags & SWP_WRITEOK))
134 offset = si->cluster_next;
135 if (offset > si->highest_bit)
136lowest: offset = si->lowest_bit;
137checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 271 goto no_page;
139 if (!si->highest_bit) 272 if (!si->highest_bit)
140 goto no_page; 273 goto no_page;
141 if (!si->swap_map[offset]) { 274 if (offset > si->highest_bit)
142 if (offset == si->lowest_bit) 275 scan_base = offset = si->lowest_bit;
143 si->lowest_bit++; 276 if (si->swap_map[offset])
144 if (offset == si->highest_bit) 277 goto scan;
145 si->highest_bit--; 278
146 si->inuse_pages++; 279 if (offset == si->lowest_bit)
147 if (si->inuse_pages == si->pages) { 280 si->lowest_bit++;
148 si->lowest_bit = si->max; 281 if (offset == si->highest_bit)
149 si->highest_bit = 0; 282 si->highest_bit--;
283 si->inuse_pages++;
284 if (si->inuse_pages == si->pages) {
285 si->lowest_bit = si->max;
286 si->highest_bit = 0;
287 }
288 si->swap_map[offset] = 1;
289 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING;
291
292 if (si->lowest_alloc) {
293 /*
294 * Only set when SWP_DISCARDABLE, and there's a scan
295 * for a free cluster in progress or just completed.
296 */
297 if (found_free_cluster) {
298 /*
299 * To optimize wear-levelling, discard the
300 * old data of the cluster, taking care not to
301 * discard any of its pages that have already
302 * been allocated by racing tasks (offset has
303 * already stepped over any at the beginning).
304 */
305 if (offset < si->highest_alloc &&
306 si->lowest_alloc <= last_in_cluster)
307 last_in_cluster = si->lowest_alloc - 1;
308 si->flags |= SWP_DISCARDING;
309 spin_unlock(&swap_lock);
310
311 if (offset < last_in_cluster)
312 discard_swap_cluster(si, offset,
313 last_in_cluster - offset + 1);
314
315 spin_lock(&swap_lock);
316 si->lowest_alloc = 0;
317 si->flags &= ~SWP_DISCARDING;
318
319 smp_mb(); /* wake_up_bit advises this */
320 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
321
322 } else if (si->flags & SWP_DISCARDING) {
323 /*
324 * Delay using pages allocated by racing tasks
325 * until the whole discard has been issued. We
326 * could defer that delay until swap_writepage,
327 * but it's easier to keep this self-contained.
328 */
329 spin_unlock(&swap_lock);
330 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
331 wait_for_discard, TASK_UNINTERRUPTIBLE);
332 spin_lock(&swap_lock);
333 } else {
334 /*
335 * Note pages allocated by racing tasks while
 336 * the scan for a free cluster is in progress, so
337 * that its final discard can exclude them.
338 */
339 if (offset < si->lowest_alloc)
340 si->lowest_alloc = offset;
341 if (offset > si->highest_alloc)
342 si->highest_alloc = offset;
150 } 343 }
151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING;
154 return offset;
155 } 344 }
345 return offset;
156 346
347scan:
157 spin_unlock(&swap_lock); 348 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 349 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 350 if (!si->swap_map[offset]) {
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
165 latency_ration = LATENCY_LIMIT; 356 latency_ration = LATENCY_LIMIT;
166 } 357 }
167 } 358 }
359 offset = si->lowest_bit;
360 while (++offset < scan_base) {
361 if (!si->swap_map[offset]) {
362 spin_lock(&swap_lock);
363 goto checks;
364 }
365 if (unlikely(--latency_ration < 0)) {
366 cond_resched();
367 latency_ration = LATENCY_LIMIT;
368 }
369 }
168 spin_lock(&swap_lock); 370 spin_lock(&swap_lock);
169 goto lowest;
170 371
171no_page: 372no_page:
172 si->flags -= SWP_SCANNING; 373 si->flags -= SWP_SCANNING;
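
The discard helpers introduced above (discard_swap() and discard_swap_cluster()) convert page ranges into 512-byte sectors with shifts of (PAGE_SHIFT - 9) before calling blkdev_issue_discard(). A standalone sketch of that arithmetic, assuming 4096-byte pages:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12	/* assumption for the sketch: 4 KiB pages */

int main(void)
{
	uint64_t start_page = 3;	/* skip the swap header area, say */
	uint64_t nr_pages = 256;	/* one SWAPFILE_CLUSTER worth of pages */

	/* each page spans 1 << (PAGE_SHIFT - 9) = 8 sectors of 512 bytes */
	uint64_t start_block = start_page << (PAGE_SHIFT - 9);
	uint64_t nr_blocks = nr_pages << (PAGE_SHIFT - 9);

	printf("discard sectors %llu to %llu\n",
	       (unsigned long long)start_block,
	       (unsigned long long)(start_block + nr_blocks - 1));
	return 0;	/* prints: discard sectors 24 to 2071 */
}
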
@@ -268,10 +469,11 @@ bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 469 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269out: 470out:
270 return NULL; 471 return NULL;
271} 472}
272 473
273static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
274{ 475{
476 unsigned long offset = swp_offset(ent);
275 int count = p->swap_map[offset]; 477 int count = p->swap_map[offset];
276 478
277 if (count < SWAP_MAP_MAX) { 479 if (count < SWAP_MAP_MAX) {
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
286 swap_list.next = p - swap_info; 488 swap_list.next = p - swap_info;
287 nr_swap_pages++; 489 nr_swap_pages++;
288 p->inuse_pages--; 490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
289 } 492 }
290 } 493 }
291 return count; 494 return count;
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry)
301 504
302 p = swap_info_get(entry); 505 p = swap_info_get(entry);
303 if (p) { 506 if (p) {
304 swap_entry_free(p, swp_offset(entry)); 507 swap_entry_free(p, entry);
305 spin_unlock(&swap_lock); 508 spin_unlock(&swap_lock);
306 } 509 }
307} 510}
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page)
326} 529}
327 530
328/* 531/*
329 * We can use this swap cache entry directly 532 * We can write to an anon page without COW if there are no other references
330 * if there are no other references to it. 533 * to it. And as a side-effect, free up its swap: because the old content
534 * on disk will never be read, and seeking back there to write new content
535 * later would only waste time away from clustering.
331 */ 536 */
332int can_share_swap_page(struct page *page) 537int reuse_swap_page(struct page *page)
333{ 538{
334 int count; 539 int count;
335 540
336 BUG_ON(!PageLocked(page)); 541 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 542 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 543 if (count <= 1 && PageSwapCache(page)) {
339 count += page_swapcount(page); 544 count += page_swapcount(page);
545 if (count == 1 && !PageWriteback(page)) {
546 delete_from_swap_cache(page);
547 SetPageDirty(page);
548 }
549 }
340 return count == 1; 550 return count == 1;
341} 551}
342 552
343/* 553/*
344 * Work out if there are any other processes sharing this 554 * If swap is getting full, or if there are no more mappings of this page,
345 * swap cache page. Free it if you can. Return success. 555 * then try_to_free_swap is called to free its swap space.
346 */ 556 */
347static int remove_exclusive_swap_page_count(struct page *page, int count) 557int try_to_free_swap(struct page *page)
348{ 558{
349 int retval; 559 VM_BUG_ON(!PageLocked(page));
350 struct swap_info_struct * p;
351 swp_entry_t entry;
352
353 BUG_ON(PagePrivate(page));
354 BUG_ON(!PageLocked(page));
355 560
356 if (!PageSwapCache(page)) 561 if (!PageSwapCache(page))
357 return 0; 562 return 0;
358 if (PageWriteback(page)) 563 if (PageWriteback(page))
359 return 0; 564 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 565 if (page_swapcount(page))
361 return 0; 566 return 0;
362 567
363 entry.val = page_private(page); 568 delete_from_swap_cache(page);
364 p = swap_info_get(entry); 569 SetPageDirty(page);
365 if (!p) 570 return 1;
366 return 0;
367
368 /* Is the only swap cache user the cache itself? */
369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page);
375 SetPageDirty(page);
376 retval = 1;
377 }
378 spin_unlock_irq(&swapper_space.tree_lock);
379 }
380 spin_unlock(&swap_lock);
381
382 if (retval) {
383 swap_free(entry);
384 page_cache_release(page);
385 }
386
387 return retval;
388}
389
390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407} 571}
408 572
409/* 573/*
410 * Free the swap entry like above, but also try to 574 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 575 * free the page cache entry if it is the last user.
412 */ 576 */
413void free_swap_and_cache(swp_entry_t entry) 577int free_swap_and_cache(swp_entry_t entry)
414{ 578{
415 struct swap_info_struct * p; 579 struct swap_info_struct *p;
416 struct page *page = NULL; 580 struct page *page = NULL;
417 581
418 if (is_migration_entry(entry)) 582 if (is_migration_entry(entry))
419 return; 583 return 1;
420 584
421 p = swap_info_get(entry); 585 p = swap_info_get(entry);
422 if (p) { 586 if (p) {
423 if (swap_entry_free(p, swp_offset(entry)) == 1) { 587 if (swap_entry_free(p, entry) == 1) {
424 page = find_get_page(&swapper_space, entry.val); 588 page = find_get_page(&swapper_space, entry.val);
425 if (page && !trylock_page(page)) { 589 if (page && !trylock_page(page)) {
426 page_cache_release(page); 590 page_cache_release(page);
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry)
430 spin_unlock(&swap_lock); 594 spin_unlock(&swap_lock);
431 } 595 }
432 if (page) { 596 if (page) {
433 int one_user; 597 /*
434 598 * Not mapped elsewhere, or swap space full? Free it!
435 BUG_ON(PagePrivate(page)); 599 * Also recheck PageSwapCache now page is locked (above).
436 one_user = (page_count(page) == 2); 600 */
437 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 601 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 602 (!page_mapped(page) || vm_swap_full())) {
441 delete_from_swap_cache(page); 603 delete_from_swap_cache(page);
442 SetPageDirty(page); 604 SetPageDirty(page);
443 } 605 }
444 unlock_page(page); 606 unlock_page(page);
445 page_cache_release(page); 607 page_cache_release(page);
446 } 608 }
609 return p != NULL;
447} 610}
448 611
449#ifdef CONFIG_HIBERNATION 612#ifdef CONFIG_HIBERNATION
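
reuse_swap_page() above replaces can_share_swap_page(): it still asks whether the anon page is exclusive to the current user, but now also frees the swap slot as a side effect when it is. The decision reduces to a small predicate over the map count and swap count; a self-contained model of it, with plain fields standing in for the kernel's page state:

/* Fields are stand-ins for the kernel's page state, for illustration only. */
struct page_model {
	int mapcount;		/* processes mapping the page */
	int swapcount;		/* references held via the swap map */
	int in_swapcache;
	int under_writeback;
};

int reuse_swap_page_model(struct page_model *p)
{
	int count = p->mapcount;

	if (count <= 1 && p->in_swapcache) {
		count += p->swapcount;
		if (count == 1 && !p->under_writeback) {
			/* exclusive: the real code deletes the swap cache
			 * entry here and marks the page dirty */
			p->in_swapcache = 0;
			p->swapcount = 0;
		}
	}
	return count == 1;
}
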
@@ -530,17 +693,20 @@ unsigned int count_swap_pages(int type, int free)
530static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 693static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
531 unsigned long addr, swp_entry_t entry, struct page *page) 694 unsigned long addr, swp_entry_t entry, struct page *page)
532{ 695{
696 struct mem_cgroup *ptr = NULL;
533 spinlock_t *ptl; 697 spinlock_t *ptl;
534 pte_t *pte; 698 pte_t *pte;
535 int ret = 1; 699 int ret = 1;
536 700
537 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) 701 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr)) {
538 ret = -ENOMEM; 702 ret = -ENOMEM;
703 goto out_nolock;
704 }
539 705
540 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 706 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
541 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 707 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
542 if (ret > 0) 708 if (ret > 0)
543 mem_cgroup_uncharge_page(page); 709 mem_cgroup_cancel_charge_swapin(ptr);
544 ret = 0; 710 ret = 0;
545 goto out; 711 goto out;
546 } 712 }
@@ -550,6 +716,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
550 set_pte_at(vma->vm_mm, addr, pte, 716 set_pte_at(vma->vm_mm, addr, pte,
551 pte_mkold(mk_pte(page, vma->vm_page_prot))); 717 pte_mkold(mk_pte(page, vma->vm_page_prot)));
552 page_add_anon_rmap(page, vma, addr); 718 page_add_anon_rmap(page, vma, addr);
719 mem_cgroup_commit_charge_swapin(page, ptr);
553 swap_free(entry); 720 swap_free(entry);
554 /* 721 /*
555 * Move the page to the active list so it is not 722 * Move the page to the active list so it is not
@@ -558,6 +725,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
558 activate_page(page); 725 activate_page(page);
559out: 726out:
560 pte_unmap_unlock(pte, ptl); 727 pte_unmap_unlock(pte, ptl);
728out_nolock:
561 return ret; 729 return ret;
562} 730}
563 731
@@ -776,10 +944,10 @@ static int try_to_unuse(unsigned int type)
776 break; 944 break;
777 } 945 }
778 946
779 /* 947 /*
780 * Get a page for the entry, using the existing swap 948 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 949 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 950 * page and read the swap into it.
783 */ 951 */
784 swap_map = &si->swap_map[i]; 952 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 953 entry = swp_entry(type, i);
@@ -930,7 +1098,16 @@ static int try_to_unuse(unsigned int type)
930 lock_page(page); 1098 lock_page(page);
931 wait_on_page_writeback(page); 1099 wait_on_page_writeback(page);
932 } 1100 }
933 if (PageSwapCache(page)) 1101
1102 /*
1103 * It is conceivable that a racing task removed this page from
1104 * swap cache just before we acquired the page lock at the top,
1105 * or while we dropped it in unuse_mm(). The page might even
1106 * be back in swap cache on another swap area: that we must not
1107 * delete, since it may not have been written out to swap yet.
1108 */
1109 if (PageSwapCache(page) &&
1110 likely(page_private(page) == entry.val))
934 delete_from_swap_cache(page); 1111 delete_from_swap_cache(page);
935 1112
936 /* 1113 /*
@@ -1203,27 +1380,7 @@ out:
1203 return ret; 1380 return ret;
1204} 1381}
1205 1382
1206#if 0 /* We don't need this yet */ 1383SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
1207#include <linux/backing-dev.h>
1208int page_queue_congested(struct page *page)
1209{
1210 struct backing_dev_info *bdi;
1211
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213
1214 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis;
1217
1218 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else
1221 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi);
1223}
1224#endif
1225
1226asmlinkage long sys_swapoff(const char __user * specialfile)
1227{ 1384{
1228 struct swap_info_struct * p = NULL; 1385 struct swap_info_struct * p = NULL;
1229 unsigned short *swap_map; 1386 unsigned short *swap_map;
@@ -1233,7 +1390,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1233 char * pathname; 1390 char * pathname;
1234 int i, type, prev; 1391 int i, type, prev;
1235 int err; 1392 int err;
1236 1393
1237 if (!capable(CAP_SYS_ADMIN)) 1394 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1395 return -EPERM;
1239 1396
@@ -1253,7 +1410,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1253 spin_lock(&swap_lock); 1410 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1411 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1412 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1413 if (p->flags & SWP_WRITEOK) {
1257 if (p->swap_file->f_mapping == mapping) 1414 if (p->swap_file->f_mapping == mapping)
1258 break; 1415 break;
1259 } 1416 }
@@ -1343,6 +1500,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1343 spin_unlock(&swap_lock); 1500 spin_unlock(&swap_lock);
1344 mutex_unlock(&swapon_mutex); 1501 mutex_unlock(&swapon_mutex);
1345 vfree(swap_map); 1502 vfree(swap_map);
 1503 /* Destroy swap account information */
1504 swap_cgroup_swapoff(type);
1505
1346 inode = mapping->host; 1506 inode = mapping->host;
1347 if (S_ISBLK(inode->i_mode)) { 1507 if (S_ISBLK(inode->i_mode)) {
1348 struct block_device *bdev = I_BDEV(inode); 1508 struct block_device *bdev = I_BDEV(inode);
@@ -1426,12 +1586,12 @@ static int swap_show(struct seq_file *swap, void *v)
1426 file = ptr->swap_file; 1586 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1587 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1588 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1589 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1590 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1591 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1592 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1593 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1594 ptr->prio);
1435 return 0; 1595 return 0;
1436} 1596}
1437 1597
@@ -1476,7 +1636,7 @@ late_initcall(max_swapfiles_check);
1476 * 1636 *
1477 * The swapon system call 1637 * The swapon system call
1478 */ 1638 */
1479asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) 1639SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
1480{ 1640{
1481 struct swap_info_struct * p; 1641 struct swap_info_struct * p;
1482 char *name = NULL; 1642 char *name = NULL;
@@ -1487,12 +1647,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1487 int i, prev; 1647 int i, prev;
1488 int error; 1648 int error;
1489 union swap_header *swap_header = NULL; 1649 union swap_header *swap_header = NULL;
1490 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1650 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1651 int nr_extents = 0;
1493 sector_t span; 1652 sector_t span;
1494 unsigned long maxpages = 1; 1653 unsigned long maxpages = 1;
1495 int swapfilesize; 1654 unsigned long swapfilepages;
1496 unsigned short *swap_map = NULL; 1655 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1656 struct page *page = NULL;
1498 struct inode *inode = NULL; 1657 struct inode *inode = NULL;
@@ -1570,7 +1729,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1570 goto bad_swap; 1729 goto bad_swap;
1571 } 1730 }
1572 1731
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1732 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1574 1733
1575 /* 1734 /*
1576 * Read the swap header. 1735 * Read the swap header.
@@ -1584,102 +1743,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1584 error = PTR_ERR(page); 1743 error = PTR_ERR(page);
1585 goto bad_swap; 1744 goto bad_swap;
1586 } 1745 }
1587 kmap(page); 1746 swap_header = kmap(page);
1588 swap_header = page_address(page);
1589 1747
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1748 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1591 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2;
1594 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1749 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1750 error = -EINVAL;
1597 goto bad_swap; 1751 goto bad_swap;
1598 } 1752 }
1599 1753
 1600 switch (swap_header_version) { 1754 /* swap partition endianness hack... */
1601 case 1: 1755 if (swab32(swap_header->info.version) == 1) {
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1756 swab32s(&swap_header->info.version);
1603 "Use mkswap -v1 %s\n", name); 1757 swab32s(&swap_header->info.last_page);
1758 swab32s(&swap_header->info.nr_badpages);
1759 for (i = 0; i < swap_header->info.nr_badpages; i++)
1760 swab32s(&swap_header->info.badpages[i]);
1761 }
1762 /* Check the swap header's sub-version */
1763 if (swap_header->info.version != 1) {
1764 printk(KERN_WARNING
1765 "Unable to handle swap header version %d\n",
1766 swap_header->info.version);
1604 error = -EINVAL; 1767 error = -EINVAL;
1605 goto bad_swap; 1768 goto bad_swap;
1606 case 2: 1769 }
 1607 /* swap partition endianness hack... */
1608 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]);
1614 }
1615 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n",
1620 swap_header->info.version);
1621 error = -EINVAL;
1622 goto bad_swap;
1623 }
1624 1770
1625 p->lowest_bit = 1; 1771 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1772 p->cluster_next = 1;
1627 1773
1628 /* 1774 /*
1629 * Find out how many pages are allowed for a single swap 1775 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1776 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1777 * bits for the swap offset in the swp_entry_t type and
 1632 * 2) the number of bits in a swap pte as defined by 1778 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1779 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1780 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1781 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1782 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1783 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1784 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1785 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1786 * swap pte.
1641 */ 1787 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1788 maxpages = swp_offset(pte_to_swp_entry(
1643 if (maxpages > swap_header->info.last_page) 1789 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
1644 maxpages = swap_header->info.last_page; 1790 if (maxpages > swap_header->info.last_page)
1645 p->highest_bit = maxpages - 1; 1791 maxpages = swap_header->info.last_page;
1792 p->highest_bit = maxpages - 1;
1646 1793
1647 error = -EINVAL; 1794 error = -EINVAL;
1648 if (!maxpages) 1795 if (!maxpages)
1649 goto bad_swap; 1796 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1797 if (swapfilepages && maxpages > swapfilepages) {
1651 printk(KERN_WARNING 1798 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1799 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1800 goto bad_swap;
1654 } 1801 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1802 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1803 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1804 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1805 goto bad_swap;
1659 1806
1660 /* OK, set up the swap map and apply the bad block list */ 1807 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1808 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1809 if (!swap_map) {
1663 error = -ENOMEM; 1810 error = -ENOMEM;
1664 goto bad_swap; 1811 goto bad_swap;
1665 } 1812 }
1666 1813
1667 error = 0; 1814 memset(swap_map, 0, maxpages * sizeof(short));
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1815 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1816 int page_nr = swap_header->info.badpages[i];
1670 int page_nr = swap_header->info.badpages[i]; 1817 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1818 error = -EINVAL;
1672 error = -EINVAL;
1673 else
1674 swap_map[page_nr] = SWAP_MAP_BAD;
1675 }
1676 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages -
1678 1 /* header page */;
1679 if (error)
1680 goto bad_swap; 1819 goto bad_swap;
1820 }
1821 swap_map[page_nr] = SWAP_MAP_BAD;
1681 } 1822 }
1682 1823
1824 error = swap_cgroup_swapon(type, maxpages);
1825 if (error)
1826 goto bad_swap;
1827
1828 nr_good_pages = swap_header->info.last_page -
1829 swap_header->info.nr_badpages -
1830 1 /* header page */;
1831
1683 if (nr_good_pages) { 1832 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1833 swap_map[0] = SWAP_MAP_BAD;
1685 p->max = maxpages; 1834 p->max = maxpages;
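
The swapon() rewrite above drops version-0 header support and keeps the endianness fix-up: a header written by mkswap on an opposite-endian machine reads back with a byte-swapped version field, which is detected by checking whether swab32() of it equals 1. A small user-space model of that check (__builtin_bswap32 stands in for the kernel's swab32()):

#include <stdint.h>
#include <stdio.h>

static uint32_t swab32(uint32_t x)
{
	return __builtin_bswap32(x);	/* gcc/clang byte-swap builtin */
}

int main(void)
{
	uint32_t version = 0x01000000;	/* "1" as seen from the wrong endianness */

	if (swab32(version) == 1)	/* only sane after swapping: fix it up */
		version = swab32(version);

	printf("swap header version %u\n", version);	/* prints 1 */
	return 0;
}
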
@@ -1697,6 +1846,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1697 goto bad_swap; 1846 goto bad_swap;
1698 } 1847 }
1699 1848
1849 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1850 p->flags |= SWP_SOLIDSTATE;
1851 p->cluster_next = 1 + (random32() % p->highest_bit);
1852 }
1853 if (discard_swap(p) == 0)
1854 p->flags |= SWP_DISCARDABLE;
1855
1700 mutex_lock(&swapon_mutex); 1856 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1857 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1858 if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1861,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1705 else 1861 else
1706 p->prio = --least_priority; 1862 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1863 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1864 p->flags |= SWP_WRITEOK;
1709 nr_swap_pages += nr_good_pages; 1865 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1866 total_swap_pages += nr_good_pages;
1711 1867
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1868 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1869 "Priority:%d extents:%d across:%lluk %s%s\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1870 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1871 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
1872 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
1873 (p->flags & SWP_DISCARDABLE) ? "D" : "");
1716 1874
1717 /* insert swap space into swap_list: */ 1875 /* insert swap space into swap_list: */
1718 prev = -1; 1876 prev = -1;
@@ -1738,6 +1896,7 @@ bad_swap:
1738 bd_release(bdev); 1896 bd_release(bdev);
1739 } 1897 }
1740 destroy_swap_extents(p); 1898 destroy_swap_extents(p);
1899 swap_cgroup_swapoff(type);
1741bad_swap_2: 1900bad_swap_2:
1742 spin_lock(&swap_lock); 1901 spin_lock(&swap_lock);
1743 p->swap_file = NULL; 1902 p->swap_file = NULL;
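
The swapoff()/swapon() entry points are also converted from open-coded asmlinkage prototypes to the SYSCALL_DEFINEn() macros. Ignoring the argument-widening wrappers some architectures add, the macro boils down to generating the same prototype from a list of (type, name) pairs; a simplified sketch of the idea, not the kernel's actual definition:

/* simplified two-argument case; the real macro family lives in
 * <linux/syscalls.h> and may also emit per-architecture wrappers */
#define SYSCALL_DEFINE2(name, t1, a1, t2, a2) \
	asmlinkage long sys_##name(t1 a1, t2 a2)

SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
	/* body unchanged */
}
/* expands to:
 *   asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags)
 */
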
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d575ee6e..000000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small system where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/vfs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/swap.h>
21#include <linux/ramfs.h>
22
23static struct file_system_type tmpfs_fs_type = {
24 .name = "tmpfs",
25 .get_sb = ramfs_get_sb,
26 .kill_sb = kill_litter_super,
27};
28
29static struct vfsmount *shm_mnt;
30
31static int __init init_tmpfs(void)
32{
33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
34
35 shm_mnt = kern_mount(&tmpfs_fs_type);
36 BUG_ON(IS_ERR(shm_mnt));
37
38 return 0;
39}
40module_init(init_tmpfs)
41
42/**
43 * shmem_file_setup - get an unlinked file living in tmpfs
44 * @name: name for dentry (to be seen in /proc/<pid>/maps
45 * @size: size to be set for the file
46 * @flags: vm_flags
47 */
48struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
49{
50 int error;
51 struct file *file;
52 struct inode *inode;
53 struct dentry *dentry, *root;
54 struct qstr this;
55
56 if (IS_ERR(shm_mnt))
57 return (void *)shm_mnt;
58
59 error = -ENOMEM;
60 this.name = name;
61 this.len = strlen(name);
62 this.hash = 0; /* will go */
63 root = shm_mnt->mnt_root;
64 dentry = d_alloc(root, &this);
65 if (!dentry)
66 goto put_memory;
67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
73 error = -ENOSPC;
74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
75 if (!inode)
76 goto close_file;
77
78 d_instantiate(dentry, inode);
79 inode->i_size = size;
80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
83
84#ifndef CONFIG_MMU
85 error = ramfs_nommu_expand_for_mapping(inode, size);
86 if (error)
87 goto close_file;
88#endif
89 return file;
90
91close_file:
92 put_filp(file);
93put_dentry:
94 dput(dentry);
95put_memory:
96 return ERR_PTR(error);
97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
99
100/**
101 * shmem_zero_setup - setup a shared anonymous mapping
102 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
103 */
104int shmem_zero_setup(struct vm_area_struct *vma)
105{
106 struct file *file;
107 loff_t size = vma->vm_end - vma->vm_start;
108
109 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
110 if (IS_ERR(file))
111 return PTR_ERR(file);
112
113 if (vma->vm_file)
114 fput(vma->vm_file);
115 vma->vm_file = file;
116 vma->vm_ops = &generic_file_vm_ops;
117 return 0;
118}
119
120int shmem_unuse(swp_entry_t entry, struct page *page)
121{
122 return 0;
123}
124
125#ifndef CONFIG_MMU
126unsigned long shmem_get_unmapped_area(struct file *file,
127 unsigned long addr,
128 unsigned long len,
129 unsigned long pgoff,
130 unsigned long flags)
131{
132 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
133}
134#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 1ddb77ba3995..75f49d312e8c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -23,6 +23,7 @@
23#include <linux/rbtree.h> 23#include <linux/rbtree.h>
24#include <linux/radix-tree.h> 24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h> 25#include <linux/rcupdate.h>
26#include <linux/bootmem.h>
26 27
27#include <asm/atomic.h> 28#include <asm/atomic.h>
28#include <asm/uaccess.h> 29#include <asm/uaccess.h>
@@ -151,11 +152,12 @@ static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
151 * 152 *
152 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] 153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
153 */ 154 */
154static int vmap_page_range(unsigned long addr, unsigned long end, 155static int vmap_page_range(unsigned long start, unsigned long end,
155 pgprot_t prot, struct page **pages) 156 pgprot_t prot, struct page **pages)
156{ 157{
157 pgd_t *pgd; 158 pgd_t *pgd;
158 unsigned long next; 159 unsigned long next;
160 unsigned long addr = start;
159 int err = 0; 161 int err = 0;
160 int nr = 0; 162 int nr = 0;
161 163
@@ -167,7 +169,7 @@ static int vmap_page_range(unsigned long addr, unsigned long end,
167 if (err) 169 if (err)
168 break; 170 break;
169 } while (pgd++, addr = next, addr != end); 171 } while (pgd++, addr = next, addr != end);
170 flush_cache_vmap(addr, end); 172 flush_cache_vmap(start, end);
171 173
172 if (unlikely(err)) 174 if (unlikely(err))
173 return err; 175 return err;
@@ -380,8 +382,9 @@ found:
380 goto retry; 382 goto retry;
381 } 383 }
382 if (printk_ratelimit()) 384 if (printk_ratelimit())
383 printk(KERN_WARNING "vmap allocation failed: " 385 printk(KERN_WARNING
384 "use vmalloc=<size> to increase size.\n"); 386 "vmap allocation for size %lu failed: "
387 "use vmalloc=<size> to increase size.\n", size);
385 return ERR_PTR(-EBUSY); 388 return ERR_PTR(-EBUSY);
386 } 389 }
387 390
@@ -431,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
431 vunmap_page_range(va->va_start, va->va_end); 434 vunmap_page_range(va->va_start, va->va_end);
432} 435}
433 436
437static void vmap_debug_free_range(unsigned long start, unsigned long end)
438{
439 /*
440 * Unmap page tables and force a TLB flush immediately if
441 * CONFIG_DEBUG_PAGEALLOC is set. This catches use after free
442 * bugs similarly to those in linear kernel virtual address
443 * space after a page has been freed.
444 *
445 * All the lazy freeing logic is still retained, in order to
446 * minimise intrusiveness of this debugging feature.
447 *
448 * This is going to be *slow* (linear kernel virtual address
449 * debugging doesn't do a broadcast TLB flush so it is a lot
450 * faster).
451 */
452#ifdef CONFIG_DEBUG_PAGEALLOC
453 vunmap_page_range(start, end);
454 flush_tlb_kernel_range(start, end);
455#endif
456}
457
434/* 458/*
435 * lazy_max_pages is the maximum amount of virtual address space we gather up 459 * lazy_max_pages is the maximum amount of virtual address space we gather up
436 * before attempting to purge with a TLB flush. 460 * before attempting to purge with a TLB flush.
@@ -911,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
911 BUG_ON(addr & (PAGE_SIZE-1)); 935 BUG_ON(addr & (PAGE_SIZE-1));
912 936
913 debug_check_no_locks_freed(mem, size); 937 debug_check_no_locks_freed(mem, size);
938 vmap_debug_free_range(addr, addr+size);
914 939
915 if (likely(count <= VMAP_MAX_ALLOC)) 940 if (likely(count <= VMAP_MAX_ALLOC))
916 vb_free(mem, size); 941 vb_free(mem, size);
@@ -959,6 +984,8 @@ EXPORT_SYMBOL(vm_map_ram);
959 984
960void __init vmalloc_init(void) 985void __init vmalloc_init(void)
961{ 986{
987 struct vmap_area *va;
988 struct vm_struct *tmp;
962 int i; 989 int i;
963 990
964 for_each_possible_cpu(i) { 991 for_each_possible_cpu(i) {
@@ -971,6 +998,14 @@ void __init vmalloc_init(void)
971 vbq->nr_dirty = 0; 998 vbq->nr_dirty = 0;
972 } 999 }
973 1000
1001 /* Import existing vmlist entries. */
1002 for (tmp = vmlist; tmp; tmp = tmp->next) {
1003 va = alloc_bootmem(sizeof(struct vmap_area));
1004 va->flags = tmp->flags | VM_VM_AREA;
1005 va->va_start = (unsigned long)tmp->addr;
1006 va->va_end = va->va_start + tmp->size;
1007 __insert_vmap_area(va);
1008 }
974 vmap_initialized = true; 1009 vmap_initialized = true;
975} 1010}
976 1011
@@ -1127,6 +1162,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1127 if (va && va->flags & VM_VM_AREA) { 1162 if (va && va->flags & VM_VM_AREA) {
1128 struct vm_struct *vm = va->private; 1163 struct vm_struct *vm = va->private;
1129 struct vm_struct *tmp, **p; 1164 struct vm_struct *tmp, **p;
1165
1166 vmap_debug_free_range(va->va_start, va->va_end);
1130 free_unmap_vmap_area(va); 1167 free_unmap_vmap_area(va);
1131 vm->size -= PAGE_SIZE; 1168 vm->size -= PAGE_SIZE;
1132 1169
@@ -1374,7 +1411,8 @@ void *vmalloc_user(unsigned long size)
1374 struct vm_struct *area; 1411 struct vm_struct *area;
1375 void *ret; 1412 void *ret;
1376 1413
1377 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1414 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1415 PAGE_KERNEL, -1, __builtin_return_address(0));
1378 if (ret) { 1416 if (ret) {
1379 area = find_vm_area(ret); 1417 area = find_vm_area(ret);
1380 area->flags |= VM_USERMAP; 1418 area->flags |= VM_USERMAP;
@@ -1419,7 +1457,8 @@ EXPORT_SYMBOL(vmalloc_node);
1419 1457
1420void *vmalloc_exec(unsigned long size) 1458void *vmalloc_exec(unsigned long size)
1421{ 1459{
1422 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 1460 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1461 -1, __builtin_return_address(0));
1423} 1462}
1424 1463
1425#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1464#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1439,7 +1478,8 @@ void *vmalloc_exec(unsigned long size)
1439 */ 1478 */
1440void *vmalloc_32(unsigned long size) 1479void *vmalloc_32(unsigned long size)
1441{ 1480{
1442 return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); 1481 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
1482 -1, __builtin_return_address(0));
1443} 1483}
1444EXPORT_SYMBOL(vmalloc_32); 1484EXPORT_SYMBOL(vmalloc_32);
1445 1485
@@ -1455,7 +1495,8 @@ void *vmalloc_32_user(unsigned long size)
1455 struct vm_struct *area; 1495 struct vm_struct *area;
1456 void *ret; 1496 void *ret;
1457 1497
1458 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1498 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1499 -1, __builtin_return_address(0));
1459 if (ret) { 1500 if (ret) {
1460 area = find_vm_area(ret); 1501 area = find_vm_area(ret);
1461 area->flags |= VM_USERMAP; 1502 area->flags |= VM_USERMAP;
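
The vmalloc_user()/vmalloc_exec()/vmalloc_32()/vmalloc_32_user() wrappers above now go through __vmalloc_node() and pass __builtin_return_address(0), so the recorded caller is whoever invoked the wrapper rather than the wrapper itself. A minimal user-space illustration of that GCC builtin (names invented for the sketch):

#include <stdio.h>

static void record_alloc(unsigned long size, void *caller)
{
	printf("allocation of %lu bytes requested from %p\n", size, caller);
}

static void *my_vmalloc_wrapper(unsigned long size)
{
	/* attribute the allocation to the wrapper's caller, not the wrapper */
	record_alloc(size, __builtin_return_address(0));
	return 0;	/* the allocation itself is elided in this sketch */
}

int main(void)
{
	my_vmalloc_wrapper(128);
	return 0;
}
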
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 62e7f62fb559..9a27c44aa327 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list);
122static DECLARE_RWSEM(shrinker_rwsem); 125static DECLARE_RWSEM(shrinker_rwsem);
123 126
124#ifdef CONFIG_CGROUP_MEM_RES_CTLR 127#ifdef CONFIG_CGROUP_MEM_RES_CTLR
125#define scan_global_lru(sc) (!(sc)->mem_cgroup) 128#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
126#else 129#else
127#define scan_global_lru(sc) (1) 130#define scanning_global_lru(sc) (1)
128#endif 131#endif
129 132
133static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
134 struct scan_control *sc)
135{
136 if (!scanning_global_lru(sc))
137 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
138
139 return &zone->reclaim_stat;
140}
141
142static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
143 enum lru_list lru)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
147
148 return zone_page_state(zone, NR_LRU_BASE + lru);
149}
150
151
130/* 152/*
131 * Add a shrinker callback to be called from the vm 153 * Add a shrinker callback to be called from the vm
132 */ 154 */
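
get_reclaim_stat() and zone_nr_pages() above pick the statistics source once, based on scanning_global_lru(sc), so the rest of the reclaim path can update recent_scanned/recent_rotated without caring whether it is doing global or per-cgroup reclaim. The same dispatch shape, reduced to a self-contained sketch with invented types:

/* Invented stand-in types; this only models the "pick the stat source" step. */
struct reclaim_stat_model {
	unsigned long recent_scanned[2];
	unsigned long recent_rotated[2];
};

struct zone_model { struct reclaim_stat_model reclaim_stat; };
struct memcg_model { struct reclaim_stat_model reclaim_stat; };
struct scan_control_model { struct memcg_model *mem_cgroup; };

struct reclaim_stat_model *get_reclaim_stat_model(struct zone_model *zone,
						  struct scan_control_model *sc)
{
	if (sc->mem_cgroup)			/* per-cgroup reclaim */
		return &sc->mem_cgroup->reclaim_stat;
	return &zone->reclaim_stat;		/* global reclaim */
}
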
@@ -509,7 +531,6 @@ redo:
509 lru = LRU_UNEVICTABLE; 531 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page); 532 add_page_to_unevictable_list(page);
511 } 533 }
512 mem_cgroup_move_lists(page, lru);
513 534
514 /* 535 /*
515 * page's status can change while we move it among lru. If an evictable 536 * page's status can change while we move it among lru. If an evictable
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page)
544 565
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page); 566 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru); 567 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page); 568 put_page(page);
549} 569}
550#endif /* CONFIG_UNEVICTABLE_LRU */ 570#endif /* CONFIG_UNEVICTABLE_LRU */
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 637 referenced && page_mapping_inuse(page))
618 goto activate_locked; 638 goto activate_locked;
619 639
620#ifdef CONFIG_SWAP
621 /* 640 /*
622 * Anonymous process memory has backing store? 641 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 642 * Try to allocate it some swap space here.
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 644 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 645 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 646 goto keep_locked;
628 switch (try_to_munlock(page)) { 647 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 648 goto activate_locked;
639 may_enter_fs = 1; 649 may_enter_fs = 1;
640 } 650 }
641#endif /* CONFIG_SWAP */
642 651
643 mapping = page_mapping(page); 652 mapping = page_mapping(page);
644 653
@@ -752,6 +761,8 @@ free_it:
752 continue; 761 continue;
753 762
754cull_mlocked: 763cull_mlocked:
764 if (PageSwapCache(page))
765 try_to_free_swap(page);
755 unlock_page(page); 766 unlock_page(page);
756 putback_lru_page(page); 767 putback_lru_page(page);
757 continue; 768 continue;
@@ -759,7 +770,7 @@ cull_mlocked:
759activate_locked: 770activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 771 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 772 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 773 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 774 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 775 SetPageActive(page);
765 pgactivate++; 776 pgactivate++;
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
819 return ret; 830 return ret;
820 831
821 ret = -EBUSY; 832 ret = -EBUSY;
833
822 if (likely(get_page_unless_zero(page))) { 834 if (likely(get_page_unless_zero(page))) {
823 /* 835 /*
824 * Be careful not to clear PageLRU until after we're 836 * Be careful not to clear PageLRU until after we're
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
827 */ 839 */
828 ClearPageLRU(page); 840 ClearPageLRU(page);
829 ret = 0; 841 ret = 0;
842 mem_cgroup_del_lru(page);
830 } 843 }
831 844
832 return ret; 845 return ret;
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1035 struct pagevec pvec; 1048 struct pagevec pvec;
1036 unsigned long nr_scanned = 0; 1049 unsigned long nr_scanned = 0;
1037 unsigned long nr_reclaimed = 0; 1050 unsigned long nr_reclaimed = 0;
1051 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1038 1052
1039 pagevec_init(&pvec, 1); 1053 pagevec_init(&pvec, 1);
1040 1054
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1090 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]); 1091 -count[LRU_INACTIVE_ANON]);
1078 1092
1079 if (scan_global_lru(sc)) { 1093 if (scanning_global_lru(sc))
1080 zone->pages_scanned += nr_scan; 1094 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1095
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1096 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; 1097 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; 1098 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1085 } 1099 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1100
1086 spin_unlock_irq(&zone->lru_lock); 1101 spin_unlock_irq(&zone->lru_lock);
1087 1102
1088 nr_scanned += nr_scan; 1103 nr_scanned += nr_scan;
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1114 if (current_is_kswapd()) { 1129 if (current_is_kswapd()) {
1115 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1130 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1116 __count_vm_events(KSWAPD_STEAL, nr_freed); 1131 __count_vm_events(KSWAPD_STEAL, nr_freed);
1117 } else if (scan_global_lru(sc)) 1132 } else if (scanning_global_lru(sc))
1118 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 1133 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1119 1134
1120 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1135 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 SetPageLRU(page); 1155 SetPageLRU(page);
1141 lru = page_lru(page); 1156 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru); 1157 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru); 1158 if (PageActive(page)) {
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page); 1159 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++; 1160 reclaim_stat->recent_rotated[file]++;
1147 } 1161 }
1148 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1149 spin_unlock_irq(&zone->lru_lock); 1163 spin_unlock_irq(&zone->lru_lock);
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1187 zone->prev_priority = priority;
1174} 1188}
1175 1189
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1190/*
1182 * This moves pages from the active list to the inactive list. 1191 * This moves pages from the active list to the inactive list.
1183 * 1192 *
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1208 struct page *page; 1217 struct page *page;
1209 struct pagevec pvec; 1218 struct pagevec pvec;
1210 enum lru_list lru; 1219 enum lru_list lru;
1220 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1211 1221
1212 lru_add_drain(); 1222 lru_add_drain();
1213 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1218 * zone->pages_scanned is used for detect zone's oom 1228 * zone->pages_scanned is used for detect zone's oom
1219 * mem_cgroup remembers nr_scan by itself. 1229 * mem_cgroup remembers nr_scan by itself.
1220 */ 1230 */
1221 if (scan_global_lru(sc)) { 1231 if (scanning_global_lru(sc)) {
1222 zone->pages_scanned += pgscanned; 1232 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 } 1233 }
1234 reclaim_stat->recent_scanned[!!file] += pgmoved;
1225 1235
1226 if (file) 1236 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1237 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1258 list_add(&page->lru, &l_inactive);
1249 } 1259 }
1250 1260
1261 /*
1262 * Move the pages to the [file or anon] inactive list.
1263 */
1264 pagevec_init(&pvec, 1);
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267
1251 spin_lock_irq(&zone->lru_lock); 1268 spin_lock_irq(&zone->lru_lock);
1252 /* 1269 /*
1253 * Count referenced pages from currently used mappings as 1270 * Count referenced pages from currently used mappings as
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1272 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1273 * pages in get_scan_ratio.
1257 */ 1274 */
1258 zone->recent_rotated[!!file] += pgmoved; 1275 reclaim_stat->recent_rotated[!!file] += pgmoved;
1259 1276
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1277 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1278 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1273 ClearPageActive(page); 1283 ClearPageActive(page);
1274 1284
1275 list_move(&page->lru, &zone->lru[lru].list); 1285 list_move(&page->lru, &zone->lru[lru].list);
1276 mem_cgroup_move_lists(page, lru); 1286 mem_cgroup_add_lru_list(page, lru);
1277 pgmoved++; 1287 pgmoved++;
1278 if (!pagevec_add(&pvec, page)) { 1288 if (!pagevec_add(&pvec, page)) {
1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1302 pagevec_release(&pvec); 1312 pagevec_release(&pvec);
1303} 1313}
1304 1314
1315static int inactive_anon_is_low_global(struct zone *zone)
1316{
1317 unsigned long active, inactive;
1318
1319 active = zone_page_state(zone, NR_ACTIVE_ANON);
1320 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1321
1322 if (inactive * zone->inactive_ratio < active)
1323 return 1;
1324
1325 return 0;
1326}
1327
1328/**
1329 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1330 * @zone: zone to check
1331 * @sc: scan control of this context
1332 *
1333 * Returns true if the zone does not have enough inactive anon pages,
1334 * meaning some active anon pages need to be deactivated.
1335 */
1336static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1337{
1338 int low;
1339
1340 if (scanning_global_lru(sc))
1341 low = inactive_anon_is_low_global(zone);
1342 else
1343 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344 return low;
1345}
1346
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1347static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority) 1348 struct zone *zone, struct scan_control *sc, int priority)
1307{ 1349{
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1312 return 0; 1354 return 0;
1313 } 1355 }
1314 1356
1315 if (lru == LRU_ACTIVE_ANON && 1357 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1358 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0; 1359 return 0;
1319 } 1360 }
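
inactive_anon_is_low_global(), added above, declares the zone short of inactive anonymous pages whenever inactive * zone->inactive_ratio < active, and the wrapper defers to the memory controller for cgroup reclaim. A small standalone check with the same arithmetic; the ratio value below is only an example, the kernel derives it from the zone size.

/* Standalone version of the "is the inactive anon list too small?" test. */
#include <stdio.h>

static int inactive_is_low(unsigned long active, unsigned long inactive,
			   unsigned int inactive_ratio)
{
	return inactive * inactive_ratio < active;
}

int main(void)
{
	/* 900 active vs. 200 inactive with ratio 3: 600 < 900, so "low" --
	 * some active anon pages should be deactivated. */
	printf("low=%d\n", inactive_is_low(900, 200, 3));
	/* 300 active vs. 200 inactive: 600 >= 300, so not low. */
	printf("low=%d\n", inactive_is_low(300, 200, 3));
	return 0;
}
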
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1335 unsigned long anon, file, free; 1376 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio; 1377 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1378 unsigned long ap, fp;
1338 1379 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344 1380
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1381 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1382 if (nr_swap_pages <= 0) {
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1385 return;
1350 } 1386 }
1351 1387
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1388 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1353 if (unlikely(file + free <= zone->pages_high)) { 1389 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1354 percent[0] = 100; 1390 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1355 percent[1] = 0; 1391 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1356 return; 1392
1393 if (scanning_global_lru(sc)) {
1394 free = zone_page_state(zone, NR_FREE_PAGES);
1395 /* If we have very few page cache pages,
1396 force-scan anon pages. */
1397 if (unlikely(file + free <= zone->pages_high)) {
1398 percent[0] = 100;
1399 percent[1] = 0;
1400 return;
1401 }
1357 } 1402 }
1358 1403
1359 /* 1404 /*
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1367 * 1412 *
1368 * anon in [0], file in [1] 1413 * anon in [0], file in [1]
1369 */ 1414 */
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) { 1415 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock); 1416 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2; 1417 reclaim_stat->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2; 1418 reclaim_stat->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock); 1419 spin_unlock_irq(&zone->lru_lock);
1375 } 1420 }
1376 1421
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) { 1422 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2; 1424 reclaim_stat->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2; 1425 reclaim_stat->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock); 1426 spin_unlock_irq(&zone->lru_lock);
1382 } 1427 }
1383 1428
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1393 * proportional to the fraction of recently scanned pages on 1438 * proportional to the fraction of recently scanned pages on
1394 * each list that were recently referenced and in active use. 1439 * each list that were recently referenced and in active use.
1395 */ 1440 */
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); 1441 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1; 1442 ap /= reclaim_stat->recent_rotated[0] + 1;
1398 1443
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); 1444 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1; 1445 fp /= reclaim_stat->recent_rotated[1] + 1;
1401 1446
1402 /* Normalize to percentages */ 1447 /* Normalize to percentages */
1403 percent[0] = 100 * ap / (ap + fp + 1); 1448 percent[0] = 100 * ap / (ap + fp + 1);
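
The arithmetic above weighs anon against file pages: each side's pressure is (prio + 1) * (recent_scanned + 1) / (recent_rotated + 1), and the two pressures are then normalised to percentages; the decayed recent_* counters now come from the reclaim_stat picked earlier. A worked example with invented numbers, kept outside the kernel.

/* Worked example of the anon/file scan balancing in get_scan_ratio().
 * The priorities and recent_* counts below are made up for illustration. */
#include <stdio.h>

int main(void)
{
	unsigned long anon_prio = 60, file_prio = 140;       /* e.g. swappiness 60 */
	unsigned long recent_scanned[2] = { 4000, 12000 };   /* [0] anon, [1] file */
	unsigned long recent_rotated[2] = { 3000, 1000 };    /* pages found still active */

	unsigned long ap = (anon_prio + 1) * (recent_scanned[0] + 1);
	ap /= recent_rotated[0] + 1;
	unsigned long fp = (file_prio + 1) * (recent_scanned[1] + 1);
	fp /= recent_rotated[1] + 1;

	unsigned long percent[2];
	percent[0] = 100 * ap / (ap + fp + 1);   /* share of scanning given to anon */
	percent[1] = 100 - percent[0];           /* the remainder goes to file pages */

	printf("anon %lu%%, file %lu%%\n", percent[0], percent[1]);
	return 0;
}
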
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1453/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1454 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1455 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1456static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1457 struct scan_control *sc)
1413{ 1458{
1414 unsigned long nr[NR_LRU_LISTS]; 1459 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1460 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1461 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1462 enum lru_list l;
1463 unsigned long nr_reclaimed = sc->nr_reclaimed;
1464 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1465
1420 get_scan_ratio(zone, sc, percent); 1466 get_scan_ratio(zone, sc, percent);
1421 1467
1422 for_each_evictable_lru(l) { 1468 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) { 1469 int file = is_file_lru(l);
1424 int file = is_file_lru(l); 1470 int scan;
1425 int scan; 1471
1426 1472 scan = zone_page_state(zone, NR_LRU_BASE + l);
1427 scan = zone_page_state(zone, NR_LRU_BASE + l); 1473 if (priority) {
1428 if (priority) { 1474 scan >>= priority;
1429 scan >>= priority; 1475 scan = (scan * percent[file]) / 100;
1430 scan = (scan * percent[file]) / 100; 1476 }
1431 } 1477 if (scanning_global_lru(sc)) {
1432 zone->lru[l].nr_scan += scan; 1478 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1479 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1480 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1481 zone->lru[l].nr_scan = 0;
1436 else 1482 else
1437 nr[l] = 0; 1483 nr[l] = 0;
1438 } else { 1484 } else
1439 /* 1485 nr[l] = scan;
1440 * This reclaim occurs not because zone memory shortage
1441 * but because memory controller hits its limit.
1442 * Don't modify zone reclaim related data.
1443 */
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1446 }
1447 } 1486 }
1448 1487
1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1488 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1450 nr[LRU_INACTIVE_FILE]) { 1489 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1490 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1491 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1492 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1493 nr[l] -= nr_to_scan;
1456 1494
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1495 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1496 zone, sc, priority);
1459 } 1497 }
1460 } 1498 }
1499 /*
1500 * On large memory systems, scan >> priority can become
1501 * really large. This is fine for the starting priority;
1502 * we want to put equal scanning pressure on each zone.
1503 * However, if the VM has a harder time of freeing pages,
1504 * with multiple processes reclaiming pages, the total
1505 * freeing target can get unreasonably large.
1506 */
1507 if (nr_reclaimed > swap_cluster_max &&
1508 priority < DEF_PRIORITY && !current_is_kswapd())
1509 break;
1461 } 1510 }
1462 1511
1512 sc->nr_reclaimed = nr_reclaimed;
1513
1463 /* 1514 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1515 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1516 * rebalance the anon lru active/inactive ratio.
1466 */ 1517 */
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) 1518 if (inactive_anon_is_low(zone, sc))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1519 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1520
1472 throttle_vm_writeout(sc->gfp_mask); 1521 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1522}
1475 1523
1476/* 1524/*
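
shrink_zone() now derives every list's scan target from the list size shifted right by the scan priority and scaled by those percentages, accumulates progress in sc->nr_reclaimed instead of a return value, and the new early exit stops a direct reclaimer once it has freed more than swap_cluster_max pages at elevated priority. A compact sketch of that loop shape; all constants and names are stand-ins.

/* Sketch of the shrink_zone() scan-budget loop. Not kernel code. */
#include <stdio.h>

#define NR_LISTS          4
#define SWAP_CLUSTER_MAX  32UL
#define DEF_PRIORITY      12

int main(void)
{
	unsigned long list_size[NR_LISTS] = { 40000, 20000, 800000, 100000 };
	unsigned long percent[2] = { 10, 90 };    /* anon, file (from the ratio code) */
	int priority = 10;                        /* < DEF_PRIORITY: not the first pass */
	unsigned long nr[NR_LISTS], nr_reclaimed = 0;

	for (int l = 0; l < NR_LISTS; l++) {
		int file = l >= 2;                /* pretend lists 2 and 3 are file LRUs */
		unsigned long scan = list_size[l] >> priority;
		nr[l] = scan * percent[file] / 100;
	}

	while (nr[0] || nr[1] || nr[2] || nr[3]) {
		for (int l = 0; l < NR_LISTS; l++) {
			unsigned long chunk = nr[l] < SWAP_CLUSTER_MAX ? nr[l] : SWAP_CLUSTER_MAX;
			nr[l] -= chunk;
			nr_reclaimed += chunk / 2;   /* pretend half the scanned pages are freed */
		}
		/* The early exit added here: don't keep piling up work for one
		 * direct reclaimer once it has already made enough progress. */
		if (nr_reclaimed > SWAP_CLUSTER_MAX && priority < DEF_PRIORITY)
			break;
	}
	printf("reclaimed (modelled): %lu\n", nr_reclaimed);
	return 0;
}
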
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1532 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1533 * satisfy the `incremental min' zone defense algorithm.
1486 * 1534 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1535 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1536 * scan then give up on it.
1491 */ 1537 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1538static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1539 struct scan_control *sc)
1494{ 1540{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1541 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1542 struct zoneref *z;
1498 struct zone *zone; 1543 struct zone *zone;
1499 1544
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1505 * Take care memory controller reclaiming has small influence 1550 * Take care memory controller reclaiming has small influence
1506 * to global LRU. 1551 * to global LRU.
1507 */ 1552 */
1508 if (scan_global_lru(sc)) { 1553 if (scanning_global_lru(sc)) {
1509 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1554 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1510 continue; 1555 continue;
1511 note_zone_scanning_priority(zone, priority); 1556 note_zone_scanning_priority(zone, priority);
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1569 priority);
1525 } 1570 }
1526 1571
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1572 shrink_zone(priority, zone, sc);
1528 } 1573 }
1529
1530 return nr_reclaimed;
1531} 1574}
1532 1575
1533/* 1576/*
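
With nr_reclaimed folded into scan_control, shrink_zone() and shrink_zones() become void: progress accumulates in the control structure that already travels down the call chain, and do_try_to_free_pages() reads the total from there. A toy illustration of that accumulator-in-context pattern; names are invented.

/* Toy version of accumulating progress in a shared control structure
 * instead of summing return values. Illustrative only. */
#include <stdio.h>

struct ctl {
	unsigned long nr_scanned;
	unsigned long nr_reclaimed;
};

static void shrink_one_zone(struct ctl *c, unsigned long zone_pages)
{
	unsigned long scanned = zone_pages / 8;
	c->nr_scanned += scanned;
	c->nr_reclaimed += scanned / 4;   /* pretend a quarter was freed */
}

static void shrink_all_zones(struct ctl *c)
{
	unsigned long zones[] = { 4096, 16384, 65536 };
	for (unsigned int i = 0; i < sizeof(zones) / sizeof(zones[0]); i++)
		shrink_one_zone(c, zones[i]);   /* no return value to collect */
}

int main(void)
{
	struct ctl c = {0};
	shrink_all_zones(&c);
	printf("scanned %lu, reclaimed %lu\n", c.nr_scanned, c.nr_reclaimed);
	return 0;
}
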
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1595 int priority;
1553 unsigned long ret = 0; 1596 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1597 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1598 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1599 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1600 struct zoneref *z;
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1561 1603
1562 delayacct_freepages_start(); 1604 delayacct_freepages_start();
1563 1605
1564 if (scan_global_lru(sc)) 1606 if (scanning_global_lru(sc))
1565 count_vm_event(ALLOCSTALL); 1607 count_vm_event(ALLOCSTALL);
1566 /* 1608 /*
1567 * mem_cgroup will not do shrink_slab. 1609 * mem_cgroup will not do shrink_slab.
1568 */ 1610 */
1569 if (scan_global_lru(sc)) { 1611 if (scanning_global_lru(sc)) {
1570 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1612 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1571 1613
1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1614 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1622 sc->nr_scanned = 0;
1581 if (!priority) 1623 if (!priority)
1582 disable_swap_token(); 1624 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1625 shrink_zones(priority, zonelist, sc);
1584 /* 1626 /*
1585 * Don't shrink slabs when reclaiming memory from 1627 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1628 * over limit cgroups
1587 */ 1629 */
1588 if (scan_global_lru(sc)) { 1630 if (scanning_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1631 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1632 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1633 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1634 reclaim_state->reclaimed_slab = 0;
1593 } 1635 }
1594 } 1636 }
1595 total_scanned += sc->nr_scanned; 1637 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1638 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1639 ret = sc->nr_reclaimed;
1598 goto out; 1640 goto out;
1599 } 1641 }
1600 1642
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1616 congestion_wait(WRITE, HZ/10); 1658 congestion_wait(WRITE, HZ/10);
1617 } 1659 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1660 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1661 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1620 ret = nr_reclaimed; 1662 ret = sc->nr_reclaimed;
1621out: 1663out:
1622 /* 1664 /*
1623 * Now that we've scanned all the zones at this priority level, note 1665 * Now that we've scanned all the zones at this priority level, note
@@ -1629,7 +1671,7 @@ out:
1629 if (priority < 0) 1671 if (priority < 0)
1630 priority = 0; 1672 priority = 0;
1631 1673
1632 if (scan_global_lru(sc)) { 1674 if (scanning_global_lru(sc)) {
1633 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1675 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1634 1676
1635 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1677 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1665#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1707#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1666 1708
1667unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1709unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1668 gfp_t gfp_mask) 1710 gfp_t gfp_mask,
1711 bool noswap,
1712 unsigned int swappiness)
1669{ 1713{
1670 struct scan_control sc = { 1714 struct scan_control sc = {
1671 .may_writepage = !laptop_mode, 1715 .may_writepage = !laptop_mode,
1672 .may_swap = 1, 1716 .may_swap = 1,
1673 .swap_cluster_max = SWAP_CLUSTER_MAX, 1717 .swap_cluster_max = SWAP_CLUSTER_MAX,
1674 .swappiness = vm_swappiness, 1718 .swappiness = swappiness,
1675 .order = 0, 1719 .order = 0,
1676 .mem_cgroup = mem_cont, 1720 .mem_cgroup = mem_cont,
1677 .isolate_pages = mem_cgroup_isolate_pages, 1721 .isolate_pages = mem_cgroup_isolate_pages,
1678 }; 1722 };
1679 struct zonelist *zonelist; 1723 struct zonelist *zonelist;
1680 1724
1725 if (noswap)
1726 sc.may_swap = 0;
1727
1681 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1682 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1683 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 1730 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
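
try_to_free_mem_cgroup_pages() now lets the memory controller pass its own swappiness plus a noswap flag, which simply clears sc.may_swap before reclaim starts. A small sketch of turning those arguments into a reclaim request; the struct below is an invented stand-in for scan_control, not the kernel type.

/* Sketch of building a reclaim request from (noswap, swappiness). */
#include <stdbool.h>
#include <stdio.h>

struct reclaim_request {
	bool may_writepage;
	bool may_swap;
	unsigned long swap_cluster_max;
	unsigned int swappiness;
	int order;
};

static struct reclaim_request make_request(bool noswap, unsigned int swappiness)
{
	struct reclaim_request rq = {
		.may_writepage = true,
		.may_swap = true,             /* default: swapping allowed */
		.swap_cluster_max = 32,
		.swappiness = swappiness,     /* caller-chosen, no longer vm_swappiness */
		.order = 0,
	};
	if (noswap)
		rq.may_swap = false;          /* mirrors "if (noswap) sc.may_swap = 0" */
	return rq;
}

int main(void)
{
	struct reclaim_request rq = make_request(true, 20);
	printf("may_swap=%d swappiness=%u\n", rq.may_swap, rq.swappiness);
	return 0;
}
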
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1759 int priority;
1713 int i; 1760 int i;
1714 unsigned long total_scanned; 1761 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1762 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1763 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1764 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1777
1732loop_again: 1778loop_again:
1733 total_scanned = 0; 1779 total_scanned = 0;
1734 nr_reclaimed = 0; 1780 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1781 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1782 count_vm_event(PAGEOUTRUN);
1737 1783
@@ -1766,7 +1812,7 @@ loop_again:
1766 * Do some background aging of the anon list, to give 1812 * Do some background aging of the anon list, to give
1767 * pages a chance to be referenced before reclaiming. 1813 * pages a chance to be referenced before reclaiming.
1768 */ 1814 */
1769 if (inactive_anon_is_low(zone)) 1815 if (inactive_anon_is_low(zone, &sc))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1816 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0); 1817 &sc, priority, 0);
1772 1818
@@ -1817,11 +1863,11 @@ loop_again:
1817 */ 1863 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1864 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1865 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1866 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1867 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1868 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1869 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1870 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1871 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1872 if (zone_is_all_unreclaimable(zone))
1827 continue; 1873 continue;
@@ -1835,7 +1881,7 @@ loop_again:
1835 * even in laptop mode 1881 * even in laptop mode
1836 */ 1882 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1883 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1884 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1885 sc.may_writepage = 1;
1840 } 1886 }
1841 if (all_zones_ok) 1887 if (all_zones_ok)
@@ -1853,7 +1899,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1899 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1900 * on zone->*_priority.
1855 */ 1901 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1902 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1903 break;
1858 } 1904 }
1859out: 1905out:
@@ -1872,10 +1918,27 @@ out:
1872 1918
1873 try_to_freeze(); 1919 try_to_freeze();
1874 1920
1921 /*
1922 * Fragmentation may mean that the system cannot be
1923 * rebalanced for high-order allocations in all zones.
1924 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1925 * it means the zones have been fully scanned and are still
1926 * not balanced. For high-order allocations, there is
1927 * little point trying all over again as kswapd may
1928 * infinite loop.
1929 *
1930 * Instead, recheck all watermarks at order-0 as they
1931 * are the most important. If watermarks are ok, kswapd will go
1932 * back to sleep. High-order users can still perform direct
1933 * reclaim if they wish.
1934 */
1935 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1936 order = sc.order = 0;
1937
1875 goto loop_again; 1938 goto loop_again;
1876 } 1939 }
1877 1940
1878 return nr_reclaimed; 1941 return sc.nr_reclaimed;
1879} 1942}
1880 1943
1881/* 1944/*
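
The comment block added at the end of balance_pgdat() explains the new fallback: if a full scan for a high-order request freed fewer than SWAP_CLUSTER_MAX pages, kswapd gives up on the high order and rechecks only the order-0 watermarks instead of looping forever. A trimmed-down model of that decision; the reclaim numbers and names are invented.

/* Minimal model of kswapd's "stop chasing high-order pages" fallback. */
#include <stdio.h>

#define SWAP_CLUSTER_MAX 32UL

/* Pretend reclaim pass: a fragmented machine frees very little when
 * asked for high-order (contiguous) pages. Numbers are invented. */
static unsigned long balance_once(int order)
{
	return order > 0 ? 5 : 64;
}

int main(void)
{
	int order = 3;
	unsigned long reclaimed;

	for (;;) {
		reclaimed = balance_once(order);
		if (reclaimed >= SWAP_CLUSTER_MAX)
			break;                 /* watermarks look fine, go back to sleep */
		if (order > 0) {
			/* Fragmentation: give up on the high order and recheck
			 * order-0 only; high-order callers can still fall back
			 * to direct reclaim. */
			order = 0;
			continue;
		}
		break;                         /* order 0 and still short: stop here */
	}
	printf("settled at order %d, last pass reclaimed %lu\n", order, reclaimed);
	return 0;
}
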
@@ -1902,7 +1965,7 @@ static int kswapd(void *p)
1902 }; 1965 };
1903 node_to_cpumask_ptr(cpumask, pgdat->node_id); 1966 node_to_cpumask_ptr(cpumask, pgdat->node_id);
1904 1967
1905 if (!cpus_empty(*cpumask)) 1968 if (!cpumask_empty(cpumask))
1906 set_cpus_allowed_ptr(tsk, cpumask); 1969 set_cpus_allowed_ptr(tsk, cpumask);
1907 current->reclaim_state = &reclaim_state; 1970 current->reclaim_state = &reclaim_state;
1908 1971
@@ -2141,7 +2204,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
2141 pg_data_t *pgdat = NODE_DATA(nid); 2204 pg_data_t *pgdat = NODE_DATA(nid);
2142 node_to_cpumask_ptr(mask, pgdat->node_id); 2205 node_to_cpumask_ptr(mask, pgdat->node_id);
2143 2206
2144 if (any_online_cpu(*mask) < nr_cpu_ids) 2207 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2145 /* One of our CPUs online: restore mask */ 2208 /* One of our CPUs online: restore mask */
2146 set_cpus_allowed_ptr(pgdat->kswapd, mask); 2209 set_cpus_allowed_ptr(pgdat->kswapd, mask);
2147 } 2210 }
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2290 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2229 int priority; 2292 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2293 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2294 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2295 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2322 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2323 do {
2262 note_zone_scanning_priority(zone, priority); 2324 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2325 shrink_zone(priority, zone, &sc);
2264 priority--; 2326 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2327 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2328 }
2267 2329
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2330 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2348 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2349 * reclaimed from this zone.
2288 */ 2350 */
2289 nr_reclaimed += slab_reclaimable - 2351 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2352 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2353 }
2292 2354
2293 p->reclaim_state = NULL; 2355 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2356 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2357 return sc.nr_reclaimed >= nr_pages;
2296} 2358}
2297 2359
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2360int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2393,6 +2455,7 @@ retry:
2393 2455
2394 __dec_zone_state(zone, NR_UNEVICTABLE); 2456 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list); 2457 list_move(&page->lru, &zone->lru[l].list);
2458 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 2459 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED); 2460 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else { 2461 } else {
@@ -2401,6 +2464,7 @@ retry:
2401 */ 2464 */
2402 SetPageUnevictable(page); 2465 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 2466 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2467 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2404 if (page_evictable(page, NULL)) 2468 if (page_evictable(page, NULL))
2405 goto retry; 2469 goto retry;
2406 } 2470 }
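
These hunks keep the per-cgroup lists in step while pages are rescued from, or re-culled onto, the unevictable list, and the retry handles a page that becomes evictable again between the check and the move. A single-threaded sketch of that rescue-or-cull shape; every name is invented and the race recheck cannot actually fire in this model.

/* Sketch of the rescue-or-cull logic for formerly unevictable pages. */
#include <stdbool.h>
#include <stdio.h>

enum lru { LRU_INACTIVE_ANON, LRU_UNEVICTABLE };

struct page_model {
	bool mlocked;          /* stands in for whatever made it unevictable */
	enum lru list;
};

static bool page_evictable(const struct page_model *p)
{
	return !p->mlocked;
}

static void check_move(struct page_model *p)
{
retry:
	if (page_evictable(p)) {
		p->list = LRU_INACTIVE_ANON;   /* rescued; group list updated alongside */
	} else {
		p->list = LRU_UNEVICTABLE;     /* culled again */
		if (page_evictable(p))         /* raced with an munlock? then redo */
			goto retry;            /* (cannot happen in this single-threaded model) */
	}
}

int main(void)
{
	struct page_model p = { .mlocked = false, .list = LRU_UNEVICTABLE };
	check_move(&p);
	printf("rescued=%d\n", p.list == LRU_INACTIVE_ANON);
	return 0;
}
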
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2536 * back onto @zone's unevictable list.
2473 */ 2537 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2538#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2539static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2540{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2541 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2542 unsigned long scan;
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2578 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2579 * evictable.
2516 */ 2580 */
2517void scan_all_zones_unevictable_pages(void) 2581static void scan_all_zones_unevictable_pages(void)
2518{ 2582{
2519 struct zone *zone; 2583 struct zone *zone;
2520 2584
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c3ccfda23adc..91149746bb8d 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -20,7 +20,7 @@
20DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; 20DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
21EXPORT_PER_CPU_SYMBOL(vm_event_states); 21EXPORT_PER_CPU_SYMBOL(vm_event_states);
22 22
23static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask) 23static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
24{ 24{
25 int cpu; 25 int cpu;
26 int i; 26 int i;
@@ -43,7 +43,7 @@ static void sum_vm_events(unsigned long *ret, cpumask_t *cpumask)
43void all_vm_events(unsigned long *ret) 43void all_vm_events(unsigned long *ret)
44{ 44{
45 get_online_cpus(); 45 get_online_cpus();
46 sum_vm_events(ret, &cpu_online_map); 46 sum_vm_events(ret, cpu_online_mask);
47 put_online_cpus(); 47 put_online_cpus();
48} 48}
49EXPORT_SYMBOL_GPL(all_vm_events); 49EXPORT_SYMBOL_GPL(all_vm_events);
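
The vmstat.c hunks switch sum_vm_events() to the const struct cpumask * convention and pass cpu_online_mask directly; the underlying operation is just summing per-CPU event counters over whichever mask is supplied. A user-space analogue, with a plain bitmask standing in for struct cpumask and invented counter values.

/* User-space analogue of sum_vm_events(): add up per-CPU counters for
 * every CPU set in a mask. Illustrative only. */
#include <stdio.h>

#define NR_CPUS    8
#define NR_EVENTS  3

static unsigned long per_cpu_events[NR_CPUS][NR_EVENTS] = {
	{ 10, 2, 0 }, { 7, 1, 3 }, { 0, 0, 0 }, { 5, 5, 5 },
};

static void sum_events(unsigned long *ret, unsigned long online_mask)
{
	for (int i = 0; i < NR_EVENTS; i++)
		ret[i] = 0;
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		if (!(online_mask & (1UL << cpu)))
			continue;               /* skip CPUs not in the mask */
		for (int i = 0; i < NR_EVENTS; i++)
			ret[i] += per_cpu_events[cpu][i];
	}
}

int main(void)
{
	unsigned long totals[NR_EVENTS];
	sum_events(totals, 0x0bUL);             /* CPUs 0, 1 and 3 "online" */
	printf("%lu %lu %lu\n", totals[0], totals[1], totals[2]);
	return 0;
}
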