path: root/mm
author     Ingo Molnar <mingo@elte.hu>  2009-01-10 21:43:52 -0500
committer  Ingo Molnar <mingo@elte.hu>  2009-01-10 21:43:52 -0500
commit     99cd7074891f87c49660e3b2880564324a4733ac (patch)
tree       903d2665bcb445f1f265d1adf7a99f265bcefc15 /mm
parent     e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 (diff)
parent     c59765042f53a79a7a65585042ff463b69cb248c (diff)
Merge commit 'v2.6.29-rc1' into tracing/urgent
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig           |    6
-rw-r--r--  mm/Makefile          |    4
-rw-r--r--  mm/backing-dev.c     |    8
-rw-r--r--  mm/bootmem.c         |    8
-rw-r--r--  mm/filemap.c         |   32
-rw-r--r--  mm/filemap_xip.c     |    2
-rw-r--r--  mm/fremap.c          |    2
-rw-r--r--  mm/hugetlb.c         |   46
-rw-r--r--  mm/internal.h        |    2
-rw-r--r--  mm/memcontrol.c      | 1847
-rw-r--r--  mm/memory.c          |  204
-rw-r--r--  mm/memory_hotplug.c  |   20
-rw-r--r--  mm/migrate.c         |  131
-rw-r--r--  mm/mlock.c           |    9
-rw-r--r--  mm/mmap.c            |   32
-rw-r--r--  mm/mprotect.c        |    6
-rw-r--r--  mm/nommu.c           | 1027
-rw-r--r--  mm/oom_kill.c        |  119
-rw-r--r--  mm/page-writeback.c  |  245
-rw-r--r--  mm/page_alloc.c      |  143
-rw-r--r--  mm/page_cgroup.c     |  209
-rw-r--r--  mm/page_io.c         |    6
-rw-r--r--  mm/rmap.c            |   60
-rw-r--r--  mm/shmem.c           |  102
-rw-r--r--  mm/slub.c            |    2
-rw-r--r--  mm/swap.c            |   77
-rw-r--r--  mm/swap_state.c      |   35
-rw-r--r--  mm/swapfile.c        |  600
-rw-r--r--  mm/tiny-shmem.c      |  134
-rw-r--r--  mm/vmalloc.c         |   50
-rw-r--r--  mm/vmscan.c          |  324
31 files changed, 3801 insertions, 1691 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 5b5790f8a816..a5b77811fdf2 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -181,12 +181,6 @@ config MIGRATION
 	  example on NUMA systems to put pages nearer to the processors accessing
 	  the page.
 
-config RESOURCES_64BIT
-	bool "64 bit Memory and IO resources (EXPERIMENTAL)" if (!64BIT && EXPERIMENTAL)
-	default 64BIT
-	help
-	  This option allows memory and IO resources to be 64 bit.
-
 config PHYS_ADDR_T_64BIT
 	def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
 
diff --git a/mm/Makefile b/mm/Makefile
index 51c27709cc7c..72255be57f89 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -9,7 +9,7 @@ mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
 
 obj-y			:= bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
 			   maccess.o page_alloc.o page-writeback.o pdflush.o \
-			   readahead.o swap.o truncate.o vmscan.o \
+			   readahead.o swap.o truncate.o vmscan.o shmem.o \
 			   prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
 			   page_isolation.o mm_init.o $(mmu-y)
 
@@ -21,9 +21,7 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_SHMEM) += shmem.o
 obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
-obj-$(CONFIG_TINY_SHMEM) += tiny-shmem.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_SLAB) += slab.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 801c08b046e6..8e8587444132 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -24,9 +24,9 @@ static void bdi_debug_init(void)
 static int bdi_debug_stats_show(struct seq_file *m, void *v)
 {
 	struct backing_dev_info *bdi = m->private;
-	long background_thresh;
-	long dirty_thresh;
-	long bdi_thresh;
+	unsigned long background_thresh;
+	unsigned long dirty_thresh;
+	unsigned long bdi_thresh;
 
 	get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
 
@@ -223,7 +223,7 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->max_prop_frac = PROP_FRAC_BASE;
 
 	for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
-		err = percpu_counter_init_irq(&bdi->bdi_stat[i], 0);
+		err = percpu_counter_init(&bdi->bdi_stat[i], 0);
 		if (err)
 			goto err;
 	}
diff --git a/mm/bootmem.c b/mm/bootmem.c
index ac5a891f142a..51a0ccf61e0e 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -435,6 +435,10 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	unsigned long fallback = 0;
 	unsigned long min, max, start, sidx, midx, step;
 
+	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
+		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
+		align, goal, limit);
+
 	BUG_ON(!size);
 	BUG_ON(align & (align - 1));
 	BUG_ON(limit && goal + size > limit);
@@ -442,10 +446,6 @@ static void * __init alloc_bootmem_core(struct bootmem_data *bdata,
 	if (!bdata->node_bootmem_map)
 		return NULL;
 
-	bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
-		bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
-		align, goal, limit);
-
 	min = bdata->node_min_pfn;
 	max = bdata->node_low_pfn;
 
diff --git a/mm/filemap.c b/mm/filemap.c
index f5769b4dc075..ceba0bd03662 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -210,7 +210,7 @@ int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
 	int ret;
 	struct writeback_control wbc = {
 		.sync_mode = sync_mode,
-		.nr_to_write = mapping->nrpages * 2,
+		.nr_to_write = LONG_MAX,
 		.range_start = start,
 		.range_end = end,
 	};
@@ -460,7 +460,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
 	VM_BUG_ON(!PageLocked(page));
 
 	error = mem_cgroup_cache_charge(page, current->mm,
-					gfp_mask & ~__GFP_HIGHMEM);
+					gfp_mask & GFP_RECLAIM_MASK);
 	if (error)
 		goto out;
 
@@ -741,7 +741,14 @@ repeat:
 		page = __page_cache_alloc(gfp_mask);
 		if (!page)
 			return NULL;
-		err = add_to_page_cache_lru(page, mapping, index, gfp_mask);
+		/*
+		 * We want a regular kernel memory (not highmem or DMA etc)
+		 * allocation for the radix tree nodes, but we need to honour
+		 * the context-specific requirements the caller has asked for.
+		 * GFP_RECLAIM_MASK collects those requirements.
+		 */
+		err = add_to_page_cache_lru(page, mapping, index,
+					(gfp_mask & GFP_RECLAIM_MASK));
 		if (unlikely(err)) {
 			page_cache_release(page);
 			page = NULL;
@@ -950,7 +957,7 @@ grab_cache_page_nowait(struct address_space *mapping, pgoff_t index)
 		return NULL;
 	}
 	page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS);
-	if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) {
+	if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) {
 		page_cache_release(page);
 		page = NULL;
 	}
@@ -1317,7 +1324,8 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 			goto out; /* skip atime */
 		size = i_size_read(inode);
 		if (pos < size) {
-			retval = filemap_write_and_wait(mapping);
+			retval = filemap_write_and_wait_range(mapping, pos,
+					pos + iov_length(iov, nr_segs) - 1);
 			if (!retval) {
 				retval = mapping->a_ops->direct_IO(READ, iocb,
 							iov, pos, nr_segs);
@@ -1530,7 +1538,6 @@ retry_find:
 	/*
 	 * Found the page and have a reference on it.
 	 */
-	mark_page_accessed(page);
 	ra->prev_pos = (loff_t)page->index << PAGE_CACHE_SHIFT;
 	vmf->page = page;
 	return ret | VM_FAULT_LOCKED;
@@ -2060,18 +2067,10 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (count != ocount)
 		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
 
-	/*
-	 * Unmap all mmappings of the file up-front.
-	 *
-	 * This will cause any pte dirty bits to be propagated into the
-	 * pageframes for the subsequent filemap_write_and_wait().
-	 */
 	write_len = iov_length(iov, *nr_segs);
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
-	if (mapping_mapped(mapping))
-		unmap_mapping_range(mapping, pos, write_len, 0);
 
-	written = filemap_write_and_wait(mapping);
+	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
 	if (written)
 		goto out;
 
@@ -2291,7 +2290,8 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	 * the file data here, to try to honour O_DIRECT expectations.
 	 */
 	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait(mapping);
+		status = filemap_write_and_wait_range(mapping,
+					pos, pos + written - 1);
 
 	return written ? written : status;
 }
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index b5167dfb2f2d..0c04615651b7 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -193,7 +193,7 @@ retry:
 			/* Nuke the page table entry. */
 			flush_cache_page(vma, address, pte_pfn(*pte));
 			pteval = ptep_clear_flush_notify(vma, address, pte);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			dec_mm_counter(mm, file_rss);
 			BUG_ON(pte_dirty(pteval));
 			pte_unmap_unlock(pte, ptl);
diff --git a/mm/fremap.c b/mm/fremap.c
index 7d12ca70ef7b..62d5bbda921a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -37,7 +37,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
 		if (page) {
 			if (pte_dirty(pte))
 				set_page_dirty(page);
-			page_remove_rmap(page, vma);
+			page_remove_rmap(page);
 			page_cache_release(page);
 			update_hiwater_rss(mm);
 			dec_mm_counter(mm, file_rss);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 6058b53dcb89..618e98304080 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -220,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
 }
 
 /*
+ * Return the size of the pages allocated when backing a VMA. In the majority
+ * cases this will be same size as used by the page table entries.
+ */
+unsigned long vma_kernel_pagesize(struct vm_area_struct *vma)
+{
+	struct hstate *hstate;
+
+	if (!is_vm_hugetlb_page(vma))
+		return PAGE_SIZE;
+
+	hstate = hstate_vma(vma);
+
+	return 1UL << (hstate->order + PAGE_SHIFT);
+}
+
+/*
+ * Return the page size being used by the MMU to back a VMA. In the majority
+ * of cases, the page size used by the kernel matches the MMU size. On
+ * architectures where it differs, an architecture-specific version of this
+ * function is required.
+ */
+#ifndef vma_mmu_pagesize
+unsigned long vma_mmu_pagesize(struct vm_area_struct *vma)
+{
+	return vma_kernel_pagesize(vma);
+}
+#endif
+
+/*
  * Flags for MAP_PRIVATE reservations. These are stored in the bottom
  * bits of the reservation map pointer, which are always clear due to
  * alignment.
@@ -371,8 +400,10 @@ static void clear_huge_page(struct page *page,
 {
 	int i;
 
-	if (unlikely(sz > MAX_ORDER_NR_PAGES))
-		return clear_gigantic_page(page, addr, sz);
+	if (unlikely(sz > MAX_ORDER_NR_PAGES)) {
+		clear_gigantic_page(page, addr, sz);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < sz/PAGE_SIZE; i++) {
@@ -404,8 +435,10 @@ static void copy_huge_page(struct page *dst, struct page *src,
 	int i;
 	struct hstate *h = hstate_vma(vma);
 
-	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES))
-		return copy_gigantic_page(dst, src, addr, vma);
+	if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+		copy_gigantic_page(dst, src, addr, vma);
+		return;
+	}
 
 	might_sleep();
 	for (i = 0; i < pages_per_huge_page(h); i++) {
@@ -972,7 +1005,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
 	return page;
 }
 
-__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
+int __weak alloc_bootmem_huge_page(struct hstate *h)
 {
 	struct huge_bootmem_page *m;
 	int nr_nodes = nodes_weight(node_online_map);
@@ -991,8 +1024,7 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h)
 			 * puts them into the mem_map).
 			 */
 			m = addr;
-			if (m)
-				goto found;
+			goto found;
 		}
 		hstate_next_node(h);
 		nr_nodes--;
diff --git a/mm/internal.h b/mm/internal.h
index 13333bc2eb68..478223b73a2a 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,6 +49,7 @@ extern void putback_lru_page(struct page *page);
 /*
  * in mm/page_alloc.c
  */
+extern unsigned long highest_memmap_pfn;
 extern void __free_pages_bootmem(struct page *page, unsigned int order);
 
 /*
@@ -275,6 +276,7 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn,
 #define GUP_FLAGS_WRITE                  0x1
 #define GUP_FLAGS_FORCE                  0x2
 #define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4
+#define GUP_FLAGS_IGNORE_SIGKILL         0x8
 
 int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		     unsigned long start, int len, int flags,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 866dcc7eeb0c..e2996b80601f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,11 +21,13 @@
21#include <linux/memcontrol.h> 21#include <linux/memcontrol.h>
22#include <linux/cgroup.h> 22#include <linux/cgroup.h>
23#include <linux/mm.h> 23#include <linux/mm.h>
24#include <linux/pagemap.h>
24#include <linux/smp.h> 25#include <linux/smp.h>
25#include <linux/page-flags.h> 26#include <linux/page-flags.h>
26#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
27#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
28#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/mutex.h>
29#include <linux/slab.h> 31#include <linux/slab.h>
30#include <linux/swap.h> 32#include <linux/swap.h>
31#include <linux/spinlock.h> 33#include <linux/spinlock.h>
@@ -34,12 +36,23 @@
34#include <linux/vmalloc.h> 36#include <linux/vmalloc.h>
35#include <linux/mm_inline.h> 37#include <linux/mm_inline.h>
36#include <linux/page_cgroup.h> 38#include <linux/page_cgroup.h>
39#include "internal.h"
37 40
38#include <asm/uaccess.h> 41#include <asm/uaccess.h>
39 42
40struct cgroup_subsys mem_cgroup_subsys __read_mostly; 43struct cgroup_subsys mem_cgroup_subsys __read_mostly;
41#define MEM_CGROUP_RECLAIM_RETRIES 5 44#define MEM_CGROUP_RECLAIM_RETRIES 5
42 45
46#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
47/* Turned on only when memory cgroup is enabled && really_do_swap_account = 0 */
48int do_swap_account __read_mostly;
49static int really_do_swap_account __initdata = 1; /* for remember boot option*/
50#else
51#define do_swap_account (0)
52#endif
53
54static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */
55
43/* 56/*
44 * Statistics for memory cgroup. 57 * Statistics for memory cgroup.
45 */ 58 */
@@ -60,7 +73,7 @@ struct mem_cgroup_stat_cpu {
60} ____cacheline_aligned_in_smp; 73} ____cacheline_aligned_in_smp;
61 74
62struct mem_cgroup_stat { 75struct mem_cgroup_stat {
63 struct mem_cgroup_stat_cpu cpustat[NR_CPUS]; 76 struct mem_cgroup_stat_cpu cpustat[0];
64}; 77};
65 78
66/* 79/*
@@ -89,9 +102,10 @@ struct mem_cgroup_per_zone {
89 /* 102 /*
90 * spin_lock to protect the per cgroup LRU 103 * spin_lock to protect the per cgroup LRU
91 */ 104 */
92 spinlock_t lru_lock;
93 struct list_head lists[NR_LRU_LISTS]; 105 struct list_head lists[NR_LRU_LISTS];
94 unsigned long count[NR_LRU_LISTS]; 106 unsigned long count[NR_LRU_LISTS];
107
108 struct zone_reclaim_stat reclaim_stat;
95}; 109};
96/* Macro for accessing counter */ 110/* Macro for accessing counter */
97#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) 111#define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)])
@@ -122,44 +136,73 @@ struct mem_cgroup {
122 */ 136 */
123 struct res_counter res; 137 struct res_counter res;
124 /* 138 /*
139 * the counter to account for mem+swap usage.
140 */
141 struct res_counter memsw;
142 /*
125 * Per cgroup active and inactive list, similar to the 143 * Per cgroup active and inactive list, similar to the
126 * per zone LRU lists. 144 * per zone LRU lists.
127 */ 145 */
128 struct mem_cgroup_lru_info info; 146 struct mem_cgroup_lru_info info;
129 147
148 /*
149 protect against reclaim related member.
150 */
151 spinlock_t reclaim_param_lock;
152
130 int prev_priority; /* for recording reclaim priority */ 153 int prev_priority; /* for recording reclaim priority */
154
155 /*
156 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex
158 */
159 struct mem_cgroup *last_scanned_child;
131 /* 160 /*
132 * statistics. 161 * Should the accounting and control be hierarchical, per subtree?
162 */
163 bool use_hierarchy;
164 unsigned long last_oom_jiffies;
165 atomic_t refcnt;
166
167 unsigned int swappiness;
168
169 /*
170 * statistics. This must be placed at the end of memcg.
133 */ 171 */
134 struct mem_cgroup_stat stat; 172 struct mem_cgroup_stat stat;
135}; 173};
136static struct mem_cgroup init_mem_cgroup;
137 174
138enum charge_type { 175enum charge_type {
139 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 176 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
140 MEM_CGROUP_CHARGE_TYPE_MAPPED, 177 MEM_CGROUP_CHARGE_TYPE_MAPPED,
141 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ 178 MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */
142 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ 179 MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */
180 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
143 NR_CHARGE_TYPE, 181 NR_CHARGE_TYPE,
144}; 182};
145 183
146/* only for here (for easy reading.) */ 184/* only for here (for easy reading.) */
147#define PCGF_CACHE (1UL << PCG_CACHE) 185#define PCGF_CACHE (1UL << PCG_CACHE)
148#define PCGF_USED (1UL << PCG_USED) 186#define PCGF_USED (1UL << PCG_USED)
149#define PCGF_ACTIVE (1UL << PCG_ACTIVE)
150#define PCGF_LOCK (1UL << PCG_LOCK) 187#define PCGF_LOCK (1UL << PCG_LOCK)
151#define PCGF_FILE (1UL << PCG_FILE)
152static const unsigned long 188static const unsigned long
153pcg_default_flags[NR_CHARGE_TYPE] = { 189pcg_default_flags[NR_CHARGE_TYPE] = {
154 PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ 190 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */
155 PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ 191 PCGF_USED | PCGF_LOCK, /* Anon */
156 PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ 192 PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */
157 0, /* FORCE */ 193 0, /* FORCE */
158}; 194};
159 195
160/* 196/* for encoding cft->private value on file */
161 * Always modified under lru lock. Then, not necessary to preempt_disable() 197#define _MEM (0)
162 */ 198#define _MEMSWAP (1)
199#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
200#define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff)
201#define MEMFILE_ATTR(val) ((val) & 0xffff)
202
203static void mem_cgroup_get(struct mem_cgroup *mem);
204static void mem_cgroup_put(struct mem_cgroup *mem);
205
163static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, 206static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
164 struct page_cgroup *pc, 207 struct page_cgroup *pc,
165 bool charge) 208 bool charge)
@@ -167,10 +210,9 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
167 int val = (charge)? 1 : -1; 210 int val = (charge)? 1 : -1;
168 struct mem_cgroup_stat *stat = &mem->stat; 211 struct mem_cgroup_stat *stat = &mem->stat;
169 struct mem_cgroup_stat_cpu *cpustat; 212 struct mem_cgroup_stat_cpu *cpustat;
213 int cpu = get_cpu();
170 214
171 VM_BUG_ON(!irqs_disabled()); 215 cpustat = &stat->cpustat[cpu];
172
173 cpustat = &stat->cpustat[smp_processor_id()];
174 if (PageCgroupCache(pc)) 216 if (PageCgroupCache(pc))
175 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); 217 __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
176 else 218 else
@@ -182,6 +224,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
182 else 224 else
183 __mem_cgroup_stat_add_safe(cpustat, 225 __mem_cgroup_stat_add_safe(cpustat,
184 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); 226 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
227 put_cpu();
185} 228}
186 229
187static struct mem_cgroup_per_zone * 230static struct mem_cgroup_per_zone *
@@ -197,6 +240,9 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
197 int nid = page_cgroup_nid(pc); 240 int nid = page_cgroup_nid(pc);
198 int zid = page_cgroup_zid(pc); 241 int zid = page_cgroup_zid(pc);
199 242
243 if (!mem)
244 return NULL;
245
200 return mem_cgroup_zoneinfo(mem, nid, zid); 246 return mem_cgroup_zoneinfo(mem, nid, zid);
201} 247}
202 248
@@ -236,77 +282,152 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
236 struct mem_cgroup, css); 282 struct mem_cgroup, css);
237} 283}
238 284
239static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, 285static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
240 struct page_cgroup *pc)
241{ 286{
242 int lru = LRU_BASE; 287 struct mem_cgroup *mem = NULL;
288 /*
289 * Because we have no locks, mm->owner's may be being moved to other
290 * cgroup. We use css_tryget() here even if this looks
291 * pessimistic (rather than adding locks here).
292 */
293 rcu_read_lock();
294 do {
295 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
296 if (unlikely(!mem))
297 break;
298 } while (!css_tryget(&mem->css));
299 rcu_read_unlock();
300 return mem;
301}
243 302
244 if (PageCgroupUnevictable(pc)) 303static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
245 lru = LRU_UNEVICTABLE; 304{
246 else { 305 if (!mem)
247 if (PageCgroupActive(pc)) 306 return true;
248 lru += LRU_ACTIVE; 307 return css_is_removed(&mem->css);
249 if (PageCgroupFile(pc)) 308}
250 lru += LRU_FILE;
251 }
252 309
253 MEM_CGROUP_ZSTAT(mz, lru) -= 1; 310/*
311 * Following LRU functions are allowed to be used without PCG_LOCK.
312 * Operations are called by routine of global LRU independently from memcg.
313 * What we have to take care of here is validness of pc->mem_cgroup.
314 *
315 * Changes to pc->mem_cgroup happens when
316 * 1. charge
317 * 2. moving account
318 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
319 * It is added to LRU before charge.
320 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
321 * When moving account, the page is not on LRU. It's isolated.
322 */
254 323
255 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); 324void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru)
256 list_del(&pc->lru); 325{
326 struct page_cgroup *pc;
327 struct mem_cgroup *mem;
328 struct mem_cgroup_per_zone *mz;
329
330 if (mem_cgroup_disabled())
331 return;
332 pc = lookup_page_cgroup(page);
333 /* can happen while we handle swapcache. */
334 if (list_empty(&pc->lru) || !pc->mem_cgroup)
335 return;
336 /*
337 * We don't check PCG_USED bit. It's cleared when the "page" is finally
338 * removed from global LRU.
339 */
340 mz = page_cgroup_zoneinfo(pc);
341 mem = pc->mem_cgroup;
342 MEM_CGROUP_ZSTAT(mz, lru) -= 1;
343 list_del_init(&pc->lru);
344 return;
257} 345}
258 346
259static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, 347void mem_cgroup_del_lru(struct page *page)
260 struct page_cgroup *pc)
261{ 348{
262 int lru = LRU_BASE; 349 mem_cgroup_del_lru_list(page, page_lru(page));
350}
263 351
264 if (PageCgroupUnevictable(pc)) 352void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru)
265 lru = LRU_UNEVICTABLE; 353{
266 else { 354 struct mem_cgroup_per_zone *mz;
267 if (PageCgroupActive(pc)) 355 struct page_cgroup *pc;
268 lru += LRU_ACTIVE;
269 if (PageCgroupFile(pc))
270 lru += LRU_FILE;
271 }
272 356
273 MEM_CGROUP_ZSTAT(mz, lru) += 1; 357 if (mem_cgroup_disabled())
274 list_add(&pc->lru, &mz->lists[lru]); 358 return;
275 359
276 mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); 360 pc = lookup_page_cgroup(page);
361 smp_rmb();
362 /* unused page is not rotated. */
363 if (!PageCgroupUsed(pc))
364 return;
365 mz = page_cgroup_zoneinfo(pc);
366 list_move(&pc->lru, &mz->lists[lru]);
277} 367}
278 368
279static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) 369void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru)
280{ 370{
281 struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); 371 struct page_cgroup *pc;
282 int active = PageCgroupActive(pc); 372 struct mem_cgroup_per_zone *mz;
283 int file = PageCgroupFile(pc);
284 int unevictable = PageCgroupUnevictable(pc);
285 enum lru_list from = unevictable ? LRU_UNEVICTABLE :
286 (LRU_FILE * !!file + !!active);
287 373
288 if (lru == from) 374 if (mem_cgroup_disabled())
375 return;
376 pc = lookup_page_cgroup(page);
377 /* barrier to sync with "charge" */
378 smp_rmb();
379 if (!PageCgroupUsed(pc))
289 return; 380 return;
290 381
291 MEM_CGROUP_ZSTAT(mz, from) -= 1; 382 mz = page_cgroup_zoneinfo(pc);
383 MEM_CGROUP_ZSTAT(mz, lru) += 1;
384 list_add(&pc->lru, &mz->lists[lru]);
385}
386
387/*
388 * At handling SwapCache, pc->mem_cgroup may be changed while it's linked to
389 * lru because the page may.be reused after it's fully uncharged (because of
390 * SwapCache behavior).To handle that, unlink page_cgroup from LRU when charge
391 * it again. This function is only used to charge SwapCache. It's done under
392 * lock_page and expected that zone->lru_lock is never held.
393 */
394static void mem_cgroup_lru_del_before_commit_swapcache(struct page *page)
395{
396 unsigned long flags;
397 struct zone *zone = page_zone(page);
398 struct page_cgroup *pc = lookup_page_cgroup(page);
399
400 spin_lock_irqsave(&zone->lru_lock, flags);
292 /* 401 /*
293 * However this is done under mz->lru_lock, another flags, which 402 * Forget old LRU when this page_cgroup is *not* used. This Used bit
294 * are not related to LRU, will be modified from out-of-lock. 403 * is guarded by lock_page() because the page is SwapCache.
295 * We have to use atomic set/clear flags.
296 */ 404 */
297 if (is_unevictable_lru(lru)) { 405 if (!PageCgroupUsed(pc))
298 ClearPageCgroupActive(pc); 406 mem_cgroup_del_lru_list(page, page_lru(page));
299 SetPageCgroupUnevictable(pc); 407 spin_unlock_irqrestore(&zone->lru_lock, flags);
300 } else { 408}
301 if (is_active_lru(lru))
302 SetPageCgroupActive(pc);
303 else
304 ClearPageCgroupActive(pc);
305 ClearPageCgroupUnevictable(pc);
306 }
307 409
308 MEM_CGROUP_ZSTAT(mz, lru) += 1; 410static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page)
309 list_move(&pc->lru, &mz->lists[lru]); 411{
412 unsigned long flags;
413 struct zone *zone = page_zone(page);
414 struct page_cgroup *pc = lookup_page_cgroup(page);
415
416 spin_lock_irqsave(&zone->lru_lock, flags);
417 /* link when the page is linked to LRU but page_cgroup isn't */
418 if (PageLRU(page) && list_empty(&pc->lru))
419 mem_cgroup_add_lru_list(page, page_lru(page));
420 spin_unlock_irqrestore(&zone->lru_lock, flags);
421}
422
423
424void mem_cgroup_move_lists(struct page *page,
425 enum lru_list from, enum lru_list to)
426{
427 if (mem_cgroup_disabled())
428 return;
429 mem_cgroup_del_lru_list(page, from);
430 mem_cgroup_add_lru_list(page, to);
310} 431}
311 432
312int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 433int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
@@ -320,37 +441,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
320} 441}
321 442
322/* 443/*
323 * This routine assumes that the appropriate zone's lru lock is already held
324 */
325void mem_cgroup_move_lists(struct page *page, enum lru_list lru)
326{
327 struct page_cgroup *pc;
328 struct mem_cgroup_per_zone *mz;
329 unsigned long flags;
330
331 if (mem_cgroup_subsys.disabled)
332 return;
333
334 /*
335 * We cannot lock_page_cgroup while holding zone's lru_lock,
336 * because other holders of lock_page_cgroup can be interrupted
337 * with an attempt to rotate_reclaimable_page. But we cannot
338 * safely get to page_cgroup without it, so just try_lock it:
339 * mem_cgroup_isolate_pages allows for page left on wrong list.
340 */
341 pc = lookup_page_cgroup(page);
342 if (!trylock_page_cgroup(pc))
343 return;
344 if (pc && PageCgroupUsed(pc)) {
345 mz = page_cgroup_zoneinfo(pc);
346 spin_lock_irqsave(&mz->lru_lock, flags);
347 __mem_cgroup_move_lists(pc, lru);
348 spin_unlock_irqrestore(&mz->lru_lock, flags);
349 }
350 unlock_page_cgroup(pc);
351}
352
353/*
354 * Calculate mapped_ratio under memory controller. This will be used in 444 * Calculate mapped_ratio under memory controller. This will be used in
355 * vmscan.c for deteremining we have to reclaim mapped pages. 445 * vmscan.c for deteremining we have to reclaim mapped pages.
356 */ 446 */
@@ -372,39 +462,108 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
372 */ 462 */
373int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 463int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
374{ 464{
375 return mem->prev_priority; 465 int prev_priority;
466
467 spin_lock(&mem->reclaim_param_lock);
468 prev_priority = mem->prev_priority;
469 spin_unlock(&mem->reclaim_param_lock);
470
471 return prev_priority;
376} 472}
377 473
378void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) 474void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority)
379{ 475{
476 spin_lock(&mem->reclaim_param_lock);
380 if (priority < mem->prev_priority) 477 if (priority < mem->prev_priority)
381 mem->prev_priority = priority; 478 mem->prev_priority = priority;
479 spin_unlock(&mem->reclaim_param_lock);
382} 480}
383 481
384void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) 482void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority)
385{ 483{
484 spin_lock(&mem->reclaim_param_lock);
386 mem->prev_priority = priority; 485 mem->prev_priority = priority;
486 spin_unlock(&mem->reclaim_param_lock);
387} 487}
388 488
389/* 489static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages)
390 * Calculate # of pages to be scanned in this priority/zone. 490{
391 * See also vmscan.c 491 unsigned long active;
392 * 492 unsigned long inactive;
393 * priority starts from "DEF_PRIORITY" and decremented in each loop. 493 unsigned long gb;
394 * (see include/linux/mmzone.h) 494 unsigned long inactive_ratio;
395 */ 495
496 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON);
497 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON);
498
499 gb = (inactive + active) >> (30 - PAGE_SHIFT);
500 if (gb)
501 inactive_ratio = int_sqrt(10 * gb);
502 else
503 inactive_ratio = 1;
504
505 if (present_pages) {
506 present_pages[0] = inactive;
507 present_pages[1] = active;
508 }
509
510 return inactive_ratio;
511}
512
513int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg)
514{
515 unsigned long active;
516 unsigned long inactive;
517 unsigned long present_pages[2];
518 unsigned long inactive_ratio;
396 519
397long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, 520 inactive_ratio = calc_inactive_ratio(memcg, present_pages);
398 int priority, enum lru_list lru) 521
522 inactive = present_pages[0];
523 active = present_pages[1];
524
525 if (inactive * inactive_ratio < active)
526 return 1;
527
528 return 0;
529}
530
531unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
532 struct zone *zone,
533 enum lru_list lru)
399{ 534{
400 long nr_pages;
401 int nid = zone->zone_pgdat->node_id; 535 int nid = zone->zone_pgdat->node_id;
402 int zid = zone_idx(zone); 536 int zid = zone_idx(zone);
403 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 537 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
404 538
405 nr_pages = MEM_CGROUP_ZSTAT(mz, lru); 539 return MEM_CGROUP_ZSTAT(mz, lru);
540}
406 541
407 return (nr_pages >> priority); 542struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
543 struct zone *zone)
544{
545 int nid = zone->zone_pgdat->node_id;
546 int zid = zone_idx(zone);
547 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
548
549 return &mz->reclaim_stat;
550}
551
552struct zone_reclaim_stat *
553mem_cgroup_get_reclaim_stat_from_page(struct page *page)
554{
555 struct page_cgroup *pc;
556 struct mem_cgroup_per_zone *mz;
557
558 if (mem_cgroup_disabled())
559 return NULL;
560
561 pc = lookup_page_cgroup(page);
562 mz = page_cgroup_zoneinfo(pc);
563 if (!mz)
564 return NULL;
565
566 return &mz->reclaim_stat;
408} 567}
409 568
410unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, 569unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
@@ -429,95 +588,281 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
429 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); 588 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
430 src = &mz->lists[lru]; 589 src = &mz->lists[lru];
431 590
432 spin_lock(&mz->lru_lock);
433 scan = 0; 591 scan = 0;
434 list_for_each_entry_safe_reverse(pc, tmp, src, lru) { 592 list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
435 if (scan >= nr_to_scan) 593 if (scan >= nr_to_scan)
436 break; 594 break;
595
596 page = pc->page;
437 if (unlikely(!PageCgroupUsed(pc))) 597 if (unlikely(!PageCgroupUsed(pc)))
438 continue; 598 continue;
439 page = pc->page;
440
441 if (unlikely(!PageLRU(page))) 599 if (unlikely(!PageLRU(page)))
442 continue; 600 continue;
443 601
444 /*
445 * TODO: play better with lumpy reclaim, grabbing anything.
446 */
447 if (PageUnevictable(page) ||
448 (PageActive(page) && !active) ||
449 (!PageActive(page) && active)) {
450 __mem_cgroup_move_lists(pc, page_lru(page));
451 continue;
452 }
453
454 scan++; 602 scan++;
455 list_move(&pc->lru, &pc_list);
456
457 if (__isolate_lru_page(page, mode, file) == 0) { 603 if (__isolate_lru_page(page, mode, file) == 0) {
458 list_move(&page->lru, dst); 604 list_move(&page->lru, dst);
459 nr_taken++; 605 nr_taken++;
460 } 606 }
461 } 607 }
462 608
463 list_splice(&pc_list, src);
464 spin_unlock(&mz->lru_lock);
465
466 *scanned = scan; 609 *scanned = scan;
467 return nr_taken; 610 return nr_taken;
468} 611}
469 612
613#define mem_cgroup_from_res_counter(counter, member) \
614 container_of(counter, struct mem_cgroup, member)
615
470/* 616/*
471 * Charge the memory controller for page usage. 617 * This routine finds the DFS walk successor. This routine should be
472 * Return 618 * called with hierarchy_mutex held
473 * 0 if the charge was successful
474 * < 0 if the cgroup is over its limit
475 */ 619 */
476static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 620static struct mem_cgroup *
477 gfp_t gfp_mask, enum charge_type ctype, 621mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
478 struct mem_cgroup *memcg)
479{ 622{
623 struct cgroup *cgroup, *curr_cgroup, *root_cgroup;
624
625 curr_cgroup = curr->css.cgroup;
626 root_cgroup = root_mem->css.cgroup;
627
628 if (!list_empty(&curr_cgroup->children)) {
629 /*
630 * Walk down to children
631 */
632 mem_cgroup_put(curr);
633 cgroup = list_entry(curr_cgroup->children.next,
634 struct cgroup, sibling);
635 curr = mem_cgroup_from_cont(cgroup);
636 mem_cgroup_get(curr);
637 goto done;
638 }
639
640visit_parent:
641 if (curr_cgroup == root_cgroup) {
642 mem_cgroup_put(curr);
643 curr = root_mem;
644 mem_cgroup_get(curr);
645 goto done;
646 }
647
648 /*
649 * Goto next sibling
650 */
651 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
652 mem_cgroup_put(curr);
653 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
654 sibling);
655 curr = mem_cgroup_from_cont(cgroup);
656 mem_cgroup_get(curr);
657 goto done;
658 }
659
660 /*
661 * Go up to next parent and next parent's sibling if need be
662 */
663 curr_cgroup = curr_cgroup->parent;
664 goto visit_parent;
665
666done:
667 root_mem->last_scanned_child = curr;
668 return curr;
669}
670
671/*
672 * Visit the first child (need not be the first child as per the ordering
673 * of the cgroup list, since we track last_scanned_child) of @mem and use
674 * that to reclaim free pages from.
675 */
676static struct mem_cgroup *
677mem_cgroup_get_first_node(struct mem_cgroup *root_mem)
678{
679 struct cgroup *cgroup;
680 struct mem_cgroup *ret;
681 bool obsolete;
682
683 obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child);
684
685 /*
686 * Scan all children under the mem_cgroup mem
687 */
688 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
689 if (list_empty(&root_mem->css.cgroup->children)) {
690 ret = root_mem;
691 goto done;
692 }
693
694 if (!root_mem->last_scanned_child || obsolete) {
695
696 if (obsolete && root_mem->last_scanned_child)
697 mem_cgroup_put(root_mem->last_scanned_child);
698
699 cgroup = list_first_entry(&root_mem->css.cgroup->children,
700 struct cgroup, sibling);
701 ret = mem_cgroup_from_cont(cgroup);
702 mem_cgroup_get(ret);
703 } else
704 ret = mem_cgroup_get_next_node(root_mem->last_scanned_child,
705 root_mem);
706
707done:
708 root_mem->last_scanned_child = ret;
709 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
710 return ret;
711}
712
713static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
714{
715 if (do_swap_account) {
716 if (res_counter_check_under_limit(&mem->res) &&
717 res_counter_check_under_limit(&mem->memsw))
718 return true;
719 } else
720 if (res_counter_check_under_limit(&mem->res))
721 return true;
722 return false;
723}
724
725static unsigned int get_swappiness(struct mem_cgroup *memcg)
726{
727 struct cgroup *cgrp = memcg->css.cgroup;
728 unsigned int swappiness;
729
730 /* root ? */
731 if (cgrp->parent == NULL)
732 return vm_swappiness;
733
734 spin_lock(&memcg->reclaim_param_lock);
735 swappiness = memcg->swappiness;
736 spin_unlock(&memcg->reclaim_param_lock);
737
738 return swappiness;
739}
740
741/*
742 * Dance down the hierarchy if needed to reclaim memory. We remember the
743 * last child we reclaimed from, so that we don't end up penalizing
744 * one child extensively based on its position in the children list.
745 *
746 * root_mem is the original ancestor that we've been reclaim from.
747 */
748static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
749 gfp_t gfp_mask, bool noswap)
750{
751 struct mem_cgroup *next_mem;
752 int ret = 0;
753
754 /*
755 * Reclaim unconditionally and don't check for return value.
756 * We need to reclaim in the current group and down the tree.
757 * One might think about checking for children before reclaiming,
758 * but there might be left over accounting, even after children
759 * have left.
760 */
761 ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap,
762 get_swappiness(root_mem));
763 if (mem_cgroup_check_under_limit(root_mem))
764 return 0;
765 if (!root_mem->use_hierarchy)
766 return ret;
767
768 next_mem = mem_cgroup_get_first_node(root_mem);
769
770 while (next_mem != root_mem) {
771 if (mem_cgroup_is_obsolete(next_mem)) {
772 mem_cgroup_put(next_mem);
773 next_mem = mem_cgroup_get_first_node(root_mem);
774 continue;
775 }
776 ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap,
777 get_swappiness(next_mem));
778 if (mem_cgroup_check_under_limit(root_mem))
779 return 0;
780 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex);
781 next_mem = mem_cgroup_get_next_node(next_mem, root_mem);
782 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex);
783 }
784 return ret;
785}
786
787bool mem_cgroup_oom_called(struct task_struct *task)
788{
789 bool ret = false;
480 struct mem_cgroup *mem; 790 struct mem_cgroup *mem;
481 struct page_cgroup *pc; 791 struct mm_struct *mm;
482 unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
483 struct mem_cgroup_per_zone *mz;
484 unsigned long flags;
485 792
486 pc = lookup_page_cgroup(page); 793 rcu_read_lock();
487 /* can happen at boot */ 794 mm = task->mm;
488 if (unlikely(!pc)) 795 if (!mm)
796 mm = &init_mm;
797 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
798 if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
799 ret = true;
800 rcu_read_unlock();
801 return ret;
802}
803/*
804 * Unlike exported interface, "oom" parameter is added. if oom==true,
805 * oom-killer can be invoked.
806 */
807static int __mem_cgroup_try_charge(struct mm_struct *mm,
808 gfp_t gfp_mask, struct mem_cgroup **memcg,
809 bool oom)
810{
811 struct mem_cgroup *mem, *mem_over_limit;
812 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
813 struct res_counter *fail_res;
814
815 if (unlikely(test_thread_flag(TIF_MEMDIE))) {
816 /* Don't account this! */
817 *memcg = NULL;
489 return 0; 818 return 0;
490 prefetchw(pc); 819 }
820
491 /* 821 /*
492 * We always charge the cgroup the mm_struct belongs to. 822 * We always charge the cgroup the mm_struct belongs to.
493 * The mm_struct's mem_cgroup changes on task migration if the 823 * The mm_struct's mem_cgroup changes on task migration if the
494 * thread group leader migrates. It's possible that mm is not 824 * thread group leader migrates. It's possible that mm is not
495 * set, if so charge the init_mm (happens for pagecache usage). 825 * set, if so charge the init_mm (happens for pagecache usage).
496 */ 826 */
497 827 mem = *memcg;
498 if (likely(!memcg)) { 828 if (likely(!mem)) {
499 rcu_read_lock(); 829 mem = try_get_mem_cgroup_from_mm(mm);
500 mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); 830 *memcg = mem;
501 if (unlikely(!mem)) {
502 rcu_read_unlock();
503 return 0;
504 }
505 /*
506 * For every charge from the cgroup, increment reference count
507 */
508 css_get(&mem->css);
509 rcu_read_unlock();
510 } else { 831 } else {
511 mem = memcg; 832 css_get(&mem->css);
512 css_get(&memcg->css);
513 } 833 }
834 if (unlikely(!mem))
835 return 0;
836
837 VM_BUG_ON(mem_cgroup_is_obsolete(mem));
838
839 while (1) {
840 int ret;
841 bool noswap = false;
842
843 ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
844 if (likely(!ret)) {
845 if (!do_swap_account)
846 break;
847 ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
848 &fail_res);
849 if (likely(!ret))
850 break;
851 /* mem+swap counter fails */
852 res_counter_uncharge(&mem->res, PAGE_SIZE);
853 noswap = true;
854 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
855 memsw);
856 } else
857 /* mem counter fails */
858 mem_over_limit = mem_cgroup_from_res_counter(fail_res,
859 res);
514 860
515 while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) {
516 if (!(gfp_mask & __GFP_WAIT)) 861 if (!(gfp_mask & __GFP_WAIT))
517 goto out; 862 goto nomem;
518 863
519 if (try_to_free_mem_cgroup_pages(mem, gfp_mask)) 864 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
520 continue; 865 noswap);
521 866
522 /* 867 /*
523 * try_to_free_mem_cgroup_pages() might not give us a full 868 * try_to_free_mem_cgroup_pages() might not give us a full
@@ -525,49 +870,214 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
525 * moved to swap cache or just unmapped from the cgroup. 870 * moved to swap cache or just unmapped from the cgroup.
526 * Check the limit again to see if the reclaim reduced the 871 * Check the limit again to see if the reclaim reduced the
527 * current usage of the cgroup before giving up 872 * current usage of the cgroup before giving up
873 *
528 */ 874 */
529 if (res_counter_check_under_limit(&mem->res)) 875 if (mem_cgroup_check_under_limit(mem_over_limit))
530 continue; 876 continue;
531 877
532 if (!nr_retries--) { 878 if (!nr_retries--) {
533 mem_cgroup_out_of_memory(mem, gfp_mask); 879 if (oom) {
534 goto out; 880 mutex_lock(&memcg_tasklist);
881 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
882 mutex_unlock(&memcg_tasklist);
883 mem_over_limit->last_oom_jiffies = jiffies;
884 }
885 goto nomem;
535 } 886 }
536 } 887 }
888 return 0;
889nomem:
890 css_put(&mem->css);
891 return -ENOMEM;
892}
537 893
894static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
895{
896 struct mem_cgroup *mem;
897 swp_entry_t ent;
898
899 if (!PageSwapCache(page))
900 return NULL;
901
902 ent.val = page_private(page);
903 mem = lookup_swap_cgroup(ent);
904 if (!mem)
905 return NULL;
906 if (!css_tryget(&mem->css))
907 return NULL;
908 return mem;
909}
910
911/*
912 * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be
913 * USED state. If already USED, uncharge and return.
914 */
915
916static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
917 struct page_cgroup *pc,
918 enum charge_type ctype)
919{
920 /* try_charge() can return NULL to *memcg, taking care of it. */
921 if (!mem)
922 return;
538 923
539 lock_page_cgroup(pc); 924 lock_page_cgroup(pc);
540 if (unlikely(PageCgroupUsed(pc))) { 925 if (unlikely(PageCgroupUsed(pc))) {
541 unlock_page_cgroup(pc); 926 unlock_page_cgroup(pc);
542 res_counter_uncharge(&mem->res, PAGE_SIZE); 927 res_counter_uncharge(&mem->res, PAGE_SIZE);
928 if (do_swap_account)
929 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
543 css_put(&mem->css); 930 css_put(&mem->css);
544 931 return;
545 goto done;
546 } 932 }
547 pc->mem_cgroup = mem; 933 pc->mem_cgroup = mem;
548 /* 934 smp_wmb();
549 * If a page is accounted as a page cache, insert to inactive list.
550 * If anon, insert to active list.
551 */
552 pc->flags = pcg_default_flags[ctype]; 935 pc->flags = pcg_default_flags[ctype];
553 936
554 mz = page_cgroup_zoneinfo(pc); 937 mem_cgroup_charge_statistics(mem, pc, true);
555 938
556 spin_lock_irqsave(&mz->lru_lock, flags);
557 __mem_cgroup_add_list(mz, pc);
558 spin_unlock_irqrestore(&mz->lru_lock, flags);
559 unlock_page_cgroup(pc); 939 unlock_page_cgroup(pc);
940}
560 941
561done: 942/**
562 return 0; 943 * mem_cgroup_move_account - move account of the page
944 * @pc: page_cgroup of the page.
945 * @from: mem_cgroup which the page is moved from.
946 * @to: mem_cgroup which the page is moved to. @from != @to.
947 *
948 * The caller must confirm following.
949 * - page is not on LRU (isolate_page() is useful.)
950 *
951 * returns 0 at success,
952 * returns -EBUSY when lock is busy or "pc" is unstable.
953 *
954 * This function does "uncharge" from old cgroup but doesn't do "charge" to
955 * new cgroup. It should be done by a caller.
956 */
957
958static int mem_cgroup_move_account(struct page_cgroup *pc,
959 struct mem_cgroup *from, struct mem_cgroup *to)
960{
961 struct mem_cgroup_per_zone *from_mz, *to_mz;
962 int nid, zid;
963 int ret = -EBUSY;
964
965 VM_BUG_ON(from == to);
966 VM_BUG_ON(PageLRU(pc->page));
967
968 nid = page_cgroup_nid(pc);
969 zid = page_cgroup_zid(pc);
970 from_mz = mem_cgroup_zoneinfo(from, nid, zid);
971 to_mz = mem_cgroup_zoneinfo(to, nid, zid);
972
973 if (!trylock_page_cgroup(pc))
974 return ret;
975
976 if (!PageCgroupUsed(pc))
977 goto out;
978
979 if (pc->mem_cgroup != from)
980 goto out;
981
982 css_put(&from->css);
983 res_counter_uncharge(&from->res, PAGE_SIZE);
984 mem_cgroup_charge_statistics(from, pc, false);
985 if (do_swap_account)
986 res_counter_uncharge(&from->memsw, PAGE_SIZE);
987 pc->mem_cgroup = to;
988 mem_cgroup_charge_statistics(to, pc, true);
989 css_get(&to->css);
990 ret = 0;
563out: 991out:
564 css_put(&mem->css); 992 unlock_page_cgroup(pc);
565 return -ENOMEM; 993 return ret;
994}
995
996/*
997 * move charges to its parent.
998 */
999
1000static int mem_cgroup_move_parent(struct page_cgroup *pc,
1001 struct mem_cgroup *child,
1002 gfp_t gfp_mask)
1003{
1004 struct page *page = pc->page;
1005 struct cgroup *cg = child->css.cgroup;
1006 struct cgroup *pcg = cg->parent;
1007 struct mem_cgroup *parent;
1008 int ret;
1009
1010 /* Is ROOT ? */
1011 if (!pcg)
1012 return -EINVAL;
1013
1014
1015 parent = mem_cgroup_from_cont(pcg);
1016
1017
1018 ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
1019 if (ret || !parent)
1020 return ret;
1021
1022 if (!get_page_unless_zero(page))
1023 return -EBUSY;
1024
1025 ret = isolate_lru_page(page);
1026
1027 if (ret)
1028 goto cancel;
1029
1030 ret = mem_cgroup_move_account(pc, child, parent);
1031
1032 /* drop extra refcnt by try_charge() (move_account increment one) */
1033 css_put(&parent->css);
1034 putback_lru_page(page);
1035 if (!ret) {
1036 put_page(page);
1037 return 0;
1038 }
1039 /* uncharge if move fails */
1040cancel:
1041 res_counter_uncharge(&parent->res, PAGE_SIZE);
1042 if (do_swap_account)
1043 res_counter_uncharge(&parent->memsw, PAGE_SIZE);
1044 put_page(page);
1045 return ret;
1046}
1047
1048/*
1049 * Charge the memory controller for page usage.
1050 * Return
1051 * 0 if the charge was successful
1052 * < 0 if the cgroup is over its limit
1053 */
1054static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
1055 gfp_t gfp_mask, enum charge_type ctype,
1056 struct mem_cgroup *memcg)
1057{
1058 struct mem_cgroup *mem;
1059 struct page_cgroup *pc;
1060 int ret;
1061
1062 pc = lookup_page_cgroup(page);
1063 /* can happen at boot */
1064 if (unlikely(!pc))
1065 return 0;
1066 prefetchw(pc);
1067
1068 mem = memcg;
1069 ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
1070 if (ret || !mem)
1071 return ret;
1072
1073 __mem_cgroup_commit_charge(mem, pc, ctype);
1074 return 0;
566} 1075}
567 1076
568int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) 1077int mem_cgroup_newpage_charge(struct page *page,
1078 struct mm_struct *mm, gfp_t gfp_mask)
569{ 1079{
570 if (mem_cgroup_subsys.disabled) 1080 if (mem_cgroup_disabled())
571 return 0; 1081 return 0;
572 if (PageCompound(page)) 1082 if (PageCompound(page))
573 return 0; 1083 return 0;
@@ -589,7 +1099,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
589int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1099int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
590 gfp_t gfp_mask) 1100 gfp_t gfp_mask)
591{ 1101{
592 if (mem_cgroup_subsys.disabled) 1102 struct mem_cgroup *mem = NULL;
1103 int ret;
1104
1105 if (mem_cgroup_disabled())
593 return 0; 1106 return 0;
594 if (PageCompound(page)) 1107 if (PageCompound(page))
595 return 0; 1108 return 0;
@@ -601,6 +1114,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
601 * For GFP_NOWAIT case, the page may be pre-charged before calling 1114 * For GFP_NOWAIT case, the page may be pre-charged before calling
602 * add_to_page_cache(). (See shmem.c) check it here and avoid to call 1115 * add_to_page_cache(). (See shmem.c) check it here and avoid to call
603 * charge twice. (It works but has to pay a bit larger cost.) 1116 * charge twice. (It works but has to pay a bit larger cost.)
1117 * And when the page is SwapCache, it should take swap information
1118 * into account. This is under lock_page() now.
604 */ 1119 */
605 if (!(gfp_mask & __GFP_WAIT)) { 1120 if (!(gfp_mask & __GFP_WAIT)) {
606 struct page_cgroup *pc; 1121 struct page_cgroup *pc;
@@ -617,58 +1132,198 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
617 unlock_page_cgroup(pc); 1132 unlock_page_cgroup(pc);
618 } 1133 }
619 1134
620 if (unlikely(!mm)) 1135 if (do_swap_account && PageSwapCache(page)) {
1136 mem = try_get_mem_cgroup_from_swapcache(page);
1137 if (mem)
1138 mm = NULL;
1139 else
1140 mem = NULL;
1141 /* SwapCache may be still linked to LRU now. */
1142 mem_cgroup_lru_del_before_commit_swapcache(page);
1143 }
1144
1145 if (unlikely(!mm && !mem))
621 mm = &init_mm; 1146 mm = &init_mm;
622 1147
623 if (page_is_file_cache(page)) 1148 if (page_is_file_cache(page))
624 return mem_cgroup_charge_common(page, mm, gfp_mask, 1149 return mem_cgroup_charge_common(page, mm, gfp_mask,
625 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1150 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
626 else 1151
627 return mem_cgroup_charge_common(page, mm, gfp_mask, 1152 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
628 MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); 1153 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1154 if (mem)
1155 css_put(&mem->css);
1156 if (PageSwapCache(page))
1157 mem_cgroup_lru_add_after_commit_swapcache(page);
1158
1159 if (do_swap_account && !ret && PageSwapCache(page)) {
1160 swp_entry_t ent = {.val = page_private(page)};
1161 /* avoid double counting */
1162 mem = swap_cgroup_record(ent, NULL);
1163 if (mem) {
1164 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1165 mem_cgroup_put(mem);
1166 }
1167 }
1168 return ret;
1169}
1170
1171/*
1172 * While swap-in, try_charge -> commit or cancel, the page is locked.
1173 * And when try_charge() successfully returns, one refcnt to memcg without
1174 * struct page_cgroup is aquired. This refcnt will be cumsumed by
1175 * "commit()" or removed by "cancel()"
1176 */
1177int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
1178 struct page *page,
1179 gfp_t mask, struct mem_cgroup **ptr)
1180{
1181 struct mem_cgroup *mem;
1182 int ret;
1183
1184 if (mem_cgroup_disabled())
1185 return 0;
1186
1187 if (!do_swap_account)
1188 goto charge_cur_mm;
1189 /*
1190 * A racing thread's fault, or swapoff, may have already updated
1191 * the pte, and even removed page from swap cache: return success
1192 * to go on to do_swap_page()'s pte_same() test, which should fail.
1193 */
1194 if (!PageSwapCache(page))
1195 return 0;
1196 mem = try_get_mem_cgroup_from_swapcache(page);
1197 if (!mem)
1198 goto charge_cur_mm;
1199 *ptr = mem;
1200 ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
1201 /* drop extra refcnt from tryget */
1202 css_put(&mem->css);
1203 return ret;
1204charge_cur_mm:
1205 if (unlikely(!mm))
1206 mm = &init_mm;
1207 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1208}
1209
1210void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1211{
1212 struct page_cgroup *pc;
1213
1214 if (mem_cgroup_disabled())
1215 return;
1216 if (!ptr)
1217 return;
1218 pc = lookup_page_cgroup(page);
1219 mem_cgroup_lru_del_before_commit_swapcache(page);
1220 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED);
1221 mem_cgroup_lru_add_after_commit_swapcache(page);
1222 /*
 1223	 * Now the swap entry is in memory. This means the page may be
 1224	 * counted both as mem and as swap, i.e. double-counted.
 1225	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
 1226	 * under lock_page(). But in do_swap_page() (memory.c), reuse_swap_page()
 1227	 * may call delete_from_swap_cache() before we reach here.
1228 */
1229 if (do_swap_account && PageSwapCache(page)) {
1230 swp_entry_t ent = {.val = page_private(page)};
1231 struct mem_cgroup *memcg;
1232 memcg = swap_cgroup_record(ent, NULL);
1233 if (memcg) {
1234 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1235 mem_cgroup_put(memcg);
1236 }
1237
1238 }
1239 /* add this page(page_cgroup) to the LRU we want. */
1240
629} 1241}
630 1242
1243void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1244{
1245 if (mem_cgroup_disabled())
1246 return;
1247 if (!mem)
1248 return;
1249 res_counter_uncharge(&mem->res, PAGE_SIZE);
1250 if (do_swap_account)
1251 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1252 css_put(&mem->css);
1253}
1254
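The three hooks above form a reserve/commit/cancel protocol for swap-in. The canonical caller is the do_swap_page() hunk further down in this commit; the condensed sketch below (not part of the patch; the function name and the simplified control flow are illustrative, and the fragment assumes the usual mm/memcontrol headers) only shows the required ordering:

	static int swapin_charge_sketch(struct mm_struct *mm, struct page *page,
					pte_t *page_table, pte_t orig_pte)
	{
		struct mem_cgroup *ptr = NULL;

		/* 1. reserve the charge while the page is locked */
		if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
			return -ENOMEM;

		if (!pte_same(*page_table, orig_pte)) {
			/* 2a. lost the race: drop the reservation */
			mem_cgroup_cancel_charge_swapin(ptr);
			return 0;
		}

		/* ... set_pte_at(), page_add_anon_rmap() ... */

		/* 2b. commit only after the rmap is established */
		mem_cgroup_commit_charge_swapin(page, ptr);
		return 0;
	}
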
1255
631/* 1256/*
632 * uncharge if !page_mapped(page) 1257 * uncharge if !page_mapped(page)
633 */ 1258 */
634static void 1259static struct mem_cgroup *
635__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) 1260__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
636{ 1261{
637 struct page_cgroup *pc; 1262 struct page_cgroup *pc;
638 struct mem_cgroup *mem; 1263 struct mem_cgroup *mem = NULL;
639 struct mem_cgroup_per_zone *mz; 1264 struct mem_cgroup_per_zone *mz;
640 unsigned long flags;
641 1265
642 if (mem_cgroup_subsys.disabled) 1266 if (mem_cgroup_disabled())
643 return; 1267 return NULL;
1268
1269 if (PageSwapCache(page))
1270 return NULL;
644 1271
645 /* 1272 /*
646 * Check if our page_cgroup is valid 1273 * Check if our page_cgroup is valid
647 */ 1274 */
648 pc = lookup_page_cgroup(page); 1275 pc = lookup_page_cgroup(page);
649 if (unlikely(!pc || !PageCgroupUsed(pc))) 1276 if (unlikely(!pc || !PageCgroupUsed(pc)))
650 return; 1277 return NULL;
651 1278
652 lock_page_cgroup(pc); 1279 lock_page_cgroup(pc);
653 if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) 1280
654 || !PageCgroupUsed(pc)) { 1281 mem = pc->mem_cgroup;
655 /* This happens at race in zap_pte_range() and do_swap_page()*/ 1282
656 unlock_page_cgroup(pc); 1283 if (!PageCgroupUsed(pc))
657 return; 1284 goto unlock_out;
1285
1286 switch (ctype) {
1287 case MEM_CGROUP_CHARGE_TYPE_MAPPED:
1288 if (page_mapped(page))
1289 goto unlock_out;
1290 break;
1291 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
1292 if (!PageAnon(page)) { /* Shared memory */
1293 if (page->mapping && !page_is_file_cache(page))
1294 goto unlock_out;
1295 } else if (page_mapped(page)) /* Anon */
1296 goto unlock_out;
1297 break;
1298 default:
1299 break;
658 } 1300 }
1301
1302 res_counter_uncharge(&mem->res, PAGE_SIZE);
1303 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1304 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1305
1306 mem_cgroup_charge_statistics(mem, pc, false);
659 ClearPageCgroupUsed(pc); 1307 ClearPageCgroupUsed(pc);
660 mem = pc->mem_cgroup; 1308 /*
1309 * pc->mem_cgroup is not cleared here. It will be accessed when it's
1310 * freed from LRU. This is safe because uncharged page is expected not
1311 * to be reused (freed soon). Exception is SwapCache, it's handled by
1312 * special functions.
1313 */
661 1314
662 mz = page_cgroup_zoneinfo(pc); 1315 mz = page_cgroup_zoneinfo(pc);
663 spin_lock_irqsave(&mz->lru_lock, flags);
664 __mem_cgroup_remove_list(mz, pc);
665 spin_unlock_irqrestore(&mz->lru_lock, flags);
666 unlock_page_cgroup(pc); 1316 unlock_page_cgroup(pc);
667 1317
668 res_counter_uncharge(&mem->res, PAGE_SIZE); 1318 /* at swapout, this memcg will be accessed to record to swap */
669 css_put(&mem->css); 1319 if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
1320 css_put(&mem->css);
670 1321
671 return; 1322 return mem;
1323
1324unlock_out:
1325 unlock_page_cgroup(pc);
1326 return NULL;
672} 1327}
673 1328
674void mem_cgroup_uncharge_page(struct page *page) 1329void mem_cgroup_uncharge_page(struct page *page)
@@ -689,16 +1344,55 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
689} 1344}
690 1345
691/* 1346/*
692 * Before starting migration, account against new page. 1347 * called from __delete_from_swap_cache(); drops the "page" account.
 1348 * memcg information is recorded in the swap_cgroup of "ent".
1349 */
1350void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1351{
1352 struct mem_cgroup *memcg;
1353
1354 memcg = __mem_cgroup_uncharge_common(page,
1355 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1356 /* record memcg information */
1357 if (do_swap_account && memcg) {
1358 swap_cgroup_record(ent, memcg);
1359 mem_cgroup_get(memcg);
1360 }
1361 if (memcg)
1362 css_put(&memcg->css);
1363}
1364
1365#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
1366/*
 1367 * called from swap_entry_free(); removes the record in swap_cgroup and
 1368 * uncharges the "memsw" account.
693 */ 1369 */
694int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) 1370void mem_cgroup_uncharge_swap(swp_entry_t ent)
1371{
1372 struct mem_cgroup *memcg;
1373
1374 if (!do_swap_account)
1375 return;
1376
1377 memcg = swap_cgroup_record(ent, NULL);
1378 if (memcg) {
1379 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1380 mem_cgroup_put(memcg);
1381 }
1382}
1383#endif
1384
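The two functions above are the two halves of the swap-out handoff: the page charge is dropped when the page goes to swap, and the memsw charge is only finally released when the swap entry itself is freed. A minimal sketch of the pairing (not part of the patch; the real callers are __delete_from_swap_cache() and swap_entry_free(), as the comments note, and the wrapper names below are invented for illustration):

	/* swap-out side: drop the page charge, park the memcg in swap_cgroup */
	static void swapout_handoff_sketch(struct page *page, swp_entry_t ent)
	{
		mem_cgroup_uncharge_swapcache(page, ent);
	}

	/* swap-entry-free side: release the parked memsw charge */
	static void swap_free_handoff_sketch(swp_entry_t ent)
	{
		mem_cgroup_uncharge_swap(ent);
	}
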
1385/*
 1386 * Before starting migration, charge PAGE_SIZE to the mem_cgroup that the old
1387 * page belongs to.
1388 */
1389int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
695{ 1390{
696 struct page_cgroup *pc; 1391 struct page_cgroup *pc;
697 struct mem_cgroup *mem = NULL; 1392 struct mem_cgroup *mem = NULL;
698 enum charge_type ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
699 int ret = 0; 1393 int ret = 0;
700 1394
701 if (mem_cgroup_subsys.disabled) 1395 if (mem_cgroup_disabled())
702 return 0; 1396 return 0;
703 1397
704 pc = lookup_page_cgroup(page); 1398 pc = lookup_page_cgroup(page);
@@ -706,41 +1400,67 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage)
706 if (PageCgroupUsed(pc)) { 1400 if (PageCgroupUsed(pc)) {
707 mem = pc->mem_cgroup; 1401 mem = pc->mem_cgroup;
708 css_get(&mem->css); 1402 css_get(&mem->css);
709 if (PageCgroupCache(pc)) {
710 if (page_is_file_cache(page))
711 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
712 else
713 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
714 }
715 } 1403 }
716 unlock_page_cgroup(pc); 1404 unlock_page_cgroup(pc);
1405
717 if (mem) { 1406 if (mem) {
718 ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, 1407 ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
719 ctype, mem);
720 css_put(&mem->css); 1408 css_put(&mem->css);
721 } 1409 }
1410 *ptr = mem;
722 return ret; 1411 return ret;
723} 1412}
724 1413
725/* remove redundant charge if migration failed*/ 1414/* remove redundant charge if migration failed*/
726void mem_cgroup_end_migration(struct page *newpage) 1415void mem_cgroup_end_migration(struct mem_cgroup *mem,
1416 struct page *oldpage, struct page *newpage)
727{ 1417{
1418 struct page *target, *unused;
1419 struct page_cgroup *pc;
1420 enum charge_type ctype;
1421
1422 if (!mem)
1423 return;
1424
1425 /* at migration success, oldpage->mapping is NULL. */
1426 if (oldpage->mapping) {
1427 target = oldpage;
1428 unused = NULL;
1429 } else {
1430 target = newpage;
1431 unused = oldpage;
1432 }
1433
1434 if (PageAnon(target))
1435 ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
1436 else if (page_is_file_cache(target))
1437 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
1438 else
1439 ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
1440
1441 /* unused page is not on radix-tree now. */
1442 if (unused)
1443 __mem_cgroup_uncharge_common(unused, ctype);
1444
1445 pc = lookup_page_cgroup(target);
728 /* 1446 /*
729 * At success, page->mapping is not NULL. 1447 * __mem_cgroup_commit_charge() checks the PCG_USED bit of page_cgroup.
730 * special rollback care is necessary when 1448 * So, double-counting is effectively avoided.
731 * 1. at migration failure. (newpage->mapping is cleared in this case)
732 * 2. the newpage was moved but not remapped again because the task
733 * exits and the newpage is obsolete. In this case, the new page
734 * may be a swapcache. So, we just call mem_cgroup_uncharge_page()
735 * always for avoiding mess. The page_cgroup will be removed if
736 * unnecessary. File cache pages is still on radix-tree. Don't
737 * care it.
738 */ 1449 */
739 if (!newpage->mapping) 1450 __mem_cgroup_commit_charge(mem, pc, ctype);
740 __mem_cgroup_uncharge_common(newpage, 1451
741 MEM_CGROUP_CHARGE_TYPE_FORCE); 1452 /*
742 else if (PageAnon(newpage)) 1453 * Both oldpage and newpage are still under lock_page(),
743 mem_cgroup_uncharge_page(newpage); 1454 * so we don't have to worry about races in the radix-tree.
 1455 * But we do have to check whether this page is still mapped.
 1456 *
 1457 * There is a case for !page_mapped(): at the start of
 1458 * migration oldpage was mapped, but by now it may be zapped.
 1459 * Still, we know the *target* page is not freed/reused under us;
1460 * mem_cgroup_uncharge_page() does all necessary checks.
1461 */
1462 if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
1463 mem_cgroup_uncharge_page(target);
744} 1464}
745 1465
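mem_cgroup_prepare_migration() and mem_cgroup_end_migration() now bracket a migration as a charge/commit pair on the memcg side. The mm/migrate.c changes that call them are part of this commit but outside this hunk; a skeleton of the expected pairing, using an invented function name and assuming only the signatures shown above:

	static int migrate_charge_sketch(struct page *oldpage, struct page *newpage)
	{
		struct mem_cgroup *mem = NULL;
		int rc;

		/* charge PAGE_SIZE against the old page's memcg up front */
		rc = mem_cgroup_prepare_migration(oldpage, &mem);
		if (rc)
			return rc;

		/* ... copy contents, remap, or fail and keep oldpage ... */

		/* commit to whichever page survived, uncharge the other */
		mem_cgroup_end_migration(mem, oldpage, newpage);
		return 0;
	}
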
746/* 1466/*
@@ -748,29 +1468,26 @@ void mem_cgroup_end_migration(struct page *newpage)
748 * This is typically used for page reclaiming for shmem for reducing side 1468 * This is typically used for page reclaiming for shmem for reducing side
749 * effect of page allocation from shmem, which is used by some mem_cgroup. 1469 * effect of page allocation from shmem, which is used by some mem_cgroup.
750 */ 1470 */
751int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask) 1471int mem_cgroup_shrink_usage(struct page *page,
1472 struct mm_struct *mm,
1473 gfp_t gfp_mask)
752{ 1474{
753 struct mem_cgroup *mem; 1475 struct mem_cgroup *mem = NULL;
754 int progress = 0; 1476 int progress = 0;
755 int retry = MEM_CGROUP_RECLAIM_RETRIES; 1477 int retry = MEM_CGROUP_RECLAIM_RETRIES;
756 1478
757 if (mem_cgroup_subsys.disabled) 1479 if (mem_cgroup_disabled())
758 return 0; 1480 return 0;
759 if (!mm) 1481 if (page)
1482 mem = try_get_mem_cgroup_from_swapcache(page);
1483 if (!mem && mm)
1484 mem = try_get_mem_cgroup_from_mm(mm);
1485 if (unlikely(!mem))
760 return 0; 1486 return 0;
761 1487
762 rcu_read_lock();
763 mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
764 if (unlikely(!mem)) {
765 rcu_read_unlock();
766 return 0;
767 }
768 css_get(&mem->css);
769 rcu_read_unlock();
770
771 do { 1488 do {
772 progress = try_to_free_mem_cgroup_pages(mem, gfp_mask); 1489 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true);
773 progress += res_counter_check_under_limit(&mem->res); 1490 progress += mem_cgroup_check_under_limit(mem);
774 } while (!progress && --retry); 1491 } while (!progress && --retry);
775 1492
776 css_put(&mem->css); 1493 css_put(&mem->css);
@@ -779,116 +1496,295 @@ int mem_cgroup_shrink_usage(struct mm_struct *mm, gfp_t gfp_mask)
779 return 0; 1496 return 0;
780} 1497}
781 1498
782int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) 1499static DEFINE_MUTEX(set_limit_mutex);
1500
1501static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1502 unsigned long long val)
783{ 1503{
784 1504
785 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1505 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
786 int progress; 1506 int progress;
1507 u64 memswlimit;
787 int ret = 0; 1508 int ret = 0;
788 1509
789 while (res_counter_set_limit(&memcg->res, val)) { 1510 while (retry_count) {
790 if (signal_pending(current)) { 1511 if (signal_pending(current)) {
791 ret = -EINTR; 1512 ret = -EINTR;
792 break; 1513 break;
793 } 1514 }
794 if (!retry_count) { 1515 /*
795 ret = -EBUSY; 1516 * Rather than hide this in some helper, do it open-coded
 1517 * so that what really happens stays visible.
 1518 * We have to guarantee mem->res.limit <= mem->memsw.limit.
1519 */
1520 mutex_lock(&set_limit_mutex);
1521 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1522 if (memswlimit < val) {
1523 ret = -EINVAL;
1524 mutex_unlock(&set_limit_mutex);
796 break; 1525 break;
797 } 1526 }
798 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL); 1527 ret = res_counter_set_limit(&memcg->res, val);
799 if (!progress) 1528 mutex_unlock(&set_limit_mutex);
800 retry_count--; 1529
1530 if (!ret)
1531 break;
1532
1533 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1534 false);
1535 if (!progress) retry_count--;
801 } 1536 }
1537
802 return ret; 1538 return ret;
803} 1539}
804 1540
1541int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1542 unsigned long long val)
1543{
1544 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1545 u64 memlimit, oldusage, curusage;
1546 int ret;
1547
1548 if (!do_swap_account)
1549 return -EINVAL;
1550
1551 while (retry_count) {
1552 if (signal_pending(current)) {
1553 ret = -EINTR;
1554 break;
1555 }
1556 /*
 1557 * Rather than hide this in some helper, do it open-coded
 1558 * so that what really happens stays visible.
 1559 * We have to guarantee mem->res.limit <= mem->memsw.limit.
1560 */
1561 mutex_lock(&set_limit_mutex);
1562 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1563 if (memlimit > val) {
1564 ret = -EINVAL;
1565 mutex_unlock(&set_limit_mutex);
1566 break;
1567 }
1568 ret = res_counter_set_limit(&memcg->memsw, val);
1569 mutex_unlock(&set_limit_mutex);
1570
1571 if (!ret)
1572 break;
1573
1574 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1575 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1576 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1577 if (curusage >= oldusage)
1578 retry_count--;
1579 }
1580 return ret;
1581}
805 1582
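Taken together, the two setters above keep mem->res.limit <= mem->memsw.limit at all times. For example (figures illustrative only): with memory.memsw.limit_in_bytes at 512M, writing 1G to memory.limit_in_bytes fails with -EINVAL until the memsw limit is raised first, and the memsw limit can never be pushed below the plain memory limit.
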
806/* 1583/*
807 * This routine traverse page_cgroup in given list and drop them all. 1584 * This routine traverse page_cgroup in given list and drop them all.
808 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 1585 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
809 */ 1586 */
810#define FORCE_UNCHARGE_BATCH (128) 1587static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
811static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, 1588 int node, int zid, enum lru_list lru)
812 struct mem_cgroup_per_zone *mz,
813 enum lru_list lru)
814{ 1589{
815 struct page_cgroup *pc; 1590 struct zone *zone;
816 struct page *page; 1591 struct mem_cgroup_per_zone *mz;
817 int count = FORCE_UNCHARGE_BATCH; 1592 struct page_cgroup *pc, *busy;
818 unsigned long flags; 1593 unsigned long flags, loop;
819 struct list_head *list; 1594 struct list_head *list;
1595 int ret = 0;
820 1596
1597 zone = &NODE_DATA(node)->node_zones[zid];
1598 mz = mem_cgroup_zoneinfo(mem, node, zid);
821 list = &mz->lists[lru]; 1599 list = &mz->lists[lru];
822 1600
823 spin_lock_irqsave(&mz->lru_lock, flags); 1601 loop = MEM_CGROUP_ZSTAT(mz, lru);
824 while (!list_empty(list)) { 1602 /* give some margin against EBUSY etc...*/
825 pc = list_entry(list->prev, struct page_cgroup, lru); 1603 loop += 256;
826 page = pc->page; 1604 busy = NULL;
827 if (!PageCgroupUsed(pc)) 1605 while (loop--) {
828 break; 1606 ret = 0;
829 get_page(page); 1607 spin_lock_irqsave(&zone->lru_lock, flags);
830 spin_unlock_irqrestore(&mz->lru_lock, flags); 1608 if (list_empty(list)) {
831 /* 1609 spin_unlock_irqrestore(&zone->lru_lock, flags);
832 * Check if this page is on LRU. !LRU page can be found
833 * if it's under page migration.
834 */
835 if (PageLRU(page)) {
836 __mem_cgroup_uncharge_common(page,
837 MEM_CGROUP_CHARGE_TYPE_FORCE);
838 put_page(page);
839 if (--count <= 0) {
840 count = FORCE_UNCHARGE_BATCH;
841 cond_resched();
842 }
843 } else {
844 spin_lock_irqsave(&mz->lru_lock, flags);
845 break; 1610 break;
846 } 1611 }
847 spin_lock_irqsave(&mz->lru_lock, flags); 1612 pc = list_entry(list->prev, struct page_cgroup, lru);
1613 if (busy == pc) {
1614 list_move(&pc->lru, list);
 1615			busy = NULL;
1616 spin_unlock_irqrestore(&zone->lru_lock, flags);
1617 continue;
1618 }
1619 spin_unlock_irqrestore(&zone->lru_lock, flags);
1620
1621 ret = mem_cgroup_move_parent(pc, mem, GFP_KERNEL);
1622 if (ret == -ENOMEM)
1623 break;
1624
1625 if (ret == -EBUSY || ret == -EINVAL) {
1626 /* found lock contention or "pc" is obsolete. */
1627 busy = pc;
1628 cond_resched();
1629 } else
1630 busy = NULL;
848 } 1631 }
849 spin_unlock_irqrestore(&mz->lru_lock, flags); 1632
1633 if (!ret && !list_empty(list))
1634 return -EBUSY;
1635 return ret;
850} 1636}
851 1637
852/* 1638/*
853 * make mem_cgroup's charge to be 0 if there is no task. 1639 * make mem_cgroup's charge to be 0 if there is no task.
854 * This enables deleting this mem_cgroup. 1640 * This enables deleting this mem_cgroup.
855 */ 1641 */
856static int mem_cgroup_force_empty(struct mem_cgroup *mem) 1642static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
857{ 1643{
858 int ret = -EBUSY; 1644 int ret;
859 int node, zid; 1645 int node, zid, shrink;
1646 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1647 struct cgroup *cgrp = mem->css.cgroup;
860 1648
861 css_get(&mem->css); 1649 css_get(&mem->css);
862 /* 1650
863 * page reclaim code (kswapd etc..) will move pages between 1651 shrink = 0;
864 * active_list <-> inactive_list while we don't take a lock. 1652 /* should free all ? */
865 * So, we have to do loop here until all lists are empty. 1653 if (free_all)
866 */ 1654 goto try_to_free;
1655move_account:
867 while (mem->res.usage > 0) { 1656 while (mem->res.usage > 0) {
868 if (atomic_read(&mem->css.cgroup->count) > 0) 1657 ret = -EBUSY;
1658 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
1659 goto out;
1660 ret = -EINTR;
1661 if (signal_pending(current))
869 goto out; 1662 goto out;
870 /* This is for making all *used* pages to be on LRU. */ 1663 /* This is for making all *used* pages to be on LRU. */
871 lru_add_drain_all(); 1664 lru_add_drain_all();
872 for_each_node_state(node, N_POSSIBLE) 1665 ret = 0;
873 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 1666 for_each_node_state(node, N_POSSIBLE) {
874 struct mem_cgroup_per_zone *mz; 1667 for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
875 enum lru_list l; 1668 enum lru_list l;
876 mz = mem_cgroup_zoneinfo(mem, node, zid); 1669 for_each_lru(l) {
877 for_each_lru(l) 1670 ret = mem_cgroup_force_empty_list(mem,
878 mem_cgroup_force_empty_list(mem, mz, l); 1671 node, zid, l);
1672 if (ret)
1673 break;
1674 }
879 } 1675 }
1676 if (ret)
1677 break;
1678 }
1679 /* it seems parent cgroup doesn't have enough mem */
1680 if (ret == -ENOMEM)
1681 goto try_to_free;
880 cond_resched(); 1682 cond_resched();
881 } 1683 }
882 ret = 0; 1684 ret = 0;
883out: 1685out:
884 css_put(&mem->css); 1686 css_put(&mem->css);
885 return ret; 1687 return ret;
1688
1689try_to_free:
1690 /* returns EBUSY if there is a task or if we come here twice. */
1691 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
1692 ret = -EBUSY;
1693 goto out;
1694 }
 1695	/* we call try-to-free pages to make this cgroup empty */
1696 lru_add_drain_all();
1697 /* try to free all pages in this cgroup */
1698 shrink = 1;
1699 while (nr_retries && mem->res.usage > 0) {
1700 int progress;
1701
1702 if (signal_pending(current)) {
1703 ret = -EINTR;
1704 goto out;
1705 }
1706 progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL,
1707 false, get_swappiness(mem));
1708 if (!progress) {
1709 nr_retries--;
1710 /* maybe some writeback is necessary */
1711 congestion_wait(WRITE, HZ/10);
1712 }
1713
1714 }
1715 lru_add_drain();
1716 /* try move_account...there may be some *locked* pages. */
1717 if (mem->res.usage)
1718 goto move_account;
1719 ret = 0;
1720 goto out;
1721}
1722
1723int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
1724{
1725 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
1726}
1727
1728
1729static u64 mem_cgroup_hierarchy_read(struct cgroup *cont, struct cftype *cft)
1730{
1731 return mem_cgroup_from_cont(cont)->use_hierarchy;
1732}
1733
1734static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
1735 u64 val)
1736{
1737 int retval = 0;
1738 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1739 struct cgroup *parent = cont->parent;
1740 struct mem_cgroup *parent_mem = NULL;
1741
1742 if (parent)
1743 parent_mem = mem_cgroup_from_cont(parent);
1744
1745 cgroup_lock();
1746 /*
 1747	 * If parent's use_hierarchy is set, we can't make any modifications
1748 * in the child subtrees. If it is unset, then the change can
1749 * occur, provided the current cgroup has no children.
1750 *
1751 * For the root cgroup, parent_mem is NULL, we allow value to be
1752 * set if there are no children.
1753 */
1754 if ((!parent_mem || !parent_mem->use_hierarchy) &&
1755 (val == 1 || val == 0)) {
1756 if (list_empty(&cont->children))
1757 mem->use_hierarchy = val;
1758 else
1759 retval = -EBUSY;
1760 } else
1761 retval = -EINVAL;
1762 cgroup_unlock();
1763
1764 return retval;
886} 1765}
887 1766
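In cgroupfs terms, this handler backs the memory.use_hierarchy file added below: writing 1 to it on a cgroup with no children enables hierarchical accounting, writing while children already exist returns -EBUSY, and writing when the parent itself has use_hierarchy set (or writing anything other than 0 or 1) returns -EINVAL.
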
888static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) 1767static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
889{ 1768{
890 return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res, 1769 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
891 cft->private); 1770 u64 val = 0;
1771 int type, name;
1772
1773 type = MEMFILE_TYPE(cft->private);
1774 name = MEMFILE_ATTR(cft->private);
1775 switch (type) {
1776 case _MEM:
1777 val = res_counter_read_u64(&mem->res, name);
1778 break;
1779 case _MEMSWAP:
1780 if (do_swap_account)
1781 val = res_counter_read_u64(&mem->memsw, name);
1782 break;
1783 default:
1784 BUG();
1785 break;
1786 }
1787 return val;
892} 1788}
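mem_cgroup_read() (and the write/reset handlers below) now decode cftype->private with MEMFILE_TYPE()/MEMFILE_ATTR(), whose definitions sit earlier in this patch, outside this hunk. The shape assumed by the code here is a simple 16/16-bit split, roughly (reproduced as a reader's aid, not verbatim from this hunk):

	#define MEMFILE_PRIVATE(x, val)	(((x) << 16) | (val))
	#define MEMFILE_TYPE(val)	(((val) >> 16) & 0xffff)
	#define MEMFILE_ATTR(val)	((val) & 0xffff)

so one int carries both the counter (_MEM or _MEMSWAP) and the attribute (RES_USAGE, RES_LIMIT, ...).
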
893/* 1789/*
894 * The user of this function is... 1790 * The user of this function is...
@@ -898,15 +1794,22 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
898 const char *buffer) 1794 const char *buffer)
899{ 1795{
900 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); 1796 struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
1797 int type, name;
901 unsigned long long val; 1798 unsigned long long val;
902 int ret; 1799 int ret;
903 1800
904 switch (cft->private) { 1801 type = MEMFILE_TYPE(cft->private);
1802 name = MEMFILE_ATTR(cft->private);
1803 switch (name) {
905 case RES_LIMIT: 1804 case RES_LIMIT:
906 /* This function does all necessary parse...reuse it */ 1805 /* This function does all necessary parse...reuse it */
907 ret = res_counter_memparse_write_strategy(buffer, &val); 1806 ret = res_counter_memparse_write_strategy(buffer, &val);
908 if (!ret) 1807 if (ret)
1808 break;
1809 if (type == _MEM)
909 ret = mem_cgroup_resize_limit(memcg, val); 1810 ret = mem_cgroup_resize_limit(memcg, val);
1811 else
1812 ret = mem_cgroup_resize_memsw_limit(memcg, val);
910 break; 1813 break;
911 default: 1814 default:
912 ret = -EINVAL; /* should be BUG() ? */ 1815 ret = -EINVAL; /* should be BUG() ? */
@@ -915,27 +1818,59 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
915 return ret; 1818 return ret;
916} 1819}
917 1820
1821static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg,
1822 unsigned long long *mem_limit, unsigned long long *memsw_limit)
1823{
1824 struct cgroup *cgroup;
1825 unsigned long long min_limit, min_memsw_limit, tmp;
1826
1827 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT);
1828 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1829 cgroup = memcg->css.cgroup;
1830 if (!memcg->use_hierarchy)
1831 goto out;
1832
1833 while (cgroup->parent) {
1834 cgroup = cgroup->parent;
1835 memcg = mem_cgroup_from_cont(cgroup);
1836 if (!memcg->use_hierarchy)
1837 break;
1838 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT);
1839 min_limit = min(min_limit, tmp);
1840 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
1841 min_memsw_limit = min(min_memsw_limit, tmp);
1842 }
1843out:
1844 *mem_limit = min_limit;
1845 *memsw_limit = min_memsw_limit;
1846 return;
1847}
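A worked example of the walk above (figures illustrative only): with use_hierarchy enabled, a parent limited to 512M and a child left at the default (unlimited), the child's memory.stat reports hierarchical_memory_limit as 512M, because the loop takes the minimum limit along the ancestor chain and stops at the first ancestor that has use_hierarchy disabled.
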
1848
918static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) 1849static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
919{ 1850{
920 struct mem_cgroup *mem; 1851 struct mem_cgroup *mem;
1852 int type, name;
921 1853
922 mem = mem_cgroup_from_cont(cont); 1854 mem = mem_cgroup_from_cont(cont);
923 switch (event) { 1855 type = MEMFILE_TYPE(event);
1856 name = MEMFILE_ATTR(event);
1857 switch (name) {
924 case RES_MAX_USAGE: 1858 case RES_MAX_USAGE:
925 res_counter_reset_max(&mem->res); 1859 if (type == _MEM)
1860 res_counter_reset_max(&mem->res);
1861 else
1862 res_counter_reset_max(&mem->memsw);
926 break; 1863 break;
927 case RES_FAILCNT: 1864 case RES_FAILCNT:
928 res_counter_reset_failcnt(&mem->res); 1865 if (type == _MEM)
1866 res_counter_reset_failcnt(&mem->res);
1867 else
1868 res_counter_reset_failcnt(&mem->memsw);
929 break; 1869 break;
930 } 1870 }
931 return 0; 1871 return 0;
932} 1872}
933 1873
934static int mem_force_empty_write(struct cgroup *cont, unsigned int event)
935{
936 return mem_cgroup_force_empty(mem_cgroup_from_cont(cont));
937}
938
939static const struct mem_cgroup_stat_desc { 1874static const struct mem_cgroup_stat_desc {
940 const char *msg; 1875 const char *msg;
941 u64 unit; 1876 u64 unit;
@@ -984,43 +1919,163 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
984 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); 1919 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
985 1920
986 } 1921 }
1922 {
1923 unsigned long long limit, memsw_limit;
1924 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
1925 cb->fill(cb, "hierarchical_memory_limit", limit);
1926 if (do_swap_account)
1927 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1928 }
1929
1930#ifdef CONFIG_DEBUG_VM
1931 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1932
1933 {
1934 int nid, zid;
1935 struct mem_cgroup_per_zone *mz;
1936 unsigned long recent_rotated[2] = {0, 0};
1937 unsigned long recent_scanned[2] = {0, 0};
1938
1939 for_each_online_node(nid)
1940 for (zid = 0; zid < MAX_NR_ZONES; zid++) {
1941 mz = mem_cgroup_zoneinfo(mem_cont, nid, zid);
1942
1943 recent_rotated[0] +=
1944 mz->reclaim_stat.recent_rotated[0];
1945 recent_rotated[1] +=
1946 mz->reclaim_stat.recent_rotated[1];
1947 recent_scanned[0] +=
1948 mz->reclaim_stat.recent_scanned[0];
1949 recent_scanned[1] +=
1950 mz->reclaim_stat.recent_scanned[1];
1951 }
1952 cb->fill(cb, "recent_rotated_anon", recent_rotated[0]);
1953 cb->fill(cb, "recent_rotated_file", recent_rotated[1]);
1954 cb->fill(cb, "recent_scanned_anon", recent_scanned[0]);
1955 cb->fill(cb, "recent_scanned_file", recent_scanned[1]);
1956 }
1957#endif
1958
1959 return 0;
1960}
1961
1962static u64 mem_cgroup_swappiness_read(struct cgroup *cgrp, struct cftype *cft)
1963{
1964 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1965
1966 return get_swappiness(memcg);
1967}
1968
1969static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
1970 u64 val)
1971{
1972 struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
1973 struct mem_cgroup *parent;
1974 if (val > 100)
1975 return -EINVAL;
1976
1977 if (cgrp->parent == NULL)
1978 return -EINVAL;
1979
1980 parent = mem_cgroup_from_cont(cgrp->parent);
1981 /* If under hierarchy, only empty-root can set this value */
1982 if ((parent->use_hierarchy) ||
1983 (memcg->use_hierarchy && !list_empty(&cgrp->children)))
1984 return -EINVAL;
1985
1986 spin_lock(&memcg->reclaim_param_lock);
1987 memcg->swappiness = val;
1988 spin_unlock(&memcg->reclaim_param_lock);
1989
987 return 0; 1990 return 0;
988} 1991}
989 1992
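The net effect of the checks above: memory.swappiness accepts 0-100, cannot be set on the root cgroup (cgrp->parent == NULL), and is rejected with -EINVAL once the cgroup sits under a hierarchical parent or itself uses hierarchy and already has children, so per-memcg swappiness is only tunable at the top of an otherwise empty subtree.
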
1993
990static struct cftype mem_cgroup_files[] = { 1994static struct cftype mem_cgroup_files[] = {
991 { 1995 {
992 .name = "usage_in_bytes", 1996 .name = "usage_in_bytes",
993 .private = RES_USAGE, 1997 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
994 .read_u64 = mem_cgroup_read, 1998 .read_u64 = mem_cgroup_read,
995 }, 1999 },
996 { 2000 {
997 .name = "max_usage_in_bytes", 2001 .name = "max_usage_in_bytes",
998 .private = RES_MAX_USAGE, 2002 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
999 .trigger = mem_cgroup_reset, 2003 .trigger = mem_cgroup_reset,
1000 .read_u64 = mem_cgroup_read, 2004 .read_u64 = mem_cgroup_read,
1001 }, 2005 },
1002 { 2006 {
1003 .name = "limit_in_bytes", 2007 .name = "limit_in_bytes",
1004 .private = RES_LIMIT, 2008 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
1005 .write_string = mem_cgroup_write, 2009 .write_string = mem_cgroup_write,
1006 .read_u64 = mem_cgroup_read, 2010 .read_u64 = mem_cgroup_read,
1007 }, 2011 },
1008 { 2012 {
1009 .name = "failcnt", 2013 .name = "failcnt",
1010 .private = RES_FAILCNT, 2014 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
1011 .trigger = mem_cgroup_reset, 2015 .trigger = mem_cgroup_reset,
1012 .read_u64 = mem_cgroup_read, 2016 .read_u64 = mem_cgroup_read,
1013 }, 2017 },
1014 { 2018 {
2019 .name = "stat",
2020 .read_map = mem_control_stat_show,
2021 },
2022 {
1015 .name = "force_empty", 2023 .name = "force_empty",
1016 .trigger = mem_force_empty_write, 2024 .trigger = mem_cgroup_force_empty_write,
1017 }, 2025 },
1018 { 2026 {
1019 .name = "stat", 2027 .name = "use_hierarchy",
1020 .read_map = mem_control_stat_show, 2028 .write_u64 = mem_cgroup_hierarchy_write,
2029 .read_u64 = mem_cgroup_hierarchy_read,
2030 },
2031 {
2032 .name = "swappiness",
2033 .read_u64 = mem_cgroup_swappiness_read,
2034 .write_u64 = mem_cgroup_swappiness_write,
1021 }, 2035 },
1022}; 2036};
1023 2037
2038#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2039static struct cftype memsw_cgroup_files[] = {
2040 {
2041 .name = "memsw.usage_in_bytes",
2042 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
2043 .read_u64 = mem_cgroup_read,
2044 },
2045 {
2046 .name = "memsw.max_usage_in_bytes",
2047 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
2048 .trigger = mem_cgroup_reset,
2049 .read_u64 = mem_cgroup_read,
2050 },
2051 {
2052 .name = "memsw.limit_in_bytes",
2053 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
2054 .write_string = mem_cgroup_write,
2055 .read_u64 = mem_cgroup_read,
2056 },
2057 {
2058 .name = "memsw.failcnt",
2059 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
2060 .trigger = mem_cgroup_reset,
2061 .read_u64 = mem_cgroup_read,
2062 },
2063};
2064
2065static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2066{
2067 if (!do_swap_account)
2068 return 0;
2069 return cgroup_add_files(cont, ss, memsw_cgroup_files,
2070 ARRAY_SIZE(memsw_cgroup_files));
2071};
2072#else
2073static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss)
2074{
2075 return 0;
2076}
2077#endif
2078
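When CONFIG_CGROUP_MEM_RES_CTLR_SWAP is enabled and swap accounting is active, register_memsw_files() adds four more control files — memory.memsw.usage_in_bytes, memory.memsw.max_usage_in_bytes, memory.memsw.limit_in_bytes and memory.memsw.failcnt — mirroring the plain memory files but backed by the memsw res_counter; with swap accounting off, the stub leaves the cgroup directory unchanged.
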
1024static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) 2079static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1025{ 2080{
1026 struct mem_cgroup_per_node *pn; 2081 struct mem_cgroup_per_node *pn;
@@ -1046,7 +2101,6 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1046 2101
1047 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 2102 for (zone = 0; zone < MAX_NR_ZONES; zone++) {
1048 mz = &pn->zoneinfo[zone]; 2103 mz = &pn->zoneinfo[zone];
1049 spin_lock_init(&mz->lru_lock);
1050 for_each_lru(l) 2104 for_each_lru(l)
1051 INIT_LIST_HEAD(&mz->lists[l]); 2105 INIT_LIST_HEAD(&mz->lists[l]);
1052 } 2106 }
@@ -1058,55 +2112,113 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1058 kfree(mem->info.nodeinfo[node]); 2112 kfree(mem->info.nodeinfo[node]);
1059} 2113}
1060 2114
2115static int mem_cgroup_size(void)
2116{
2117 int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
2118 return sizeof(struct mem_cgroup) + cpustat_size;
2119}
2120
1061static struct mem_cgroup *mem_cgroup_alloc(void) 2121static struct mem_cgroup *mem_cgroup_alloc(void)
1062{ 2122{
1063 struct mem_cgroup *mem; 2123 struct mem_cgroup *mem;
2124 int size = mem_cgroup_size();
1064 2125
1065 if (sizeof(*mem) < PAGE_SIZE) 2126 if (size < PAGE_SIZE)
1066 mem = kmalloc(sizeof(*mem), GFP_KERNEL); 2127 mem = kmalloc(size, GFP_KERNEL);
1067 else 2128 else
1068 mem = vmalloc(sizeof(*mem)); 2129 mem = vmalloc(size);
1069 2130
1070 if (mem) 2131 if (mem)
1071 memset(mem, 0, sizeof(*mem)); 2132 memset(mem, 0, size);
1072 return mem; 2133 return mem;
1073} 2134}
1074 2135
1075static void mem_cgroup_free(struct mem_cgroup *mem) 2136/*
 2138 * When destroying a mem_cgroup, references from swap_cgroup can remain.
2138 * (scanning all at force_empty is too costly...)
2139 *
2140 * Instead of clearing all references at force_empty, we remember
 2141 * the number of references from swap_cgroup and free the mem_cgroup when
2142 * it goes down to 0.
2143 *
2144 * Removal of cgroup itself succeeds regardless of refs from swap.
2145 */
2146
2147static void __mem_cgroup_free(struct mem_cgroup *mem)
1076{ 2148{
1077 if (sizeof(*mem) < PAGE_SIZE) 2149 int node;
2150
2151 for_each_node_state(node, N_POSSIBLE)
2152 free_mem_cgroup_per_zone_info(mem, node);
2153
2154 if (mem_cgroup_size() < PAGE_SIZE)
1078 kfree(mem); 2155 kfree(mem);
1079 else 2156 else
1080 vfree(mem); 2157 vfree(mem);
1081} 2158}
1082 2159
2160static void mem_cgroup_get(struct mem_cgroup *mem)
2161{
2162 atomic_inc(&mem->refcnt);
2163}
2164
2165static void mem_cgroup_put(struct mem_cgroup *mem)
2166{
2167 if (atomic_dec_and_test(&mem->refcnt))
2168 __mem_cgroup_free(mem);
2169}
2170
2171
2172#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2173static void __init enable_swap_cgroup(void)
2174{
2175 if (!mem_cgroup_disabled() && really_do_swap_account)
2176 do_swap_account = 1;
2177}
2178#else
2179static void __init enable_swap_cgroup(void)
2180{
2181}
2182#endif
1083 2183
1084static struct cgroup_subsys_state * 2184static struct cgroup_subsys_state *
1085mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2185mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1086{ 2186{
1087 struct mem_cgroup *mem; 2187 struct mem_cgroup *mem, *parent;
1088 int node; 2188 int node;
1089 2189
1090 if (unlikely((cont->parent) == NULL)) { 2190 mem = mem_cgroup_alloc();
1091 mem = &init_mem_cgroup; 2191 if (!mem)
1092 } else { 2192 return ERR_PTR(-ENOMEM);
1093 mem = mem_cgroup_alloc();
1094 if (!mem)
1095 return ERR_PTR(-ENOMEM);
1096 }
1097
1098 res_counter_init(&mem->res);
1099 2193
1100 for_each_node_state(node, N_POSSIBLE) 2194 for_each_node_state(node, N_POSSIBLE)
1101 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2195 if (alloc_mem_cgroup_per_zone_info(mem, node))
1102 goto free_out; 2196 goto free_out;
2197 /* root ? */
2198 if (cont->parent == NULL) {
2199 enable_swap_cgroup();
2200 parent = NULL;
2201 } else {
2202 parent = mem_cgroup_from_cont(cont->parent);
2203 mem->use_hierarchy = parent->use_hierarchy;
2204 }
1103 2205
2206 if (parent && parent->use_hierarchy) {
2207 res_counter_init(&mem->res, &parent->res);
2208 res_counter_init(&mem->memsw, &parent->memsw);
2209 } else {
2210 res_counter_init(&mem->res, NULL);
2211 res_counter_init(&mem->memsw, NULL);
2212 }
2213 mem->last_scanned_child = NULL;
2214 spin_lock_init(&mem->reclaim_param_lock);
2215
2216 if (parent)
2217 mem->swappiness = get_swappiness(parent);
2218 atomic_set(&mem->refcnt, 1);
1104 return &mem->css; 2219 return &mem->css;
1105free_out: 2220free_out:
1106 for_each_node_state(node, N_POSSIBLE) 2221 __mem_cgroup_free(mem);
1107 free_mem_cgroup_per_zone_info(mem, node);
1108 if (cont->parent != NULL)
1109 mem_cgroup_free(mem);
1110 return ERR_PTR(-ENOMEM); 2222 return ERR_PTR(-ENOMEM);
1111} 2223}
1112 2224
@@ -1114,26 +2226,26 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
1114 struct cgroup *cont) 2226 struct cgroup *cont)
1115{ 2227{
1116 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2228 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1117 mem_cgroup_force_empty(mem); 2229 mem_cgroup_force_empty(mem, false);
1118} 2230}
1119 2231
1120static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2232static void mem_cgroup_destroy(struct cgroup_subsys *ss,
1121 struct cgroup *cont) 2233 struct cgroup *cont)
1122{ 2234{
1123 int node; 2235 mem_cgroup_put(mem_cgroup_from_cont(cont));
1124 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
1125
1126 for_each_node_state(node, N_POSSIBLE)
1127 free_mem_cgroup_per_zone_info(mem, node);
1128
1129 mem_cgroup_free(mem_cgroup_from_cont(cont));
1130} 2236}
1131 2237
1132static int mem_cgroup_populate(struct cgroup_subsys *ss, 2238static int mem_cgroup_populate(struct cgroup_subsys *ss,
1133 struct cgroup *cont) 2239 struct cgroup *cont)
1134{ 2240{
1135 return cgroup_add_files(cont, ss, mem_cgroup_files, 2241 int ret;
1136 ARRAY_SIZE(mem_cgroup_files)); 2242
2243 ret = cgroup_add_files(cont, ss, mem_cgroup_files,
2244 ARRAY_SIZE(mem_cgroup_files));
2245
2246 if (!ret)
2247 ret = register_memsw_files(cont, ss);
2248 return ret;
1137} 2249}
1138 2250
1139static void mem_cgroup_move_task(struct cgroup_subsys *ss, 2251static void mem_cgroup_move_task(struct cgroup_subsys *ss,
@@ -1141,25 +2253,12 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1141 struct cgroup *old_cont, 2253 struct cgroup *old_cont,
1142 struct task_struct *p) 2254 struct task_struct *p)
1143{ 2255{
1144 struct mm_struct *mm; 2256 mutex_lock(&memcg_tasklist);
1145 struct mem_cgroup *mem, *old_mem;
1146
1147 mm = get_task_mm(p);
1148 if (mm == NULL)
1149 return;
1150
1151 mem = mem_cgroup_from_cont(cont);
1152 old_mem = mem_cgroup_from_cont(old_cont);
1153
1154 /* 2257 /*
1155 * Only thread group leaders are allowed to migrate, the mm_struct is 2258 * FIXME: It's better to move charges of this process from old
1156 * in effect owned by the leader 2259 * memcg to new memcg. But it's just on the TODO list now.
1157 */ 2260 */
1158 if (!thread_group_leader(p)) 2261 mutex_unlock(&memcg_tasklist);
1159 goto out;
1160
1161out:
1162 mmput(mm);
1163} 2262}
1164 2263
1165struct cgroup_subsys mem_cgroup_subsys = { 2264struct cgroup_subsys mem_cgroup_subsys = {
@@ -1172,3 +2271,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
1172 .attach = mem_cgroup_move_task, 2271 .attach = mem_cgroup_move_task,
1173 .early_init = 0, 2272 .early_init = 0,
1174}; 2273};
2274
2275#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
2276
2277static int __init disable_swap_account(char *s)
2278{
2279 really_do_swap_account = 0;
2280 return 1;
2281}
2282__setup("noswapaccount", disable_swap_account);
2283#endif
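Together with enable_swap_cgroup() above, this means swap accounting can be compiled in yet switched off at boot: passing noswapaccount on the kernel command line clears really_do_swap_account, so do_swap_account stays 0 and all of the memsw accounting paths become no-ops.
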
diff --git a/mm/memory.c b/mm/memory.c
index 7b9db658aca2..e009ce870859 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -52,6 +52,9 @@
52#include <linux/writeback.h> 52#include <linux/writeback.h>
53#include <linux/memcontrol.h> 53#include <linux/memcontrol.h>
54#include <linux/mmu_notifier.h> 54#include <linux/mmu_notifier.h>
55#include <linux/kallsyms.h>
56#include <linux/swapops.h>
57#include <linux/elf.h>
55 58
56#include <asm/pgalloc.h> 59#include <asm/pgalloc.h>
57#include <asm/uaccess.h> 60#include <asm/uaccess.h>
@@ -59,9 +62,6 @@
59#include <asm/tlbflush.h> 62#include <asm/tlbflush.h>
60#include <asm/pgtable.h> 63#include <asm/pgtable.h>
61 64
62#include <linux/swapops.h>
63#include <linux/elf.h>
64
65#include "internal.h" 65#include "internal.h"
66 66
67#ifndef CONFIG_NEED_MULTIPLE_NODES 67#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -375,15 +375,65 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
375 * 375 *
376 * The calling function must still handle the error. 376 * The calling function must still handle the error.
377 */ 377 */
378static void print_bad_pte(struct vm_area_struct *vma, pte_t pte, 378static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
379 unsigned long vaddr) 379 pte_t pte, struct page *page)
380{ 380{
381 printk(KERN_ERR "Bad pte = %08llx, process = %s, " 381 pgd_t *pgd = pgd_offset(vma->vm_mm, addr);
382 "vm_flags = %lx, vaddr = %lx\n", 382 pud_t *pud = pud_offset(pgd, addr);
383 (long long)pte_val(pte), 383 pmd_t *pmd = pmd_offset(pud, addr);
384 (vma->vm_mm == current->mm ? current->comm : "???"), 384 struct address_space *mapping;
385 vma->vm_flags, vaddr); 385 pgoff_t index;
386 static unsigned long resume;
387 static unsigned long nr_shown;
388 static unsigned long nr_unshown;
389
390 /*
391 * Allow a burst of 60 reports, then keep quiet for that minute;
392 * or allow a steady drip of one report per second.
393 */
394 if (nr_shown == 60) {
395 if (time_before(jiffies, resume)) {
396 nr_unshown++;
397 return;
398 }
399 if (nr_unshown) {
400 printk(KERN_ALERT
401 "BUG: Bad page map: %lu messages suppressed\n",
402 nr_unshown);
403 nr_unshown = 0;
404 }
405 nr_shown = 0;
406 }
407 if (nr_shown++ == 0)
408 resume = jiffies + 60 * HZ;
409
410 mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
411 index = linear_page_index(vma, addr);
412
413 printk(KERN_ALERT
414 "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n",
415 current->comm,
416 (long long)pte_val(pte), (long long)pmd_val(*pmd));
417 if (page) {
418 printk(KERN_ALERT
419 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
420 page, (void *)page->flags, page_count(page),
421 page_mapcount(page), page->mapping, page->index);
422 }
423 printk(KERN_ALERT
424 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
425 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
426 /*
427 * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
428 */
429 if (vma->vm_ops)
430 print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
431 (unsigned long)vma->vm_ops->fault);
432 if (vma->vm_file && vma->vm_file->f_op)
433 print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
434 (unsigned long)vma->vm_file->f_op->mmap);
386 dump_stack(); 435 dump_stack();
436 add_taint(TAINT_BAD_PAGE);
387} 437}
388 438
389static inline int is_cow_mapping(unsigned int flags) 439static inline int is_cow_mapping(unsigned int flags)
@@ -441,21 +491,18 @@ static inline int is_cow_mapping(unsigned int flags)
441struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, 491struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
442 pte_t pte) 492 pte_t pte)
443{ 493{
444 unsigned long pfn; 494 unsigned long pfn = pte_pfn(pte);
445 495
446 if (HAVE_PTE_SPECIAL) { 496 if (HAVE_PTE_SPECIAL) {
447 if (likely(!pte_special(pte))) { 497 if (likely(!pte_special(pte)))
448 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 498 goto check_pfn;
449 return pte_page(pte); 499 if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)))
450 } 500 print_bad_pte(vma, addr, pte, NULL);
451 VM_BUG_ON(!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)));
452 return NULL; 501 return NULL;
453 } 502 }
454 503
455 /* !HAVE_PTE_SPECIAL case follows: */ 504 /* !HAVE_PTE_SPECIAL case follows: */
456 505
457 pfn = pte_pfn(pte);
458
459 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { 506 if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
460 if (vma->vm_flags & VM_MIXEDMAP) { 507 if (vma->vm_flags & VM_MIXEDMAP) {
461 if (!pfn_valid(pfn)) 508 if (!pfn_valid(pfn))
@@ -471,11 +518,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
471 } 518 }
472 } 519 }
473 520
474 VM_BUG_ON(!pfn_valid(pfn)); 521check_pfn:
522 if (unlikely(pfn > highest_memmap_pfn)) {
523 print_bad_pte(vma, addr, pte, NULL);
524 return NULL;
525 }
475 526
476 /* 527 /*
477 * NOTE! We still have PageReserved() pages in the page tables. 528 * NOTE! We still have PageReserved() pages in the page tables.
478 *
479 * eg. VDSO mappings can cause them to exist. 529 * eg. VDSO mappings can cause them to exist.
480 */ 530 */
481out: 531out:
@@ -767,11 +817,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
767 else { 817 else {
768 if (pte_dirty(ptent)) 818 if (pte_dirty(ptent))
769 set_page_dirty(page); 819 set_page_dirty(page);
770 if (pte_young(ptent)) 820 if (pte_young(ptent) &&
771 SetPageReferenced(page); 821 likely(!VM_SequentialReadHint(vma)))
822 mark_page_accessed(page);
772 file_rss--; 823 file_rss--;
773 } 824 }
774 page_remove_rmap(page, vma); 825 page_remove_rmap(page);
826 if (unlikely(page_mapcount(page) < 0))
827 print_bad_pte(vma, addr, ptent, page);
775 tlb_remove_page(tlb, page); 828 tlb_remove_page(tlb, page);
776 continue; 829 continue;
777 } 830 }
@@ -781,8 +834,12 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
781 */ 834 */
782 if (unlikely(details)) 835 if (unlikely(details))
783 continue; 836 continue;
784 if (!pte_file(ptent)) 837 if (pte_file(ptent)) {
785 free_swap_and_cache(pte_to_swp_entry(ptent)); 838 if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
839 print_bad_pte(vma, addr, ptent, NULL);
840 } else if
841 (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent))))
842 print_bad_pte(vma, addr, ptent, NULL);
786 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); 843 pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
787 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); 844 } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
788 845
@@ -1153,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1153 int write = !!(flags & GUP_FLAGS_WRITE); 1210 int write = !!(flags & GUP_FLAGS_WRITE);
1154 int force = !!(flags & GUP_FLAGS_FORCE); 1211 int force = !!(flags & GUP_FLAGS_FORCE);
1155 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); 1212 int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
1213 int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
1156 1214
1157 if (len <= 0) 1215 if (len <= 0)
1158 return 0; 1216 return 0;
@@ -1231,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1231 struct page *page; 1289 struct page *page;
1232 1290
1233 /* 1291 /*
1234 * If tsk is ooming, cut off its access to large memory 1292 * If we have a pending SIGKILL, don't keep faulting
1235 * allocations. It has a pending SIGKILL, but it can't 1293 * pages and potentially allocating memory, unless
1236 * be processed until returning to user space. 1294 * current is handling munlock--e.g., on exit. In
1295 * that case, we are not allocating memory. Rather,
1296 * we're only unlocking already resident/mapped pages.
1237 */ 1297 */
1238 if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE))) 1298 if (unlikely(!ignore_sigkill &&
1239 return i ? i : -ENOMEM; 1299 fatal_signal_pending(current)))
1300 return i ? i : -ERESTARTSYS;
1240 1301
1241 if (write) 1302 if (write)
1242 foll_flags |= FOLL_WRITE; 1303 foll_flags |= FOLL_WRITE;
@@ -1263,9 +1324,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
1263 * do_wp_page has broken COW when necessary, 1324 * do_wp_page has broken COW when necessary,
1264 * even if maybe_mkwrite decided not to set 1325 * even if maybe_mkwrite decided not to set
1265 * pte_write. We can thus safely do subsequent 1326 * pte_write. We can thus safely do subsequent
1266 * page lookups as if they were reads. 1327 * page lookups as if they were reads. But only
1328 * do so when looping for pte_write is futile:
1329 * in some cases userspace may also be wanting
1330 * to write to the gotten user page, which a
1331 * read fault here might prevent (a readonly
1332 * page might get reCOWed by userspace write).
1267 */ 1333 */
1268 if (ret & VM_FAULT_WRITE) 1334 if ((ret & VM_FAULT_WRITE) &&
1335 !(vma->vm_flags & VM_WRITE))
1269 foll_flags &= ~FOLL_WRITE; 1336 foll_flags &= ~FOLL_WRITE;
1270 1337
1271 cond_resched(); 1338 cond_resched();
@@ -1644,6 +1711,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1644 1711
1645 BUG_ON(pmd_huge(*pmd)); 1712 BUG_ON(pmd_huge(*pmd));
1646 1713
1714 arch_enter_lazy_mmu_mode();
1715
1647 token = pmd_pgtable(*pmd); 1716 token = pmd_pgtable(*pmd);
1648 1717
1649 do { 1718 do {
@@ -1652,6 +1721,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
1652 break; 1721 break;
1653 } while (pte++, addr += PAGE_SIZE, addr != end); 1722 } while (pte++, addr += PAGE_SIZE, addr != end);
1654 1723
1724 arch_leave_lazy_mmu_mode();
1725
1655 if (mm != &init_mm) 1726 if (mm != &init_mm)
1656 pte_unmap_unlock(pte-1, ptl); 1727 pte_unmap_unlock(pte-1, ptl);
1657 return err; 1728 return err;
@@ -1837,10 +1908,21 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1837 * not dirty accountable. 1908 * not dirty accountable.
1838 */ 1909 */
1839 if (PageAnon(old_page)) { 1910 if (PageAnon(old_page)) {
1840 if (trylock_page(old_page)) { 1911 if (!trylock_page(old_page)) {
1841 reuse = can_share_swap_page(old_page); 1912 page_cache_get(old_page);
1842 unlock_page(old_page); 1913 pte_unmap_unlock(page_table, ptl);
1914 lock_page(old_page);
1915 page_table = pte_offset_map_lock(mm, pmd, address,
1916 &ptl);
1917 if (!pte_same(*page_table, orig_pte)) {
1918 unlock_page(old_page);
1919 page_cache_release(old_page);
1920 goto unlock;
1921 }
1922 page_cache_release(old_page);
1843 } 1923 }
1924 reuse = reuse_swap_page(old_page);
1925 unlock_page(old_page);
1844 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == 1926 } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
1845 (VM_WRITE|VM_SHARED))) { 1927 (VM_WRITE|VM_SHARED))) {
1846 /* 1928 /*
@@ -1918,7 +2000,7 @@ gotten:
1918 cow_user_page(new_page, old_page, address, vma); 2000 cow_user_page(new_page, old_page, address, vma);
1919 __SetPageUptodate(new_page); 2001 __SetPageUptodate(new_page);
1920 2002
1921 if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) 2003 if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
1922 goto oom_free_new; 2004 goto oom_free_new;
1923 2005
1924 /* 2006 /*
@@ -1943,11 +2025,7 @@ gotten:
1943 * thread doing COW. 2025 * thread doing COW.
1944 */ 2026 */
1945 ptep_clear_flush_notify(vma, address, page_table); 2027 ptep_clear_flush_notify(vma, address, page_table);
1946 SetPageSwapBacked(new_page);
1947 lru_cache_add_active_or_unevictable(new_page, vma);
1948 page_add_new_anon_rmap(new_page, vma, address); 2028 page_add_new_anon_rmap(new_page, vma, address);
1949
1950//TODO: is this safe? do_anonymous_page() does it this way.
1951 set_pte_at(mm, address, page_table, entry); 2029 set_pte_at(mm, address, page_table, entry);
1952 update_mmu_cache(vma, address, entry); 2030 update_mmu_cache(vma, address, entry);
1953 if (old_page) { 2031 if (old_page) {
@@ -1973,7 +2051,7 @@ gotten:
1973 * mapcount is visible. So transitively, TLBs to 2051 * mapcount is visible. So transitively, TLBs to
1974 * old page will be flushed before it can be reused. 2052 * old page will be flushed before it can be reused.
1975 */ 2053 */
1976 page_remove_rmap(old_page, vma); 2054 page_remove_rmap(old_page);
1977 } 2055 }
1978 2056
1979 /* Free the old page.. */ 2057 /* Free the old page.. */
@@ -2314,6 +2392,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2314 struct page *page; 2392 struct page *page;
2315 swp_entry_t entry; 2393 swp_entry_t entry;
2316 pte_t pte; 2394 pte_t pte;
2395 struct mem_cgroup *ptr = NULL;
2317 int ret = 0; 2396 int ret = 0;
2318 2397
2319 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2398 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2352,7 +2431,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2352 lock_page(page); 2431 lock_page(page);
2353 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2432 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2354 2433
2355 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2434 if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
2356 ret = VM_FAULT_OOM; 2435 ret = VM_FAULT_OOM;
2357 unlock_page(page); 2436 unlock_page(page);
2358 goto out; 2437 goto out;
@@ -2370,22 +2449,35 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2370 goto out_nomap; 2449 goto out_nomap;
2371 } 2450 }
2372 2451
2373 /* The page isn't present yet, go ahead with the fault. */ 2452 /*
2453 * The page isn't present yet, go ahead with the fault.
2454 *
2455 * Be careful about the sequence of operations here.
2456 * To get its accounting right, reuse_swap_page() must be called
2457 * while the page is counted on swap but not yet in mapcount i.e.
2458 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
2459 * must be called after the swap_free(), or it will never succeed.
 2460	 * Because delete_from_swap_cache() may be called by reuse_swap_page(),
2461 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
2462 * in page->private. In this case, a record in swap_cgroup is silently
2463 * discarded at swap_free().
2464 */
2374 2465
2375 inc_mm_counter(mm, anon_rss); 2466 inc_mm_counter(mm, anon_rss);
2376 pte = mk_pte(page, vma->vm_page_prot); 2467 pte = mk_pte(page, vma->vm_page_prot);
2377 if (write_access && can_share_swap_page(page)) { 2468 if (write_access && reuse_swap_page(page)) {
2378 pte = maybe_mkwrite(pte_mkdirty(pte), vma); 2469 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
2379 write_access = 0; 2470 write_access = 0;
2380 } 2471 }
2381
2382 flush_icache_page(vma, page); 2472 flush_icache_page(vma, page);
2383 set_pte_at(mm, address, page_table, pte); 2473 set_pte_at(mm, address, page_table, pte);
2384 page_add_anon_rmap(page, vma, address); 2474 page_add_anon_rmap(page, vma, address);
2475 /* It's better to call commit-charge after rmap is established */
2476 mem_cgroup_commit_charge_swapin(page, ptr);
2385 2477
2386 swap_free(entry); 2478 swap_free(entry);
2387 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) 2479 if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
2388 remove_exclusive_swap_page(page); 2480 try_to_free_swap(page);
2389 unlock_page(page); 2481 unlock_page(page);
2390 2482
2391 if (write_access) { 2483 if (write_access) {
@@ -2402,7 +2494,7 @@ unlock:
2402out: 2494out:
2403 return ret; 2495 return ret;
2404out_nomap: 2496out_nomap:
2405 mem_cgroup_uncharge_page(page); 2497 mem_cgroup_cancel_charge_swapin(ptr);
2406 pte_unmap_unlock(page_table, ptl); 2498 pte_unmap_unlock(page_table, ptl);
2407 unlock_page(page); 2499 unlock_page(page);
2408 page_cache_release(page); 2500 page_cache_release(page);
@@ -2432,7 +2524,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2432 goto oom; 2524 goto oom;
2433 __SetPageUptodate(page); 2525 __SetPageUptodate(page);
2434 2526
2435 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) 2527 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
2436 goto oom_free_page; 2528 goto oom_free_page;
2437 2529
2438 entry = mk_pte(page, vma->vm_page_prot); 2530 entry = mk_pte(page, vma->vm_page_prot);
@@ -2442,8 +2534,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
2442 if (!pte_none(*page_table)) 2534 if (!pte_none(*page_table))
2443 goto release; 2535 goto release;
2444 inc_mm_counter(mm, anon_rss); 2536 inc_mm_counter(mm, anon_rss);
2445 SetPageSwapBacked(page);
2446 lru_cache_add_active_or_unevictable(page, vma);
2447 page_add_new_anon_rmap(page, vma, address); 2537 page_add_new_anon_rmap(page, vma, address);
2448 set_pte_at(mm, address, page_table, entry); 2538 set_pte_at(mm, address, page_table, entry);
2449 2539
@@ -2525,7 +2615,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2525 ret = VM_FAULT_OOM; 2615 ret = VM_FAULT_OOM;
2526 goto out; 2616 goto out;
2527 } 2617 }
2528 if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { 2618 if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
2529 ret = VM_FAULT_OOM; 2619 ret = VM_FAULT_OOM;
2530 page_cache_release(page); 2620 page_cache_release(page);
2531 goto out; 2621 goto out;
@@ -2591,8 +2681,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2591 entry = maybe_mkwrite(pte_mkdirty(entry), vma); 2681 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
2592 if (anon) { 2682 if (anon) {
2593 inc_mm_counter(mm, anon_rss); 2683 inc_mm_counter(mm, anon_rss);
2594 SetPageSwapBacked(page);
2595 lru_cache_add_active_or_unevictable(page, vma);
2596 page_add_new_anon_rmap(page, vma, address); 2684 page_add_new_anon_rmap(page, vma, address);
2597 } else { 2685 } else {
2598 inc_mm_counter(mm, file_rss); 2686 inc_mm_counter(mm, file_rss);
@@ -2602,7 +2690,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2602 get_page(dirty_page); 2690 get_page(dirty_page);
2603 } 2691 }
2604 } 2692 }
2605//TODO: is this safe? do_anonymous_page() does it this way.
2606 set_pte_at(mm, address, page_table, entry); 2693 set_pte_at(mm, address, page_table, entry);
2607 2694
2608 /* no need to invalidate: a not-present page won't be cached */ 2695 /* no need to invalidate: a not-present page won't be cached */
@@ -2666,12 +2753,11 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2666 if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) 2753 if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
2667 return 0; 2754 return 0;
2668 2755
2669 if (unlikely(!(vma->vm_flags & VM_NONLINEAR) || 2756 if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
2670 !(vma->vm_flags & VM_CAN_NONLINEAR))) {
2671 /* 2757 /*
2672 * Page table corrupted: show pte and kill process. 2758 * Page table corrupted: show pte and kill process.
2673 */ 2759 */
2674 print_bad_pte(vma, orig_pte, address); 2760 print_bad_pte(vma, address, orig_pte, NULL);
2675 return VM_FAULT_OOM; 2761 return VM_FAULT_OOM;
2676 } 2762 }
2677 2763
@@ -2953,7 +3039,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
2953{ 3039{
2954 resource_size_t phys_addr; 3040 resource_size_t phys_addr;
2955 unsigned long prot = 0; 3041 unsigned long prot = 0;
2956 void *maddr; 3042 void __iomem *maddr;
2957 int offset = addr & (PAGE_SIZE-1); 3043 int offset = addr & (PAGE_SIZE-1);
2958 3044
2959 if (follow_phys(vma, addr, write, &prot, &phys_addr)) 3045 if (follow_phys(vma, addr, write, &prot, &phys_addr))
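The memory.c hunks above replace the single mem_cgroup_charge() call on the swap-in path with a prepare/commit/cancel sequence: the charge is committed only once the rmap has been established, and cancelled on the out_nomap error path. A minimal user-space sketch of that two-phase accounting pattern, with invented names (charge_prepare()/charge_commit()/charge_cancel() and struct counter are illustrative, not kernel interfaces):

#include <stdbool.h>
#include <stdio.h>

/* Illustrative two-phase accounting: reserve first, commit only once the
 * object is fully wired up, cancel if anything in between fails. */
struct counter { long used, limit; };

static bool charge_prepare(struct counter *c, long n)
{
	if (c->used + n > c->limit)
		return false;		/* would exceed the limit: fail early */
	c->used += n;			/* provisionally account the charge */
	return true;
}

static void charge_commit(struct counter *c, long n)
{
	(void)c; (void)n;		/* nothing to do: the reservation stands */
}

static void charge_cancel(struct counter *c, long n)
{
	c->used -= n;			/* roll back the provisional reservation */
}

int main(void)
{
	struct counter memcg = { .used = 0, .limit = 4 };

	if (!charge_prepare(&memcg, 1))
		return 1;

	bool mapping_ok = true;		/* stands in for "rmap established" */
	if (mapping_ok)
		charge_commit(&memcg, 1);
	else
		charge_cancel(&memcg, 1);

	printf("used %ld of %ld\n", memcg.used, memcg.limit);
	return 0;
}

The point of the split, as the added comment in the hunk says, is that the commit step can be ordered after the rmap is in place while the error path still has a cheap way to back out.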
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b17371185468..c083cf5fd6df 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -216,7 +216,8 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
216 return 0; 216 return 0;
217} 217}
218 218
219static int __meminit __add_section(struct zone *zone, unsigned long phys_start_pfn) 219static int __meminit __add_section(int nid, struct zone *zone,
220 unsigned long phys_start_pfn)
220{ 221{
221 int nr_pages = PAGES_PER_SECTION; 222 int nr_pages = PAGES_PER_SECTION;
222 int ret; 223 int ret;
@@ -234,7 +235,7 @@ static int __meminit __add_section(struct zone *zone, unsigned long phys_start_p
234 if (ret < 0) 235 if (ret < 0)
235 return ret; 236 return ret;
236 237
237 return register_new_memory(__pfn_to_section(phys_start_pfn)); 238 return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
238} 239}
239 240
240#ifdef CONFIG_SPARSEMEM_VMEMMAP 241#ifdef CONFIG_SPARSEMEM_VMEMMAP
@@ -273,8 +274,8 @@ static int __remove_section(struct zone *zone, struct mem_section *ms)
273 * call this function after deciding the zone to which to 274 * call this function after deciding the zone to which to
274 * add the new pages. 275 * add the new pages.
275 */ 276 */
276int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn, 277int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
277 unsigned long nr_pages) 278 unsigned long nr_pages)
278{ 279{
279 unsigned long i; 280 unsigned long i;
280 int err = 0; 281 int err = 0;
@@ -284,7 +285,7 @@ int __ref __add_pages(struct zone *zone, unsigned long phys_start_pfn,
284 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1); 285 end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
285 286
286 for (i = start_sec; i <= end_sec; i++) { 287 for (i = start_sec; i <= end_sec; i++) {
287 err = __add_section(zone, i << PFN_SECTION_SHIFT); 288 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
288 289
289 /* 290 /*
290 * EEXIST is finally dealt with by ioresource collision 291 * EEXIST is finally dealt with by ioresource collision
@@ -626,15 +627,12 @@ int scan_lru_pages(unsigned long start, unsigned long end)
626} 627}
627 628
628static struct page * 629static struct page *
629hotremove_migrate_alloc(struct page *page, 630hotremove_migrate_alloc(struct page *page, unsigned long private, int **x)
630 unsigned long private,
631 int **x)
632{ 631{
633 /* This should be improoooooved!! */ 632 /* This should be improooooved!! */
634 return alloc_page(GFP_HIGHUSER_PAGECACHE); 633 return alloc_page(GFP_HIGHUSER_MOVABLE);
635} 634}
636 635
637
638#define NR_OFFLINE_AT_ONCE_PAGES (256) 636#define NR_OFFLINE_AT_ONCE_PAGES (256)
639static int 637static int
640do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) 638do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
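__add_pages() above now threads the node id down to __add_section() while still walking the hotplugged range one memory section at a time. A small stand-alone sketch of the pfn-to-section arithmetic that loop depends on; the section size used here (2^15 pages) is an assumption for illustration, since the real PFN_SECTION_SHIFT is architecture-specific:

#include <stdio.h>

/* Assumed value for illustration only; the kernel derives this per-arch. */
#define PFN_SECTION_SHIFT 15UL
#define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT)

static unsigned long pfn_to_section_nr(unsigned long pfn)
{
	return pfn >> PFN_SECTION_SHIFT;
}

int main(void)
{
	unsigned long phys_start_pfn = 0x12000, nr_pages = 3 * PAGES_PER_SECTION;
	unsigned long start_sec = pfn_to_section_nr(phys_start_pfn);
	unsigned long end_sec   = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);

	/* one "__add_section" per covered section, as in __add_pages() */
	for (unsigned long i = start_sec; i <= end_sec; i++)
		printf("add section %lu starting at pfn %#lx\n",
		       i, i << PFN_SECTION_SHIFT);
	return 0;
}

Note how an unaligned start pfn can push the range into one extra section, which is why the loop is inclusive of end_sec.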
diff --git a/mm/migrate.c b/mm/migrate.c
index 21631ab8c08b..a30ea5fcf9f1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -121,20 +121,6 @@ static void remove_migration_pte(struct vm_area_struct *vma,
121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) 121 if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old)
122 goto out; 122 goto out;
123 123
124 /*
125 * Yes, ignore the return value from a GFP_ATOMIC mem_cgroup_charge.
126 * Failure is not an option here: we're now expected to remove every
127 * migration pte, and will cause crashes otherwise. Normally this
128 * is not an issue: mem_cgroup_prepare_migration bumped up the old
129 * page_cgroup count for safety, that's now attached to the new page,
130 * so this charge should just be another incrementation of the count,
131 * to keep in balance with rmap.c's mem_cgroup_uncharging. But if
132 * there's been a force_empty, those reference counts may no longer
133 * be reliable, and this charge can actually fail: oh well, we don't
134 * make the situation any worse by proceeding as if it had succeeded.
135 */
136 mem_cgroup_charge(new, mm, GFP_ATOMIC);
137
138 get_page(new); 124 get_page(new);
139 pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); 125 pte = pte_mkold(mk_pte(new, vma->vm_page_prot));
140 if (is_write_migration_entry(entry)) 126 if (is_write_migration_entry(entry))
@@ -300,12 +286,10 @@ static int migrate_page_move_mapping(struct address_space *mapping,
300 * Now we know that no one else is looking at the page. 286 * Now we know that no one else is looking at the page.
301 */ 287 */
302 get_page(newpage); /* add cache reference */ 288 get_page(newpage); /* add cache reference */
303#ifdef CONFIG_SWAP
304 if (PageSwapCache(page)) { 289 if (PageSwapCache(page)) {
305 SetPageSwapCache(newpage); 290 SetPageSwapCache(newpage);
306 set_page_private(newpage, page_private(page)); 291 set_page_private(newpage, page_private(page));
307 } 292 }
308#endif
309 293
310 radix_tree_replace_slot(pslot, newpage); 294 radix_tree_replace_slot(pslot, newpage);
311 295
@@ -373,18 +357,13 @@ static void migrate_page_copy(struct page *newpage, struct page *page)
373 357
374 mlock_migrate_page(newpage, page); 358 mlock_migrate_page(newpage, page);
375 359
376#ifdef CONFIG_SWAP
377 ClearPageSwapCache(page); 360 ClearPageSwapCache(page);
378#endif
379 ClearPagePrivate(page); 361 ClearPagePrivate(page);
380 set_page_private(page, 0); 362 set_page_private(page, 0);
381 /* page->mapping contains a flag for PageAnon() */ 363 /* page->mapping contains a flag for PageAnon() */
382 anon = PageAnon(page); 364 anon = PageAnon(page);
383 page->mapping = NULL; 365 page->mapping = NULL;
384 366
385 if (!anon) /* This page was removed from radix-tree. */
386 mem_cgroup_uncharge_cache_page(page);
387
388 /* 367 /*
389 * If any waiters have accumulated on the new page then 368 * If any waiters have accumulated on the new page then
390 * wake them up. 369 * wake them up.
@@ -618,6 +597,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
618 struct page *newpage = get_new_page(page, private, &result); 597 struct page *newpage = get_new_page(page, private, &result);
619 int rcu_locked = 0; 598 int rcu_locked = 0;
620 int charge = 0; 599 int charge = 0;
600 struct mem_cgroup *mem;
621 601
622 if (!newpage) 602 if (!newpage)
623 return -ENOMEM; 603 return -ENOMEM;
@@ -627,24 +607,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
627 goto move_newpage; 607 goto move_newpage;
628 } 608 }
629 609
630 charge = mem_cgroup_prepare_migration(page, newpage);
631 if (charge == -ENOMEM) {
632 rc = -ENOMEM;
633 goto move_newpage;
634 }
635 /* prepare cgroup just returns 0 or -ENOMEM */ 610 /* prepare cgroup just returns 0 or -ENOMEM */
636 BUG_ON(charge);
637
638 rc = -EAGAIN; 611 rc = -EAGAIN;
612
639 if (!trylock_page(page)) { 613 if (!trylock_page(page)) {
640 if (!force) 614 if (!force)
641 goto move_newpage; 615 goto move_newpage;
642 lock_page(page); 616 lock_page(page);
643 } 617 }
644 618
619 /* charge against new page */
620 charge = mem_cgroup_prepare_migration(page, &mem);
621 if (charge == -ENOMEM) {
622 rc = -ENOMEM;
623 goto unlock;
624 }
625 BUG_ON(charge);
626
645 if (PageWriteback(page)) { 627 if (PageWriteback(page)) {
646 if (!force) 628 if (!force)
647 goto unlock; 629 goto uncharge;
648 wait_on_page_writeback(page); 630 wait_on_page_writeback(page);
649 } 631 }
650 /* 632 /*
@@ -697,7 +679,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
697rcu_unlock: 679rcu_unlock:
698 if (rcu_locked) 680 if (rcu_locked)
699 rcu_read_unlock(); 681 rcu_read_unlock();
700 682uncharge:
683 if (!charge)
684 mem_cgroup_end_migration(mem, page, newpage);
701unlock: 685unlock:
702 unlock_page(page); 686 unlock_page(page);
703 687
@@ -713,8 +697,6 @@ unlock:
713 } 697 }
714 698
715move_newpage: 699move_newpage:
716 if (!charge)
717 mem_cgroup_end_migration(newpage);
718 700
719 /* 701 /*
720 * Move the new page to the LRU. If migration was not successful 702 * Move the new page to the LRU. If migration was not successful
@@ -848,12 +830,6 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
848 struct vm_area_struct *vma; 830 struct vm_area_struct *vma;
849 struct page *page; 831 struct page *page;
850 832
851 /*
852 * A valid page pointer that will not match any of the
853 * pages that will be moved.
854 */
855 pp->page = ZERO_PAGE(0);
856
857 err = -EFAULT; 833 err = -EFAULT;
858 vma = find_vma(mm, pp->addr); 834 vma = find_vma(mm, pp->addr);
859 if (!vma || !vma_migratable(vma)) 835 if (!vma || !vma_migratable(vma))
@@ -919,41 +895,43 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
919 const int __user *nodes, 895 const int __user *nodes,
920 int __user *status, int flags) 896 int __user *status, int flags)
921{ 897{
922 struct page_to_node *pm = NULL; 898 struct page_to_node *pm;
923 nodemask_t task_nodes; 899 nodemask_t task_nodes;
924 int err = 0; 900 unsigned long chunk_nr_pages;
925 int i; 901 unsigned long chunk_start;
902 int err;
926 903
927 task_nodes = cpuset_mems_allowed(task); 904 task_nodes = cpuset_mems_allowed(task);
928 905
929 /* Limit nr_pages so that the multiplication may not overflow */ 906 err = -ENOMEM;
930 if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { 907 pm = (struct page_to_node *)__get_free_page(GFP_KERNEL);
931 err = -E2BIG; 908 if (!pm)
932 goto out;
933 }
934
935 pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node));
936 if (!pm) {
937 err = -ENOMEM;
938 goto out; 909 goto out;
939 }
940
941 /* 910 /*
942 * Get parameters from user space and initialize the pm 911 * Store a chunk of page_to_node array in a page,
943 * array. Return various errors if the user did something wrong. 912 * but keep the last one as a marker
944 */ 913 */
945 for (i = 0; i < nr_pages; i++) { 914 chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1;
946 const void __user *p;
947 915
948 err = -EFAULT; 916 for (chunk_start = 0;
949 if (get_user(p, pages + i)) 917 chunk_start < nr_pages;
950 goto out_pm; 918 chunk_start += chunk_nr_pages) {
919 int j;
920
921 if (chunk_start + chunk_nr_pages > nr_pages)
922 chunk_nr_pages = nr_pages - chunk_start;
951 923
952 pm[i].addr = (unsigned long)p; 924 /* fill the chunk pm with addrs and nodes from user-space */
953 if (nodes) { 925 for (j = 0; j < chunk_nr_pages; j++) {
926 const void __user *p;
954 int node; 927 int node;
955 928
956 if (get_user(node, nodes + i)) 929 err = -EFAULT;
930 if (get_user(p, pages + j + chunk_start))
931 goto out_pm;
932 pm[j].addr = (unsigned long) p;
933
934 if (get_user(node, nodes + j + chunk_start))
957 goto out_pm; 935 goto out_pm;
958 936
959 err = -ENODEV; 937 err = -ENODEV;
@@ -964,22 +942,29 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task,
964 if (!node_isset(node, task_nodes)) 942 if (!node_isset(node, task_nodes))
965 goto out_pm; 943 goto out_pm;
966 944
967 pm[i].node = node; 945 pm[j].node = node;
968 } else 946 }
969 pm[i].node = 0; /* anything to not match MAX_NUMNODES */ 947
970 } 948 /* End marker for this chunk */
971 /* End marker */ 949 pm[chunk_nr_pages].node = MAX_NUMNODES;
972 pm[nr_pages].node = MAX_NUMNODES; 950
951 /* Migrate this chunk */
952 err = do_move_page_to_node_array(mm, pm,
953 flags & MPOL_MF_MOVE_ALL);
954 if (err < 0)
955 goto out_pm;
973 956
974 err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL);
975 if (err >= 0)
976 /* Return status information */ 957 /* Return status information */
977 for (i = 0; i < nr_pages; i++) 958 for (j = 0; j < chunk_nr_pages; j++)
978 if (put_user(pm[i].status, status + i)) 959 if (put_user(pm[j].status, status + j + chunk_start)) {
979 err = -EFAULT; 960 err = -EFAULT;
961 goto out_pm;
962 }
963 }
964 err = 0;
980 965
981out_pm: 966out_pm:
982 vfree(pm); 967 free_page((unsigned long)pm);
983out: 968out:
984 return err; 969 return err;
985} 970}
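The rewritten do_pages_move() above stages the user-supplied page and node arrays through a single page of scratch space, keeping the final slot as an end marker, instead of vmalloc()ing room for every entry at once. A user-space sketch of that chunking pattern; process_chunk(), BUF_BYTES and the fabricated addresses are illustrative stand-ins, not kernel code:

#include <stdio.h>
#include <stddef.h>

struct page_to_node { unsigned long addr; int node; int status; };

#define BUF_BYTES 4096	/* stands in for one page of scratch space */

static void process_chunk(struct page_to_node *pm, size_t n)
{
	/* stands in for do_move_page_to_node_array(): fill in status */
	for (size_t j = 0; j < n; j++)
		pm[j].status = 0;
}

int main(void)
{
	struct page_to_node pm[BUF_BYTES / sizeof(struct page_to_node)];
	/* keep the last slot free for an end marker, as the kernel does */
	const size_t chunk_max = BUF_BYTES / sizeof(struct page_to_node) - 1;
	size_t nr_pages = 10000;

	for (size_t start = 0; start < nr_pages; start += chunk_max) {
		size_t n = nr_pages - start;
		if (n > chunk_max)
			n = chunk_max;

		/* here the kernel copies addrs/nodes in from user space */
		for (size_t j = 0; j < n; j++)
			pm[j].addr = (start + j) * 4096UL;

		pm[n].node = -1;	/* end marker (MAX_NUMNODES in the kernel) */
		process_chunk(pm, n);

		/* ... and copies pm[j].status back out to user space here */
	}
	printf("processed %zu entries in chunks of %zu\n", nr_pages, chunk_max);
	return 0;
}

With page-sized chunks the kernel-side allocation stays bounded no matter how many pages the caller asks to move, which is what lets the E2BIG limit check disappear from the hunk.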
diff --git a/mm/mlock.c b/mm/mlock.c
index 3035a56e7616..e125156c664e 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -173,12 +173,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
173 (atomic_read(&mm->mm_users) != 0)); 173 (atomic_read(&mm->mm_users) != 0));
174 174
175 /* 175 /*
176 * mlock: don't page populate if page has PROT_NONE permission. 176 * mlock: don't page populate if vma has PROT_NONE permission.
177 * munlock: the pages always do munlock althrough 177 * munlock: always do munlock although the vma has PROT_NONE
178 * its has PROT_NONE permission. 178 * permission, or SIGKILL is pending.
179 */ 179 */
180 if (!mlock) 180 if (!mlock)
181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; 181 gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS |
182 GUP_FLAGS_IGNORE_SIGKILL;
182 183
183 if (vma->vm_flags & VM_WRITE) 184 if (vma->vm_flags & VM_WRITE)
184 gup_flags |= GUP_FLAGS_WRITE; 185 gup_flags |= GUP_FLAGS_WRITE;
diff --git a/mm/mmap.c b/mm/mmap.c
index 2c778fcfd9bd..749623196cb9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -413,7 +413,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
413 413
414static void __vma_link_file(struct vm_area_struct *vma) 414static void __vma_link_file(struct vm_area_struct *vma)
415{ 415{
416 struct file * file; 416 struct file *file;
417 417
418 file = vma->vm_file; 418 file = vma->vm_file;
419 if (file) { 419 if (file) {
@@ -474,11 +474,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
474 * insert vm structure into list and rbtree and anon_vma, 474 * insert vm structure into list and rbtree and anon_vma,
475 * but it has already been inserted into prio_tree earlier. 475 * but it has already been inserted into prio_tree earlier.
476 */ 476 */
477static void 477static void __insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
478__insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
479{ 478{
480 struct vm_area_struct * __vma, * prev; 479 struct vm_area_struct *__vma, *prev;
481 struct rb_node ** rb_link, * rb_parent; 480 struct rb_node **rb_link, *rb_parent;
482 481
483 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent); 482 __vma = find_vma_prepare(mm, vma->vm_start,&prev, &rb_link, &rb_parent);
484 BUG_ON(__vma && __vma->vm_start < vma->vm_end); 483 BUG_ON(__vma && __vma->vm_start < vma->vm_end);
@@ -908,7 +907,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags,
908 * The caller must hold down_write(current->mm->mmap_sem). 907 * The caller must hold down_write(current->mm->mmap_sem).
909 */ 908 */
910 909
911unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, 910unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
912 unsigned long len, unsigned long prot, 911 unsigned long len, unsigned long prot,
913 unsigned long flags, unsigned long pgoff) 912 unsigned long flags, unsigned long pgoff)
914{ 913{
@@ -1464,7 +1463,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
1464EXPORT_SYMBOL(get_unmapped_area); 1463EXPORT_SYMBOL(get_unmapped_area);
1465 1464
1466/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */ 1465/* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
1467struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long addr) 1466struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
1468{ 1467{
1469 struct vm_area_struct *vma = NULL; 1468 struct vm_area_struct *vma = NULL;
1470 1469
@@ -1507,7 +1506,7 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
1507 struct vm_area_struct **pprev) 1506 struct vm_area_struct **pprev)
1508{ 1507{
1509 struct vm_area_struct *vma = NULL, *prev = NULL; 1508 struct vm_area_struct *vma = NULL, *prev = NULL;
1510 struct rb_node * rb_node; 1509 struct rb_node *rb_node;
1511 if (!mm) 1510 if (!mm)
1512 goto out; 1511 goto out;
1513 1512
@@ -1541,7 +1540,7 @@ out:
1541 * update accounting. This is shared with both the 1540 * update accounting. This is shared with both the
1542 * grow-up and grow-down cases. 1541 * grow-up and grow-down cases.
1543 */ 1542 */
1544static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, unsigned long grow) 1543static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
1545{ 1544{
1546 struct mm_struct *mm = vma->vm_mm; 1545 struct mm_struct *mm = vma->vm_mm;
1547 struct rlimit *rlim = current->signal->rlim; 1546 struct rlimit *rlim = current->signal->rlim;
@@ -2091,6 +2090,9 @@ void exit_mmap(struct mm_struct *mm)
2091 arch_exit_mmap(mm); 2090 arch_exit_mmap(mm);
2092 mmu_notifier_release(mm); 2091 mmu_notifier_release(mm);
2093 2092
2093 if (!mm->mmap) /* Can happen if dup_mmap() received an OOM */
2094 return;
2095
2094 if (mm->locked_vm) { 2096 if (mm->locked_vm) {
2095 vma = mm->mmap; 2097 vma = mm->mmap;
2096 while (vma) { 2098 while (vma) {
@@ -2103,7 +2105,7 @@ void exit_mmap(struct mm_struct *mm)
2103 lru_add_drain(); 2105 lru_add_drain();
2104 flush_cache_mm(mm); 2106 flush_cache_mm(mm);
2105 tlb = tlb_gather_mmu(mm, 1); 2107 tlb = tlb_gather_mmu(mm, 1);
2106 /* Don't update_hiwater_rss(mm) here, do_exit already did */ 2108 /* update_hiwater_rss(mm) here? but nobody should be looking */
2107 /* Use -1 here to ensure all VMAs in the mm are unmapped */ 2109 /* Use -1 here to ensure all VMAs in the mm are unmapped */
2108 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); 2110 end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL);
2109 vm_unacct_memory(nr_accounted); 2111 vm_unacct_memory(nr_accounted);
@@ -2470,3 +2472,13 @@ void mm_drop_all_locks(struct mm_struct *mm)
2470 2472
2471 mutex_unlock(&mm_all_locks_mutex); 2473 mutex_unlock(&mm_all_locks_mutex);
2472} 2474}
2475
2476/*
2477 * initialise the VMA slab
2478 */
2479void __init mmap_init(void)
2480{
2481 vm_area_cachep = kmem_cache_create("vm_area_struct",
2482 sizeof(struct vm_area_struct), 0,
2483 SLAB_PANIC, NULL);
2484}
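The new mmap_init() above creates a dedicated slab cache for vm_area_struct objects. As a very loose user-space analogue (not the kernel slab allocator), here is a toy fixed-size object cache built on a free list; obj_cache_init()/obj_alloc()/obj_free() and struct vma_like are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

/* Toy fixed-size object cache: keep freed objects on a free list so that
 * frequent alloc/free of same-sized objects avoids hitting malloc(). */
struct obj_cache {
	size_t size;
	void *free_list;	/* singly linked through the objects themselves */
};

static void obj_cache_init(struct obj_cache *c, size_t size)
{
	c->size = size < sizeof(void *) ? sizeof(void *) : size;
	c->free_list = NULL;
}

static void *obj_alloc(struct obj_cache *c)
{
	if (c->free_list) {
		void *obj = c->free_list;
		c->free_list = *(void **)obj;	/* pop from the free list */
		return obj;
	}
	return malloc(c->size);			/* cache miss: fall back to malloc */
}

static void obj_free(struct obj_cache *c, void *obj)
{
	*(void **)obj = c->free_list;		/* push back onto the free list */
	c->free_list = obj;
}

struct vma_like { unsigned long start, end, flags; };

int main(void)
{
	struct obj_cache cache;
	obj_cache_init(&cache, sizeof(struct vma_like));

	struct vma_like *v = obj_alloc(&cache);
	if (!v)
		return 1;
	v->start = 0x1000; v->end = 0x2000; v->flags = 0;
	printf("vma %#lx-%#lx\n", v->start, v->end);

	obj_free(&cache, v);			/* recycled, not returned to malloc */
	struct vma_like *w = obj_alloc(&cache);
	printf("reused object: %s\n", w == v ? "yes" : "no");

	obj_free(&cache, w);
	return 0;
}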
diff --git a/mm/mprotect.c b/mm/mprotect.c
index cfb4c4852062..d0f6e7ce09f1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -22,6 +22,7 @@
22#include <linux/swap.h> 22#include <linux/swap.h>
23#include <linux/swapops.h> 23#include <linux/swapops.h>
24#include <linux/mmu_notifier.h> 24#include <linux/mmu_notifier.h>
25#include <linux/migrate.h>
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26#include <asm/pgtable.h> 27#include <asm/pgtable.h>
27#include <asm/cacheflush.h> 28#include <asm/cacheflush.h>
@@ -59,8 +60,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
59 ptent = pte_mkwrite(ptent); 60 ptent = pte_mkwrite(ptent);
60 61
61 ptep_modify_prot_commit(mm, addr, pte, ptent); 62 ptep_modify_prot_commit(mm, addr, pte, ptent);
62#ifdef CONFIG_MIGRATION 63 } else if (PAGE_MIGRATION && !pte_file(oldpte)) {
63 } else if (!pte_file(oldpte)) {
64 swp_entry_t entry = pte_to_swp_entry(oldpte); 64 swp_entry_t entry = pte_to_swp_entry(oldpte);
65 65
66 if (is_write_migration_entry(entry)) { 66 if (is_write_migration_entry(entry)) {
@@ -72,9 +72,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd,
72 set_pte_at(mm, addr, pte, 72 set_pte_at(mm, addr, pte,
73 swp_entry_to_pte(entry)); 73 swp_entry_to_pte(entry));
74 } 74 }
75#endif
76 } 75 }
77
78 } while (pte++, addr += PAGE_SIZE, addr != end); 76 } while (pte++, addr += PAGE_SIZE, addr != end);
79 arch_leave_lazy_mmu_mode(); 77 arch_leave_lazy_mmu_mode();
80 pte_unmap_unlock(pte - 1, ptl); 78 pte_unmap_unlock(pte - 1, ptl);
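The mprotect.c hunk swaps an #ifdef CONFIG_MIGRATION block for a test on the constant PAGE_MIGRATION inside an ordinary if: the branch is always parsed and type-checked, but folds away when the feature is compiled out. A tiny stand-alone illustration of that idiom; FEATURE_ENABLED and handle_feature() are placeholders, not kernel symbols:

#include <stdio.h>

/* In the kernel this constant would be derived from the configuration
 * (PAGE_MIGRATION in the hunk above); assumed here for illustration. */
#define FEATURE_ENABLED 0

static void handle_feature(int value)
{
	printf("feature path, value=%d\n", value);
}

int main(void)
{
	int value = 42;

	/* Unlike an #ifdef block, this branch is always compiled (so it
	 * cannot silently bit-rot), but with FEATURE_ENABLED == 0 the
	 * optimizer drops it entirely. */
	if (FEATURE_ENABLED && value > 0)
		handle_feature(value);
	else
		printf("feature compiled out or not applicable\n");

	return 0;
}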
diff --git a/mm/nommu.c b/mm/nommu.c
index 1c28ea3a4e9c..60ed8375c986 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -6,11 +6,11 @@
6 * 6 *
7 * See Documentation/nommu-mmap.txt 7 * See Documentation/nommu-mmap.txt
8 * 8 *
9 * Copyright (c) 2004-2005 David Howells <dhowells@redhat.com> 9 * Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> 10 * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> 11 * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org>
12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> 12 * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com>
13 * Copyright (c) 2007 Paul Mundt <lethal@linux-sh.org> 13 * Copyright (c) 2007-2008 Paul Mundt <lethal@linux-sh.org>
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
@@ -33,6 +33,28 @@
33#include <asm/uaccess.h> 33#include <asm/uaccess.h>
34#include <asm/tlb.h> 34#include <asm/tlb.h>
35#include <asm/tlbflush.h> 35#include <asm/tlbflush.h>
36#include "internal.h"
37
38static inline __attribute__((format(printf, 1, 2)))
39void no_printk(const char *fmt, ...)
40{
41}
42
43#if 0
44#define kenter(FMT, ...) \
45 printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
46#define kleave(FMT, ...) \
47 printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
48#define kdebug(FMT, ...) \
49 printk(KERN_DEBUG "xxx" FMT"yyy\n", ##__VA_ARGS__)
50#else
51#define kenter(FMT, ...) \
52 no_printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
53#define kleave(FMT, ...) \
54 no_printk(KERN_DEBUG "<== %s()"FMT"\n", __func__, ##__VA_ARGS__)
55#define kdebug(FMT, ...) \
56 no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__)
57#endif
36 58
37#include "internal.h" 59#include "internal.h"
38 60
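The block above introduces no_printk() so that the disabled kenter()/kleave()/kdebug() macros keep compile-time format checking while emitting nothing. A self-contained sketch of the same trick; dbg() and DEBUG_TRACING are illustrative names:

#include <stdio.h>

#define DEBUG_TRACING 0

/* Empty but format-checked: the attribute makes the compiler verify the
 * arguments against the format string even though nothing is printed. */
static inline __attribute__((format(printf, 1, 2)))
void no_printk(const char *fmt, ...)
{
	(void)fmt;
}

#if DEBUG_TRACING
#define dbg(FMT, ...) printf(FMT "\n", ##__VA_ARGS__)
#else
#define dbg(FMT, ...) no_printk(FMT "\n", ##__VA_ARGS__)
#endif

int main(void)
{
	int pages = 4;

	dbg("mapped %d pages", pages);	/* compiles down to nothing here ... */
	/* ... yet a mismatched format/argument list would still warn */
	printf("done\n");
	return 0;
}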
@@ -40,19 +62,22 @@ void *high_memory;
40struct page *mem_map; 62struct page *mem_map;
41unsigned long max_mapnr; 63unsigned long max_mapnr;
42unsigned long num_physpages; 64unsigned long num_physpages;
43unsigned long askedalloc, realalloc;
44atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0); 65atomic_long_t vm_committed_space = ATOMIC_LONG_INIT(0);
45int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ 66int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */
46int sysctl_overcommit_ratio = 50; /* default is 50% */ 67int sysctl_overcommit_ratio = 50; /* default is 50% */
47int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; 68int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
48int heap_stack_gap = 0; 70int heap_stack_gap = 0;
49 71
72atomic_t mmap_pages_allocated;
73
50EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
51EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
52 76
53/* list of shareable VMAs */ 77/* list of mapped, potentially shareable regions */
54struct rb_root nommu_vma_tree = RB_ROOT; 78static struct kmem_cache *vm_region_jar;
55DECLARE_RWSEM(nommu_vma_sem); 79struct rb_root nommu_region_tree = RB_ROOT;
80DECLARE_RWSEM(nommu_region_sem);
56 81
57struct vm_operations_struct generic_file_vm_ops = { 82struct vm_operations_struct generic_file_vm_ops = {
58}; 83};
@@ -124,6 +149,20 @@ unsigned int kobjsize(const void *objp)
124 return ksize(objp); 149 return ksize(objp);
125 150
126 /* 151 /*
152 * If it's not a compound page, see if we have a matching VMA
153 * region. This test is intentionally done in reverse order,
154 * so if there's no VMA, we still fall through and hand back
155 * PAGE_SIZE for 0-order pages.
156 */
157 if (!PageCompound(page)) {
158 struct vm_area_struct *vma;
159
160 vma = find_vma(current->mm, (unsigned long)objp);
161 if (vma)
162 return vma->vm_end - vma->vm_start;
163 }
164
165 /*
127 * The ksize() function is only guaranteed to work for pointers 166 * The ksize() function is only guaranteed to work for pointers
128 * returned by kmalloc(). So handle arbitrary pointers here. 167 * returned by kmalloc(). So handle arbitrary pointers here.
129 */ 168 */
@@ -401,129 +440,178 @@ asmlinkage unsigned long sys_brk(unsigned long brk)
401 return mm->brk = brk; 440 return mm->brk = brk;
402} 441}
403 442
404#ifdef DEBUG 443/*
405static void show_process_blocks(void) 444 * initialise the VMA and region record slabs
445 */
446void __init mmap_init(void)
406{ 447{
407 struct vm_list_struct *vml; 448 vm_region_jar = kmem_cache_create("vm_region_jar",
408 449 sizeof(struct vm_region), 0,
409 printk("Process blocks %d:", current->pid); 450 SLAB_PANIC, NULL);
410 451 vm_area_cachep = kmem_cache_create("vm_area_struct",
411 for (vml = &current->mm->context.vmlist; vml; vml = vml->next) { 452 sizeof(struct vm_area_struct), 0,
412 printk(" %p: %p", vml, vml->vma); 453 SLAB_PANIC, NULL);
413 if (vml->vma)
414 printk(" (%d @%lx #%d)",
415 kobjsize((void *) vml->vma->vm_start),
416 vml->vma->vm_start,
417 atomic_read(&vml->vma->vm_usage));
418 printk(vml->next ? " ->" : ".\n");
419 }
420} 454}
421#endif /* DEBUG */
422 455
423/* 456/*
424 * add a VMA into a process's mm_struct in the appropriate place in the list 457 * validate the region tree
425 * - should be called with mm->mmap_sem held writelocked 458 * - the caller must hold the region lock
426 */ 459 */
427static void add_vma_to_mm(struct mm_struct *mm, struct vm_list_struct *vml) 460#ifdef CONFIG_DEBUG_NOMMU_REGIONS
461static noinline void validate_nommu_regions(void)
428{ 462{
429 struct vm_list_struct **ppv; 463 struct vm_region *region, *last;
430 464 struct rb_node *p, *lastp;
431 for (ppv = &current->mm->context.vmlist; *ppv; ppv = &(*ppv)->next) 465
432 if ((*ppv)->vma->vm_start > vml->vma->vm_start) 466 lastp = rb_first(&nommu_region_tree);
433 break; 467 if (!lastp)
434 468 return;
435 vml->next = *ppv; 469
436 *ppv = vml; 470 last = rb_entry(lastp, struct vm_region, vm_rb);
471 if (unlikely(last->vm_end <= last->vm_start))
472 BUG();
473 if (unlikely(last->vm_top < last->vm_end))
474 BUG();
475
476 while ((p = rb_next(lastp))) {
477 region = rb_entry(p, struct vm_region, vm_rb);
478 last = rb_entry(lastp, struct vm_region, vm_rb);
479
480 if (unlikely(region->vm_end <= region->vm_start))
481 BUG();
482 if (unlikely(region->vm_top < region->vm_end))
483 BUG();
484 if (unlikely(region->vm_start < last->vm_top))
485 BUG();
486
487 lastp = p;
488 }
437} 489}
490#else
491#define validate_nommu_regions() do {} while(0)
492#endif
438 493
439/* 494/*
440 * look up the first VMA in which addr resides, NULL if none 495 * add a region into the global tree
441 * - should be called with mm->mmap_sem at least held readlocked
442 */ 496 */
443struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) 497static void add_nommu_region(struct vm_region *region)
444{ 498{
445 struct vm_list_struct *loop, *vml; 499 struct vm_region *pregion;
500 struct rb_node **p, *parent;
446 501
447 /* search the vm_start ordered list */ 502 validate_nommu_regions();
448 vml = NULL; 503
449 for (loop = mm->context.vmlist; loop; loop = loop->next) { 504 BUG_ON(region->vm_start & ~PAGE_MASK);
450 if (loop->vma->vm_start > addr) 505
451 break; 506 parent = NULL;
452 vml = loop; 507 p = &nommu_region_tree.rb_node;
508 while (*p) {
509 parent = *p;
510 pregion = rb_entry(parent, struct vm_region, vm_rb);
511 if (region->vm_start < pregion->vm_start)
512 p = &(*p)->rb_left;
513 else if (region->vm_start > pregion->vm_start)
514 p = &(*p)->rb_right;
515 else if (pregion == region)
516 return;
517 else
518 BUG();
453 } 519 }
454 520
455 if (vml && vml->vma->vm_end > addr) 521 rb_link_node(&region->vm_rb, parent, p);
456 return vml->vma; 522 rb_insert_color(&region->vm_rb, &nommu_region_tree);
457 523
458 return NULL; 524 validate_nommu_regions();
459} 525}
460EXPORT_SYMBOL(find_vma);
461 526
462/* 527/*
463 * find a VMA 528 * delete a region from the global tree
464 * - we don't extend stack VMAs under NOMMU conditions
465 */ 529 */
466struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr) 530static void delete_nommu_region(struct vm_region *region)
467{ 531{
468 return find_vma(mm, addr); 532 BUG_ON(!nommu_region_tree.rb_node);
469}
470 533
471int expand_stack(struct vm_area_struct *vma, unsigned long address) 534 validate_nommu_regions();
472{ 535 rb_erase(&region->vm_rb, &nommu_region_tree);
473 return -ENOMEM; 536 validate_nommu_regions();
474} 537}
475 538
476/* 539/*
477 * look up the first VMA exactly that exactly matches addr 540 * free a contiguous series of pages
478 * - should be called with mm->mmap_sem at least held readlocked
479 */ 541 */
480static inline struct vm_area_struct *find_vma_exact(struct mm_struct *mm, 542static void free_page_series(unsigned long from, unsigned long to)
481 unsigned long addr)
482{ 543{
483 struct vm_list_struct *vml; 544 for (; from < to; from += PAGE_SIZE) {
484 545 struct page *page = virt_to_page(from);
485 /* search the vm_start ordered list */ 546
486 for (vml = mm->context.vmlist; vml; vml = vml->next) { 547 kdebug("- free %lx", from);
487 if (vml->vma->vm_start == addr) 548 atomic_dec(&mmap_pages_allocated);
488 return vml->vma; 549 if (page_count(page) != 1)
489 if (vml->vma->vm_start > addr) 550 kdebug("free page %p [%d]", page, page_count(page));
490 break; 551 put_page(page);
491 } 552 }
492
493 return NULL;
494} 553}
495 554
496/* 555/*
497 * find a VMA in the global tree 556 * release a reference to a region
557 * - the caller must hold the region semaphore, which this releases
558 * - the region may not have been added to the tree yet, in which case vm_top
559 * will equal vm_start
498 */ 560 */
499static inline struct vm_area_struct *find_nommu_vma(unsigned long start) 561static void __put_nommu_region(struct vm_region *region)
562 __releases(nommu_region_sem)
500{ 563{
501 struct vm_area_struct *vma; 564 kenter("%p{%d}", region, atomic_read(&region->vm_usage));
502 struct rb_node *n = nommu_vma_tree.rb_node;
503 565
504 while (n) { 566 BUG_ON(!nommu_region_tree.rb_node);
505 vma = rb_entry(n, struct vm_area_struct, vm_rb);
506 567
507 if (start < vma->vm_start) 568 if (atomic_dec_and_test(&region->vm_usage)) {
508 n = n->rb_left; 569 if (region->vm_top > region->vm_start)
509 else if (start > vma->vm_start) 570 delete_nommu_region(region);
510 n = n->rb_right; 571 up_write(&nommu_region_sem);
511 else 572
512 return vma; 573 if (region->vm_file)
574 fput(region->vm_file);
575
576 /* IO memory and memory shared directly out of the pagecache
577 * from ramfs/tmpfs mustn't be released here */
578 if (region->vm_flags & VM_MAPPED_COPY) {
579 kdebug("free series");
580 free_page_series(region->vm_start, region->vm_top);
581 }
582 kmem_cache_free(vm_region_jar, region);
583 } else {
584 up_write(&nommu_region_sem);
513 } 585 }
586}
514 587
515 return NULL; 588/*
589 * release a reference to a region
590 */
591static void put_nommu_region(struct vm_region *region)
592{
593 down_write(&nommu_region_sem);
594 __put_nommu_region(region);
516} 595}
517 596
518/* 597/*
519 * add a VMA in the global tree 598 * add a VMA into a process's mm_struct in the appropriate place in the list
599 * and tree and add to the address space's page tree also if not an anonymous
600 * page
601 * - should be called with mm->mmap_sem held writelocked
520 */ 602 */
521static void add_nommu_vma(struct vm_area_struct *vma) 603static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
522{ 604{
523 struct vm_area_struct *pvma; 605 struct vm_area_struct *pvma, **pp;
524 struct address_space *mapping; 606 struct address_space *mapping;
525 struct rb_node **p = &nommu_vma_tree.rb_node; 607 struct rb_node **p, *parent;
526 struct rb_node *parent = NULL; 608
609 kenter(",%p", vma);
610
611 BUG_ON(!vma->vm_region);
612
613 mm->map_count++;
614 vma->vm_mm = mm;
527 615
528 /* add the VMA to the mapping */ 616 /* add the VMA to the mapping */
529 if (vma->vm_file) { 617 if (vma->vm_file) {
@@ -534,42 +622,62 @@ static void add_nommu_vma(struct vm_area_struct *vma)
534 flush_dcache_mmap_unlock(mapping); 622 flush_dcache_mmap_unlock(mapping);
535 } 623 }
536 624
537 /* add the VMA to the master list */ 625 /* add the VMA to the tree */
626 parent = NULL;
627 p = &mm->mm_rb.rb_node;
538 while (*p) { 628 while (*p) {
539 parent = *p; 629 parent = *p;
540 pvma = rb_entry(parent, struct vm_area_struct, vm_rb); 630 pvma = rb_entry(parent, struct vm_area_struct, vm_rb);
541 631
542 if (vma->vm_start < pvma->vm_start) { 632 /* sort by: start addr, end addr, VMA struct addr in that order
633 * (the latter is necessary as we may get identical VMAs) */
634 if (vma->vm_start < pvma->vm_start)
543 p = &(*p)->rb_left; 635 p = &(*p)->rb_left;
544 } 636 else if (vma->vm_start > pvma->vm_start)
545 else if (vma->vm_start > pvma->vm_start) {
546 p = &(*p)->rb_right; 637 p = &(*p)->rb_right;
547 } 638 else if (vma->vm_end < pvma->vm_end)
548 else { 639 p = &(*p)->rb_left;
549 /* mappings are at the same address - this can only 640 else if (vma->vm_end > pvma->vm_end)
550 * happen for shared-mem chardevs and shared file 641 p = &(*p)->rb_right;
551 * mappings backed by ramfs/tmpfs */ 642 else if (vma < pvma)
552 BUG_ON(!(pvma->vm_flags & VM_SHARED)); 643 p = &(*p)->rb_left;
553 644 else if (vma > pvma)
554 if (vma < pvma) 645 p = &(*p)->rb_right;
555 p = &(*p)->rb_left; 646 else
556 else if (vma > pvma) 647 BUG();
557 p = &(*p)->rb_right;
558 else
559 BUG();
560 }
561 } 648 }
562 649
563 rb_link_node(&vma->vm_rb, parent, p); 650 rb_link_node(&vma->vm_rb, parent, p);
564 rb_insert_color(&vma->vm_rb, &nommu_vma_tree); 651 rb_insert_color(&vma->vm_rb, &mm->mm_rb);
652
653 /* add VMA to the VMA list also */
654 for (pp = &mm->mmap; (pvma = *pp); pp = &(*pp)->vm_next) {
655 if (pvma->vm_start > vma->vm_start)
656 break;
657 if (pvma->vm_start < vma->vm_start)
658 continue;
659 if (pvma->vm_end < vma->vm_end)
660 break;
661 }
662
663 vma->vm_next = *pp;
664 *pp = vma;
565} 665}
566 666
567/* 667/*
568 * delete a VMA from the global list 668 * delete a VMA from its owning mm_struct and address space
569 */ 669 */
570static void delete_nommu_vma(struct vm_area_struct *vma) 670static void delete_vma_from_mm(struct vm_area_struct *vma)
571{ 671{
672 struct vm_area_struct **pp;
572 struct address_space *mapping; 673 struct address_space *mapping;
674 struct mm_struct *mm = vma->vm_mm;
675
676 kenter("%p", vma);
677
678 mm->map_count--;
679 if (mm->mmap_cache == vma)
680 mm->mmap_cache = NULL;
573 681
574 /* remove the VMA from the mapping */ 682 /* remove the VMA from the mapping */
575 if (vma->vm_file) { 683 if (vma->vm_file) {
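add_vma_to_mm() above keys the per-mm tree on start address, then end address, then the VMA pointer itself, so identical ranges still sort deterministically. A minimal comparator expressing that rule, written for qsort() as a user-space stand-in (struct vma here is a two-field toy, not the kernel's):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct vma { unsigned long start, end; };

/* order by start addr, then end addr, then object identity */
static int vma_cmp(const void *a, const void *b)
{
	const struct vma *x = *(const struct vma *const *)a;
	const struct vma *y = *(const struct vma *const *)b;

	if (x->start != y->start)
		return x->start < y->start ? -1 : 1;
	if (x->end != y->end)
		return x->end < y->end ? -1 : 1;
	if (x != y)	/* identical ranges still get a stable total order */
		return (uintptr_t)x < (uintptr_t)y ? -1 : 1;
	return 0;
}

int main(void)
{
	struct vma a = { 0x1000, 0x3000 }, b = { 0x1000, 0x3000 }, c = { 0x800, 0x900 };
	struct vma *v[] = { &a, &c, &b };

	qsort(v, 3, sizeof(v[0]), vma_cmp);
	for (int i = 0; i < 3; i++)
		printf("%#lx-%#lx (%p)\n", v[i]->start, v[i]->end, (void *)v[i]);
	return 0;
}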
@@ -580,8 +688,115 @@ static void delete_nommu_vma(struct vm_area_struct *vma)
580 flush_dcache_mmap_unlock(mapping); 688 flush_dcache_mmap_unlock(mapping);
581 } 689 }
582 690
583 /* remove from the master list */ 691 /* remove from the MM's tree and list */
584 rb_erase(&vma->vm_rb, &nommu_vma_tree); 692 rb_erase(&vma->vm_rb, &mm->mm_rb);
693 for (pp = &mm->mmap; *pp; pp = &(*pp)->vm_next) {
694 if (*pp == vma) {
695 *pp = vma->vm_next;
696 break;
697 }
698 }
699
700 vma->vm_mm = NULL;
701}
702
703/*
704 * destroy a VMA record
705 */
706static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
707{
708 kenter("%p", vma);
709 if (vma->vm_ops && vma->vm_ops->close)
710 vma->vm_ops->close(vma);
711 if (vma->vm_file) {
712 fput(vma->vm_file);
713 if (vma->vm_flags & VM_EXECUTABLE)
714 removed_exe_file_vma(mm);
715 }
716 put_nommu_region(vma->vm_region);
717 kmem_cache_free(vm_area_cachep, vma);
718}
719
720/*
721 * look up the first VMA in which addr resides, NULL if none
722 * - should be called with mm->mmap_sem at least held readlocked
723 */
724struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
725{
726 struct vm_area_struct *vma;
727 struct rb_node *n = mm->mm_rb.rb_node;
728
729 /* check the cache first */
730 vma = mm->mmap_cache;
731 if (vma && vma->vm_start <= addr && vma->vm_end > addr)
732 return vma;
733
734 /* trawl the tree (there may be multiple mappings in which addr
735 * resides) */
736 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
737 vma = rb_entry(n, struct vm_area_struct, vm_rb);
738 if (vma->vm_start > addr)
739 return NULL;
740 if (vma->vm_end > addr) {
741 mm->mmap_cache = vma;
742 return vma;
743 }
744 }
745
746 return NULL;
747}
748EXPORT_SYMBOL(find_vma);
749
750/*
751 * find a VMA
752 * - we don't extend stack VMAs under NOMMU conditions
753 */
754struct vm_area_struct *find_extend_vma(struct mm_struct *mm, unsigned long addr)
755{
756 return find_vma(mm, addr);
757}
758
759/*
760 * expand a stack to a given address
761 * - not supported under NOMMU conditions
762 */
763int expand_stack(struct vm_area_struct *vma, unsigned long address)
764{
765 return -ENOMEM;
766}
767
768/*
769 * look up the first VMA exactly that exactly matches addr
770 * - should be called with mm->mmap_sem at least held readlocked
771 */
772static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
773 unsigned long addr,
774 unsigned long len)
775{
776 struct vm_area_struct *vma;
777 struct rb_node *n = mm->mm_rb.rb_node;
778 unsigned long end = addr + len;
779
780 /* check the cache first */
781 vma = mm->mmap_cache;
782 if (vma && vma->vm_start == addr && vma->vm_end == end)
783 return vma;
784
785 /* trawl the tree (there may be multiple mappings in which addr
786 * resides) */
787 for (n = rb_first(&mm->mm_rb); n; n = rb_next(n)) {
788 vma = rb_entry(n, struct vm_area_struct, vm_rb);
789 if (vma->vm_start < addr)
790 continue;
791 if (vma->vm_start > addr)
792 return NULL;
793 if (vma->vm_end == end) {
794 mm->mmap_cache = vma;
795 return vma;
796 }
797 }
798
799 return NULL;
585} 800}
586 801
587/* 802/*
@@ -596,7 +811,7 @@ static int validate_mmap_request(struct file *file,
596 unsigned long pgoff, 811 unsigned long pgoff,
597 unsigned long *_capabilities) 812 unsigned long *_capabilities)
598{ 813{
599 unsigned long capabilities; 814 unsigned long capabilities, rlen;
600 unsigned long reqprot = prot; 815 unsigned long reqprot = prot;
601 int ret; 816 int ret;
602 817
@@ -616,12 +831,12 @@ static int validate_mmap_request(struct file *file,
616 return -EINVAL; 831 return -EINVAL;
617 832
618 /* Careful about overflows.. */ 833 /* Careful about overflows.. */
619 len = PAGE_ALIGN(len); 834 rlen = PAGE_ALIGN(len);
620 if (!len || len > TASK_SIZE) 835 if (!rlen || rlen > TASK_SIZE)
621 return -ENOMEM; 836 return -ENOMEM;
622 837
623 /* offset overflow? */ 838 /* offset overflow? */
624 if ((pgoff + (len >> PAGE_SHIFT)) < pgoff) 839 if ((pgoff + (rlen >> PAGE_SHIFT)) < pgoff)
625 return -EOVERFLOW; 840 return -EOVERFLOW;
626 841
627 if (file) { 842 if (file) {
@@ -795,13 +1010,18 @@ static unsigned long determine_vm_flags(struct file *file,
795} 1010}
796 1011
797/* 1012/*
798 * set up a shared mapping on a file 1013 * set up a shared mapping on a file (the driver or filesystem provides and
1014 * pins the storage)
799 */ 1015 */
800static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len) 1016static int do_mmap_shared_file(struct vm_area_struct *vma)
801{ 1017{
802 int ret; 1018 int ret;
803 1019
804 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1020 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
1021 if (ret == 0) {
1022 vma->vm_region->vm_top = vma->vm_region->vm_end;
1023 return ret;
1024 }
805 if (ret != -ENOSYS) 1025 if (ret != -ENOSYS)
806 return ret; 1026 return ret;
807 1027
@@ -815,10 +1035,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma, unsigned long len)
815/* 1035/*
816 * set up a private mapping or an anonymous shared mapping 1036 * set up a private mapping or an anonymous shared mapping
817 */ 1037 */
818static int do_mmap_private(struct vm_area_struct *vma, unsigned long len) 1038static int do_mmap_private(struct vm_area_struct *vma,
1039 struct vm_region *region,
1040 unsigned long len)
819{ 1041{
1042 struct page *pages;
1043 unsigned long total, point, n, rlen;
820 void *base; 1044 void *base;
821 int ret; 1045 int ret, order;
822 1046
823 /* invoke the file's mapping function so that it can keep track of 1047 /* invoke the file's mapping function so that it can keep track of
824 * shared mappings on devices or memory 1048 * shared mappings on devices or memory
@@ -826,34 +1050,63 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
826 */ 1050 */
827 if (vma->vm_file) { 1051 if (vma->vm_file) {
828 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); 1052 ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
829 if (ret != -ENOSYS) { 1053 if (ret == 0) {
830 /* shouldn't return success if we're not sharing */ 1054 /* shouldn't return success if we're not sharing */
831 BUG_ON(ret == 0 && !(vma->vm_flags & VM_MAYSHARE)); 1055 BUG_ON(!(vma->vm_flags & VM_MAYSHARE));
832 return ret; /* success or a real error */ 1056 vma->vm_region->vm_top = vma->vm_region->vm_end;
1057 return ret;
833 } 1058 }
1059 if (ret != -ENOSYS)
1060 return ret;
834 1061
835 /* getting an ENOSYS error indicates that direct mmap isn't 1062 /* getting an ENOSYS error indicates that direct mmap isn't
836 * possible (as opposed to tried but failed) so we'll try to 1063 * possible (as opposed to tried but failed) so we'll try to
837 * make a private copy of the data and map that instead */ 1064 * make a private copy of the data and map that instead */
838 } 1065 }
839 1066
1067 rlen = PAGE_ALIGN(len);
1068
840 /* allocate some memory to hold the mapping 1069 /* allocate some memory to hold the mapping
841 * - note that this may not return a page-aligned address if the object 1070 * - note that this may not return a page-aligned address if the object
842 * we're allocating is smaller than a page 1071 * we're allocating is smaller than a page
843 */ 1072 */
844 base = kmalloc(len, GFP_KERNEL|__GFP_COMP); 1073 order = get_order(rlen);
845 if (!base) 1074 kdebug("alloc order %d for %lx", order, len);
1075
1076 pages = alloc_pages(GFP_KERNEL, order);
1077 if (!pages)
846 goto enomem; 1078 goto enomem;
847 1079
848 vma->vm_start = (unsigned long) base; 1080 total = 1 << order;
849 vma->vm_end = vma->vm_start + len; 1081 atomic_add(total, &mmap_pages_allocated);
850 vma->vm_flags |= VM_MAPPED_COPY; 1082
1083 point = rlen >> PAGE_SHIFT;
1084
1085 /* we allocated a power-of-2 sized page set, so we may want to trim off
1086 * the excess */
1087 if (sysctl_nr_trim_pages && total - point >= sysctl_nr_trim_pages) {
1088 while (total > point) {
1089 order = ilog2(total - point);
1090 n = 1 << order;
1091 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1092 atomic_sub(n, &mmap_pages_allocated);
1093 total -= n;
1094 set_page_refcounted(pages + total);
1095 __free_pages(pages + total, order);
1096 }
1097 }
1098
1099 for (point = 1; point < total; point++)
1100 set_page_refcounted(&pages[point]);
851 1101
852#ifdef WARN_ON_SLACK 1102 base = page_address(pages);
853 if (len + WARN_ON_SLACK <= kobjsize(result)) 1103 region->vm_flags = vma->vm_flags |= VM_MAPPED_COPY;
854 printk("Allocation of %lu bytes from process %d has %lu bytes of slack\n", 1104 region->vm_start = (unsigned long) base;
855 len, current->pid, kobjsize(result) - len); 1105 region->vm_end = region->vm_start + rlen;
856#endif 1106 region->vm_top = region->vm_start + (total << PAGE_SHIFT);
1107
1108 vma->vm_start = region->vm_start;
1109 vma->vm_end = region->vm_start + len;
857 1110
858 if (vma->vm_file) { 1111 if (vma->vm_file) {
859 /* read the contents of a file into the copy */ 1112 /* read the contents of a file into the copy */
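do_mmap_private() above now allocates a power-of-two block of pages and shaves off the unused tail in descending power-of-two chunks. A quick user-space walk-through of just that order/trim arithmetic; ilog2() and get_order_pages() are local stand-ins assumed to behave like the kernel helpers:

#include <stdio.h>

/* floor(log2(n)) for n > 0 */
static unsigned int ilog2(unsigned long n)
{
	unsigned int r = 0;
	while (n >>= 1)
		r++;
	return r;
}

/* smallest order such that (1 << order) pages >= want */
static unsigned int get_order_pages(unsigned long want)
{
	unsigned int order = 0;
	while ((1UL << order) < want)
		order++;
	return order;
}

int main(void)
{
	unsigned long want = 5;					/* pages actually needed */
	unsigned long total = 1UL << get_order_pages(want);	/* pages allocated */

	printf("allocated %lu pages for a %lu page mapping\n", total, want);

	/* free the excess in power-of-two chunks, largest first */
	while (total > want) {
		unsigned long chunk = 1UL << ilog2(total - want);
		total -= chunk;
		printf("trim %lu page(s), %lu left\n", chunk, total);
	}
	return 0;
}

For want = 5 this allocates 8 pages and then frees a 2-page and a 1-page chunk, matching the shaving loop in the hunk (which in the kernel is additionally gated by sysctl_nr_trim_pages).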
@@ -865,26 +1118,28 @@ static int do_mmap_private(struct vm_area_struct *vma, unsigned long len)
865 1118
866 old_fs = get_fs(); 1119 old_fs = get_fs();
867 set_fs(KERNEL_DS); 1120 set_fs(KERNEL_DS);
868 ret = vma->vm_file->f_op->read(vma->vm_file, base, len, &fpos); 1121 ret = vma->vm_file->f_op->read(vma->vm_file, base, rlen, &fpos);
869 set_fs(old_fs); 1122 set_fs(old_fs);
870 1123
871 if (ret < 0) 1124 if (ret < 0)
872 goto error_free; 1125 goto error_free;
873 1126
874 /* clear the last little bit */ 1127 /* clear the last little bit */
875 if (ret < len) 1128 if (ret < rlen)
876 memset(base + ret, 0, len - ret); 1129 memset(base + ret, 0, rlen - ret);
877 1130
878 } else { 1131 } else {
879 /* if it's an anonymous mapping, then just clear it */ 1132 /* if it's an anonymous mapping, then just clear it */
880 memset(base, 0, len); 1133 memset(base, 0, rlen);
881 } 1134 }
882 1135
883 return 0; 1136 return 0;
884 1137
885error_free: 1138error_free:
886 kfree(base); 1139 free_page_series(region->vm_start, region->vm_end);
887 vma->vm_start = 0; 1140 region->vm_start = vma->vm_start = 0;
1141 region->vm_end = vma->vm_end = 0;
1142 region->vm_top = 0;
888 return ret; 1143 return ret;
889 1144
890enomem: 1145enomem:
@@ -904,13 +1159,14 @@ unsigned long do_mmap_pgoff(struct file *file,
904 unsigned long flags, 1159 unsigned long flags,
905 unsigned long pgoff) 1160 unsigned long pgoff)
906{ 1161{
907 struct vm_list_struct *vml = NULL; 1162 struct vm_area_struct *vma;
908 struct vm_area_struct *vma = NULL; 1163 struct vm_region *region;
909 struct rb_node *rb; 1164 struct rb_node *rb;
910 unsigned long capabilities, vm_flags; 1165 unsigned long capabilities, vm_flags, result;
911 void *result;
912 int ret; 1166 int ret;
913 1167
1168 kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff);
1169
914 if (!(flags & MAP_FIXED)) 1170 if (!(flags & MAP_FIXED))
915 addr = round_hint_to_min(addr); 1171 addr = round_hint_to_min(addr);
916 1172
@@ -918,73 +1174,120 @@ unsigned long do_mmap_pgoff(struct file *file,
918 * mapping */ 1174 * mapping */
919 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, 1175 ret = validate_mmap_request(file, addr, len, prot, flags, pgoff,
920 &capabilities); 1176 &capabilities);
921 if (ret < 0) 1177 if (ret < 0) {
1178 kleave(" = %d [val]", ret);
922 return ret; 1179 return ret;
1180 }
923 1181
924 /* we've determined that we can make the mapping, now translate what we 1182 /* we've determined that we can make the mapping, now translate what we
925 * now know into VMA flags */ 1183 * now know into VMA flags */
926 vm_flags = determine_vm_flags(file, prot, flags, capabilities); 1184 vm_flags = determine_vm_flags(file, prot, flags, capabilities);
927 1185
928 /* we're going to need to record the mapping if it works */ 1186 /* we're going to need to record the mapping */
929 vml = kzalloc(sizeof(struct vm_list_struct), GFP_KERNEL); 1187 region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
930 if (!vml) 1188 if (!region)
931 goto error_getting_vml; 1189 goto error_getting_region;
1190
1191 vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
1192 if (!vma)
1193 goto error_getting_vma;
1194
1195 atomic_set(&region->vm_usage, 1);
1196 region->vm_flags = vm_flags;
1197 region->vm_pgoff = pgoff;
1198
1199 INIT_LIST_HEAD(&vma->anon_vma_node);
1200 vma->vm_flags = vm_flags;
1201 vma->vm_pgoff = pgoff;
932 1202
933 down_write(&nommu_vma_sem); 1203 if (file) {
1204 region->vm_file = file;
1205 get_file(file);
1206 vma->vm_file = file;
1207 get_file(file);
1208 if (vm_flags & VM_EXECUTABLE) {
1209 added_exe_file_vma(current->mm);
1210 vma->vm_mm = current->mm;
1211 }
1212 }
934 1213
935 /* if we want to share, we need to check for VMAs created by other 1214 down_write(&nommu_region_sem);
1215
1216 /* if we want to share, we need to check for regions created by other
936 * mmap() calls that overlap with our proposed mapping 1217 * mmap() calls that overlap with our proposed mapping
937 * - we can only share with an exact match on most regular files 1218 * - we can only share with a superset match on most regular files
938 * - shared mappings on character devices and memory backed files are 1219 * - shared mappings on character devices and memory backed files are
939 * permitted to overlap inexactly as far as we are concerned for in 1220 * permitted to overlap inexactly as far as we are concerned for in
940 * these cases, sharing is handled in the driver or filesystem rather 1221 * these cases, sharing is handled in the driver or filesystem rather
941 * than here 1222 * than here
942 */ 1223 */
943 if (vm_flags & VM_MAYSHARE) { 1224 if (vm_flags & VM_MAYSHARE) {
944 unsigned long pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT; 1225 struct vm_region *pregion;
945 unsigned long vmpglen; 1226 unsigned long pglen, rpglen, pgend, rpgend, start;
946 1227
947 /* suppress VMA sharing for shared regions */ 1228 pglen = (len + PAGE_SIZE - 1) >> PAGE_SHIFT;
948 if (vm_flags & VM_SHARED && 1229 pgend = pgoff + pglen;
949 capabilities & BDI_CAP_MAP_DIRECT)
950 goto dont_share_VMAs;
951 1230
952 for (rb = rb_first(&nommu_vma_tree); rb; rb = rb_next(rb)) { 1231 for (rb = rb_first(&nommu_region_tree); rb; rb = rb_next(rb)) {
953 vma = rb_entry(rb, struct vm_area_struct, vm_rb); 1232 pregion = rb_entry(rb, struct vm_region, vm_rb);
954 1233
955 if (!(vma->vm_flags & VM_MAYSHARE)) 1234 if (!(pregion->vm_flags & VM_MAYSHARE))
956 continue; 1235 continue;
957 1236
958 /* search for overlapping mappings on the same file */ 1237 /* search for overlapping mappings on the same file */
959 if (vma->vm_file->f_path.dentry->d_inode != file->f_path.dentry->d_inode) 1238 if (pregion->vm_file->f_path.dentry->d_inode !=
1239 file->f_path.dentry->d_inode)
960 continue; 1240 continue;
961 1241
962 if (vma->vm_pgoff >= pgoff + pglen) 1242 if (pregion->vm_pgoff >= pgend)
963 continue; 1243 continue;
964 1244
965 vmpglen = vma->vm_end - vma->vm_start + PAGE_SIZE - 1; 1245 rpglen = pregion->vm_end - pregion->vm_start;
966 vmpglen >>= PAGE_SHIFT; 1246 rpglen = (rpglen + PAGE_SIZE - 1) >> PAGE_SHIFT;
967 if (pgoff >= vma->vm_pgoff + vmpglen) 1247 rpgend = pregion->vm_pgoff + rpglen;
1248 if (pgoff >= rpgend)
968 continue; 1249 continue;
969 1250
970 /* handle inexactly overlapping matches between mappings */ 1251 /* handle inexactly overlapping matches between
971 if (vma->vm_pgoff != pgoff || vmpglen != pglen) { 1252 * mappings */
1253 if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
1254 !(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
1255 /* new mapping is not a subset of the region */
972 if (!(capabilities & BDI_CAP_MAP_DIRECT)) 1256 if (!(capabilities & BDI_CAP_MAP_DIRECT))
973 goto sharing_violation; 1257 goto sharing_violation;
974 continue; 1258 continue;
975 } 1259 }
976 1260
977 /* we've found a VMA we can share */ 1261 /* we've found a region we can share */
978 atomic_inc(&vma->vm_usage); 1262 atomic_inc(&pregion->vm_usage);
979 1263 vma->vm_region = pregion;
980 vml->vma = vma; 1264 start = pregion->vm_start;
981 result = (void *) vma->vm_start; 1265 start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT;
982 goto shared; 1266 vma->vm_start = start;
1267 vma->vm_end = start + len;
1268
1269 if (pregion->vm_flags & VM_MAPPED_COPY) {
1270 kdebug("share copy");
1271 vma->vm_flags |= VM_MAPPED_COPY;
1272 } else {
1273 kdebug("share mmap");
1274 ret = do_mmap_shared_file(vma);
1275 if (ret < 0) {
1276 vma->vm_region = NULL;
1277 vma->vm_start = 0;
1278 vma->vm_end = 0;
1279 atomic_dec(&pregion->vm_usage);
1280 pregion = NULL;
1281 goto error_just_free;
1282 }
1283 }
1284 fput(region->vm_file);
1285 kmem_cache_free(vm_region_jar, region);
1286 region = pregion;
1287 result = start;
1288 goto share;
983 } 1289 }
984 1290
985 dont_share_VMAs:
986 vma = NULL;
987
988 /* obtain the address at which to make a shared mapping 1291 /* obtain the address at which to make a shared mapping
989 * - this is the hook for quasi-memory character devices to 1292 * - this is the hook for quasi-memory character devices to
990 * tell us the location of a shared mapping 1293 * tell us the location of a shared mapping
@@ -995,113 +1298,93 @@ unsigned long do_mmap_pgoff(struct file *file,
995 if (IS_ERR((void *) addr)) { 1298 if (IS_ERR((void *) addr)) {
996 ret = addr; 1299 ret = addr;
997 if (ret != (unsigned long) -ENOSYS) 1300 if (ret != (unsigned long) -ENOSYS)
998 goto error; 1301 goto error_just_free;
999 1302
1000 /* the driver refused to tell us where to site 1303 /* the driver refused to tell us where to site
1001 * the mapping so we'll have to attempt to copy 1304 * the mapping so we'll have to attempt to copy
1002 * it */ 1305 * it */
1003 ret = (unsigned long) -ENODEV; 1306 ret = (unsigned long) -ENODEV;
1004 if (!(capabilities & BDI_CAP_MAP_COPY)) 1307 if (!(capabilities & BDI_CAP_MAP_COPY))
1005 goto error; 1308 goto error_just_free;
1006 1309
1007 capabilities &= ~BDI_CAP_MAP_DIRECT; 1310 capabilities &= ~BDI_CAP_MAP_DIRECT;
1311 } else {
1312 vma->vm_start = region->vm_start = addr;
1313 vma->vm_end = region->vm_end = addr + len;
1008 } 1314 }
1009 } 1315 }
1010 } 1316 }
1011 1317
1012 /* we're going to need a VMA struct as well */ 1318 vma->vm_region = region;
1013 vma = kzalloc(sizeof(struct vm_area_struct), GFP_KERNEL);
1014 if (!vma)
1015 goto error_getting_vma;
1016
1017 INIT_LIST_HEAD(&vma->anon_vma_node);
1018 atomic_set(&vma->vm_usage, 1);
1019 if (file) {
1020 get_file(file);
1021 if (vm_flags & VM_EXECUTABLE) {
1022 added_exe_file_vma(current->mm);
1023 vma->vm_mm = current->mm;
1024 }
1025 }
1026 vma->vm_file = file;
1027 vma->vm_flags = vm_flags;
1028 vma->vm_start = addr;
1029 vma->vm_end = addr + len;
1030 vma->vm_pgoff = pgoff;
1031
1032 vml->vma = vma;
1033 1319
1034 /* set up the mapping */ 1320 /* set up the mapping */
1035 if (file && vma->vm_flags & VM_SHARED) 1321 if (file && vma->vm_flags & VM_SHARED)
1036 ret = do_mmap_shared_file(vma, len); 1322 ret = do_mmap_shared_file(vma);
1037 else 1323 else
1038 ret = do_mmap_private(vma, len); 1324 ret = do_mmap_private(vma, region, len);
1039 if (ret < 0) 1325 if (ret < 0)
1040 goto error; 1326 goto error_put_region;
1041
1042 /* okay... we have a mapping; now we have to register it */
1043 result = (void *) vma->vm_start;
1044 1327
1045 if (vma->vm_flags & VM_MAPPED_COPY) { 1328 add_nommu_region(region);
1046 realalloc += kobjsize(result);
1047 askedalloc += len;
1048 }
1049 1329
1050 realalloc += kobjsize(vma); 1330 /* okay... we have a mapping; now we have to register it */
1051 askedalloc += sizeof(*vma); 1331 result = vma->vm_start;
1052 1332
1053 current->mm->total_vm += len >> PAGE_SHIFT; 1333 current->mm->total_vm += len >> PAGE_SHIFT;
1054 1334
1055 add_nommu_vma(vma); 1335share:
1056 1336 add_vma_to_mm(current->mm, vma);
1057 shared:
1058 realalloc += kobjsize(vml);
1059 askedalloc += sizeof(*vml);
1060
1061 add_vma_to_mm(current->mm, vml);
1062 1337
1063 up_write(&nommu_vma_sem); 1338 up_write(&nommu_region_sem);
1064 1339
1065 if (prot & PROT_EXEC) 1340 if (prot & PROT_EXEC)
1066 flush_icache_range((unsigned long) result, 1341 flush_icache_range(result, result + len);
1067 (unsigned long) result + len);
1068 1342
1069#ifdef DEBUG 1343 kleave(" = %lx", result);
1070 printk("do_mmap:\n"); 1344 return result;
1071 show_process_blocks();
1072#endif
1073
1074 return (unsigned long) result;
1075 1345
1076 error: 1346error_put_region:
1077 up_write(&nommu_vma_sem); 1347 __put_nommu_region(region);
1078 kfree(vml);
1079 if (vma) { 1348 if (vma) {
1080 if (vma->vm_file) { 1349 if (vma->vm_file) {
1081 fput(vma->vm_file); 1350 fput(vma->vm_file);
1082 if (vma->vm_flags & VM_EXECUTABLE) 1351 if (vma->vm_flags & VM_EXECUTABLE)
1083 removed_exe_file_vma(vma->vm_mm); 1352 removed_exe_file_vma(vma->vm_mm);
1084 } 1353 }
1085 kfree(vma); 1354 kmem_cache_free(vm_area_cachep, vma);
1086 } 1355 }
1356 kleave(" = %d [pr]", ret);
1087 return ret; 1357 return ret;
1088 1358
1089 sharing_violation: 1359error_just_free:
1090 up_write(&nommu_vma_sem); 1360 up_write(&nommu_region_sem);
1091 printk("Attempt to share mismatched mappings\n"); 1361error:
1092 kfree(vml); 1362 fput(region->vm_file);
1093 return -EINVAL; 1363 kmem_cache_free(vm_region_jar, region);
1364 fput(vma->vm_file);
1365 if (vma->vm_flags & VM_EXECUTABLE)
1366 removed_exe_file_vma(vma->vm_mm);
1367 kmem_cache_free(vm_area_cachep, vma);
1368 kleave(" = %d", ret);
1369 return ret;
1094 1370
1095 error_getting_vma: 1371sharing_violation:
1096 up_write(&nommu_vma_sem); 1372 up_write(&nommu_region_sem);
1097 kfree(vml); 1373 printk(KERN_WARNING "Attempt to share mismatched mappings\n");
1098 printk("Allocation of vma for %lu byte allocation from process %d failed\n", 1374 ret = -EINVAL;
1375 goto error;
1376
1377error_getting_vma:
1378 kmem_cache_free(vm_region_jar, region);
1379 printk(KERN_WARNING "Allocation of vma for %lu byte allocation"
1380 " from process %d failed\n",
1099 len, current->pid); 1381 len, current->pid);
1100 show_free_areas(); 1382 show_free_areas();
1101 return -ENOMEM; 1383 return -ENOMEM;
1102 1384
1103 error_getting_vml: 1385error_getting_region:
1104 printk("Allocation of vml for %lu byte allocation from process %d failed\n", 1386 printk(KERN_WARNING "Allocation of vm region for %lu byte allocation"
1387 " from process %d failed\n",
1105 len, current->pid); 1388 len, current->pid);
1106 show_free_areas(); 1389 show_free_areas();
1107 return -ENOMEM; 1390 return -ENOMEM;
@@ -1109,85 +1392,183 @@ unsigned long do_mmap_pgoff(struct file *file,
1109EXPORT_SYMBOL(do_mmap_pgoff); 1392EXPORT_SYMBOL(do_mmap_pgoff);
1110 1393
1111/* 1394/*
1112 * handle mapping disposal for uClinux 1395 * split a vma into two pieces at address 'addr', a new vma is allocated either
1396 * for the first part or the tail.
1113 */ 1397 */
1114static void put_vma(struct mm_struct *mm, struct vm_area_struct *vma) 1398int split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
1399 unsigned long addr, int new_below)
1115{ 1400{
1116 if (vma) { 1401 struct vm_area_struct *new;
1117 down_write(&nommu_vma_sem); 1402 struct vm_region *region;
1403 unsigned long npages;
1118 1404
1119 if (atomic_dec_and_test(&vma->vm_usage)) { 1405 kenter("");
1120 delete_nommu_vma(vma);
1121 1406
1122 if (vma->vm_ops && vma->vm_ops->close) 1407 /* we're only permitted to split anonymous regions that have a single
1123 vma->vm_ops->close(vma); 1408 * owner */
1409 if (vma->vm_file ||
1410 atomic_read(&vma->vm_region->vm_usage) != 1)
1411 return -ENOMEM;
1124 1412
1125 /* IO memory and memory shared directly out of the pagecache from 1413 if (mm->map_count >= sysctl_max_map_count)
1126 * ramfs/tmpfs mustn't be released here */ 1414 return -ENOMEM;
1127 if (vma->vm_flags & VM_MAPPED_COPY) {
1128 realalloc -= kobjsize((void *) vma->vm_start);
1129 askedalloc -= vma->vm_end - vma->vm_start;
1130 kfree((void *) vma->vm_start);
1131 }
1132 1415
1133 realalloc -= kobjsize(vma); 1416 region = kmem_cache_alloc(vm_region_jar, GFP_KERNEL);
1134 askedalloc -= sizeof(*vma); 1417 if (!region)
1418 return -ENOMEM;
1135 1419
1136 if (vma->vm_file) { 1420 new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
1137 fput(vma->vm_file); 1421 if (!new) {
1138 if (vma->vm_flags & VM_EXECUTABLE) 1422 kmem_cache_free(vm_region_jar, region);
1139 removed_exe_file_vma(mm); 1423 return -ENOMEM;
1140 } 1424 }
1141 kfree(vma); 1425
1142 } 1426 /* most fields are the same, copy all, and then fixup */
1427 *new = *vma;
1428 *region = *vma->vm_region;
1429 new->vm_region = region;
1430
1431 npages = (addr - vma->vm_start) >> PAGE_SHIFT;
1143 1432
1144 up_write(&nommu_vma_sem); 1433 if (new_below) {
1434 region->vm_top = region->vm_end = new->vm_end = addr;
1435 } else {
1436 region->vm_start = new->vm_start = addr;
1437 region->vm_pgoff = new->vm_pgoff += npages;
1438 }
1439
1440 if (new->vm_ops && new->vm_ops->open)
1441 new->vm_ops->open(new);
1442
1443 delete_vma_from_mm(vma);
1444 down_write(&nommu_region_sem);
1445 delete_nommu_region(vma->vm_region);
1446 if (new_below) {
1447 vma->vm_region->vm_start = vma->vm_start = addr;
1448 vma->vm_region->vm_pgoff = vma->vm_pgoff += npages;
1449 } else {
1450 vma->vm_region->vm_end = vma->vm_end = addr;
1451 vma->vm_region->vm_top = addr;
1145 } 1452 }
1453 add_nommu_region(vma->vm_region);
1454 add_nommu_region(new->vm_region);
1455 up_write(&nommu_region_sem);
1456 add_vma_to_mm(mm, vma);
1457 add_vma_to_mm(mm, new);
1458 return 0;
1146} 1459}
1147 1460
1148/* 1461/*
1149 * release a mapping 1462 * shrink a VMA by removing the specified chunk from either the beginning or
1150 * - under NOMMU conditions the parameters must match exactly to the mapping to 1463 * the end
1151 * be removed
1152 */ 1464 */
1153int do_munmap(struct mm_struct *mm, unsigned long addr, size_t len) 1465static int shrink_vma(struct mm_struct *mm,
1466 struct vm_area_struct *vma,
1467 unsigned long from, unsigned long to)
1154{ 1468{
1155 struct vm_list_struct *vml, **parent; 1469 struct vm_region *region;
1156 unsigned long end = addr + len;
1157 1470
1158#ifdef DEBUG 1471 kenter("");
1159 printk("do_munmap:\n");
1160#endif
1161 1472
1162 for (parent = &mm->context.vmlist; *parent; parent = &(*parent)->next) { 1473 /* adjust the VMA's pointers, which may reposition it in the MM's tree
1163 if ((*parent)->vma->vm_start > addr) 1474 * and list */
1164 break; 1475 delete_vma_from_mm(vma);
1165 if ((*parent)->vma->vm_start == addr && 1476 if (from > vma->vm_start)
1166 ((len == 0) || ((*parent)->vma->vm_end == end))) 1477 vma->vm_end = from;
1167 goto found; 1478 else
1479 vma->vm_start = to;
1480 add_vma_to_mm(mm, vma);
1481
1482 /* cut the backing region down to size */
1483 region = vma->vm_region;
1484 BUG_ON(atomic_read(&region->vm_usage) != 1);
1485
1486 down_write(&nommu_region_sem);
1487 delete_nommu_region(region);
1488 if (from > region->vm_start) {
1489 to = region->vm_top;
1490 region->vm_top = region->vm_end = from;
1491 } else {
1492 region->vm_start = to;
1168 } 1493 }
1494 add_nommu_region(region);
1495 up_write(&nommu_region_sem);
1169 1496
1170 printk("munmap of non-mmaped memory by process %d (%s): %p\n", 1497 free_page_series(from, to);
1171 current->pid, current->comm, (void *) addr); 1498 return 0;
1172 return -EINVAL; 1499}
1173 1500
1174 found: 1501/*
1175 vml = *parent; 1502 * release a mapping
1503 * - under NOMMU conditions the chunk to be unmapped must be backed by a single
1504 * VMA, though it need not cover the whole VMA
1505 */
1506int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1507{
1508 struct vm_area_struct *vma;
1509 struct rb_node *rb;
1510 unsigned long end = start + len;
1511 int ret;
1176 1512
1177 put_vma(mm, vml->vma); 1513 kenter(",%lx,%zx", start, len);
1178 1514
1179 *parent = vml->next; 1515 if (len == 0)
1180 realalloc -= kobjsize(vml); 1516 return -EINVAL;
1181 askedalloc -= sizeof(*vml);
1182 kfree(vml);
1183 1517
1184 update_hiwater_vm(mm); 1518 /* find the first potentially overlapping VMA */
1185 mm->total_vm -= len >> PAGE_SHIFT; 1519 vma = find_vma(mm, start);
1520 if (!vma) {
1521 printk(KERN_WARNING
1522 "munmap of memory not mmapped by process %d (%s):"
1523 " 0x%lx-0x%lx\n",
1524 current->pid, current->comm, start, start + len - 1);
1525 return -EINVAL;
1526 }
1186 1527
1187#ifdef DEBUG 1528 /* we're allowed to split an anonymous VMA but not a file-backed one */
1188 show_process_blocks(); 1529 if (vma->vm_file) {
1189#endif 1530 do {
1531 if (start > vma->vm_start) {
1532 kleave(" = -EINVAL [miss]");
1533 return -EINVAL;
1534 }
1535 if (end == vma->vm_end)
1536 goto erase_whole_vma;
1537 rb = rb_next(&vma->vm_rb);
1538 vma = rb_entry(rb, struct vm_area_struct, vm_rb);
1539 } while (rb);
1540 kleave(" = -EINVAL [split file]");
1541 return -EINVAL;
1542 } else {
1543 /* the chunk must be a subset of the VMA found */
1544 if (start == vma->vm_start && end == vma->vm_end)
1545 goto erase_whole_vma;
1546 if (start < vma->vm_start || end > vma->vm_end) {
1547 kleave(" = -EINVAL [superset]");
1548 return -EINVAL;
1549 }
1550 if (start & ~PAGE_MASK) {
1551 kleave(" = -EINVAL [unaligned start]");
1552 return -EINVAL;
1553 }
1554 if (end != vma->vm_end && end & ~PAGE_MASK) {
1555 kleave(" = -EINVAL [unaligned split]");
1556 return -EINVAL;
1557 }
1558 if (start != vma->vm_start && end != vma->vm_end) {
1559 ret = split_vma(mm, vma, start, 1);
1560 if (ret < 0) {
1561 kleave(" = %d [split]", ret);
1562 return ret;
1563 }
1564 }
1565 return shrink_vma(mm, vma, start, end);
1566 }
1190 1567
1568erase_whole_vma:
1569 delete_vma_from_mm(vma);
1570 delete_vma(mm, vma);
1571 kleave(" = 0");
1191 return 0; 1572 return 0;
1192} 1573}
1193EXPORT_SYMBOL(do_munmap); 1574EXPORT_SYMBOL(do_munmap);
@@ -1204,32 +1585,26 @@ asmlinkage long sys_munmap(unsigned long addr, size_t len)
1204} 1585}
1205 1586
1206/* 1587/*
1207 * Release all mappings 1588 * release all the mappings made in a process's VM space
1208 */ 1589 */
1209void exit_mmap(struct mm_struct * mm) 1590void exit_mmap(struct mm_struct *mm)
1210{ 1591{
1211 struct vm_list_struct *tmp; 1592 struct vm_area_struct *vma;
1212
1213 if (mm) {
1214#ifdef DEBUG
1215 printk("Exit_mmap:\n");
1216#endif
1217 1593
1218 mm->total_vm = 0; 1594 if (!mm)
1595 return;
1219 1596
1220 while ((tmp = mm->context.vmlist)) { 1597 kenter("");
1221 mm->context.vmlist = tmp->next;
1222 put_vma(mm, tmp->vma);
1223 1598
1224 realalloc -= kobjsize(tmp); 1599 mm->total_vm = 0;
1225 askedalloc -= sizeof(*tmp);
1226 kfree(tmp);
1227 }
1228 1600
1229#ifdef DEBUG 1601 while ((vma = mm->mmap)) {
1230 show_process_blocks(); 1602 mm->mmap = vma->vm_next;
1231#endif 1603 delete_vma_from_mm(vma);
1604 delete_vma(mm, vma);
1232 } 1605 }
1606
1607 kleave("");
1233} 1608}
1234 1609
1235unsigned long do_brk(unsigned long addr, unsigned long len) 1610unsigned long do_brk(unsigned long addr, unsigned long len)
@@ -1242,8 +1617,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
1242 * time (controlled by the MREMAP_MAYMOVE flag and available VM space) 1617 * time (controlled by the MREMAP_MAYMOVE flag and available VM space)
1243 * 1618 *
1244 * under NOMMU conditions, we only permit changing a mapping's size, and only 1619 * under NOMMU conditions, we only permit changing a mapping's size, and only
1245 * as long as it stays within the hole allocated by the kmalloc() call in 1620 * as long as it stays within the region allocated by do_mmap_private() and the
1246 * do_mmap_pgoff() and the block is not shareable 1621 * block is not shareable
1247 * 1622 *
1248 * MREMAP_FIXED is not supported under NOMMU conditions 1623 * MREMAP_FIXED is not supported under NOMMU conditions
1249 */ 1624 */
@@ -1254,13 +1629,16 @@ unsigned long do_mremap(unsigned long addr,
1254 struct vm_area_struct *vma; 1629 struct vm_area_struct *vma;
1255 1630
1256 /* insanity checks first */ 1631 /* insanity checks first */
1257 if (new_len == 0) 1632 if (old_len == 0 || new_len == 0)
1258 return (unsigned long) -EINVAL; 1633 return (unsigned long) -EINVAL;
1259 1634
1635 if (addr & ~PAGE_MASK)
1636 return -EINVAL;
1637
1260 if (flags & MREMAP_FIXED && new_addr != addr) 1638 if (flags & MREMAP_FIXED && new_addr != addr)
1261 return (unsigned long) -EINVAL; 1639 return (unsigned long) -EINVAL;
1262 1640
1263 vma = find_vma_exact(current->mm, addr); 1641 vma = find_vma_exact(current->mm, addr, old_len);
1264 if (!vma) 1642 if (!vma)
1265 return (unsigned long) -EINVAL; 1643 return (unsigned long) -EINVAL;
1266 1644
@@ -1270,22 +1648,19 @@ unsigned long do_mremap(unsigned long addr,
1270 if (vma->vm_flags & VM_MAYSHARE) 1648 if (vma->vm_flags & VM_MAYSHARE)
1271 return (unsigned long) -EPERM; 1649 return (unsigned long) -EPERM;
1272 1650
1273 if (new_len > kobjsize((void *) addr)) 1651 if (new_len > vma->vm_region->vm_end - vma->vm_region->vm_start)
1274 return (unsigned long) -ENOMEM; 1652 return (unsigned long) -ENOMEM;
1275 1653
1276 /* all checks complete - do it */ 1654 /* all checks complete - do it */
1277 vma->vm_end = vma->vm_start + new_len; 1655 vma->vm_end = vma->vm_start + new_len;
1278
1279 askedalloc -= old_len;
1280 askedalloc += new_len;
1281
1282 return vma->vm_start; 1656 return vma->vm_start;
1283} 1657}
1284EXPORT_SYMBOL(do_mremap); 1658EXPORT_SYMBOL(do_mremap);
1285 1659
1286asmlinkage unsigned long sys_mremap(unsigned long addr, 1660asmlinkage
1287 unsigned long old_len, unsigned long new_len, 1661unsigned long sys_mremap(unsigned long addr,
1288 unsigned long flags, unsigned long new_addr) 1662 unsigned long old_len, unsigned long new_len,
1663 unsigned long flags, unsigned long new_addr)
1289{ 1664{
1290 unsigned long ret; 1665 unsigned long ret;
1291 1666
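
As a minimal user-space sketch of the behaviour the reworked nommu do_mmap()/do_munmap() path is aiming for (this is not part of the patch): an anonymous, single-owner mapping can now be shrunk or split by a page-aligned partial munmap() via split_vma()/shrink_vma(), while file-backed mappings must still be unmapped whole. The 4-page size and the choice to drop the tail are arbitrary.

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	size_t len = 4 * page;
	char *p;

	/* anonymous mapping: the only kind the new do_munmap() will split/shrink */
	p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memset(p, 0, len);

	/* page-aligned partial unmap of the tail: handled by shrink_vma() */
	if (munmap(p + 2 * page, 2 * page) != 0)
		perror("munmap (tail)");
	else
		printf("mapping shrunk to %ld bytes at %p\n", 2 * page, (void *)p);

	munmap(p, 2 * page);		/* release the remaining head */
	return 0;
}
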
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 558f9afe6e4e..40ba05061a4f 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -31,7 +31,7 @@
31int sysctl_panic_on_oom; 31int sysctl_panic_on_oom;
32int sysctl_oom_kill_allocating_task; 32int sysctl_oom_kill_allocating_task;
33int sysctl_oom_dump_tasks; 33int sysctl_oom_dump_tasks;
34static DEFINE_SPINLOCK(zone_scan_mutex); 34static DEFINE_SPINLOCK(zone_scan_lock);
35/* #define DEBUG */ 35/* #define DEBUG */
36 36
37/** 37/**
@@ -392,6 +392,9 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
392 printk(KERN_WARNING "%s invoked oom-killer: " 392 printk(KERN_WARNING "%s invoked oom-killer: "
393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", 393 "gfp_mask=0x%x, order=%d, oomkilladj=%d\n",
394 current->comm, gfp_mask, order, current->oomkilladj); 394 current->comm, gfp_mask, order, current->oomkilladj);
395 task_lock(current);
396 cpuset_print_task_mems_allowed(current);
397 task_unlock(current);
395 dump_stack(); 398 dump_stack();
396 show_mem(); 399 show_mem();
397 if (sysctl_oom_dump_tasks) 400 if (sysctl_oom_dump_tasks)
@@ -426,7 +429,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
426 unsigned long points = 0; 429 unsigned long points = 0;
427 struct task_struct *p; 430 struct task_struct *p;
428 431
429 cgroup_lock();
430 read_lock(&tasklist_lock); 432 read_lock(&tasklist_lock);
431retry: 433retry:
432 p = select_bad_process(&points, mem); 434 p = select_bad_process(&points, mem);
@@ -441,7 +443,6 @@ retry:
441 goto retry; 443 goto retry;
442out: 444out:
443 read_unlock(&tasklist_lock); 445 read_unlock(&tasklist_lock);
444 cgroup_unlock();
445} 446}
446#endif 447#endif
447 448
@@ -470,7 +471,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
470 struct zone *zone; 471 struct zone *zone;
471 int ret = 1; 472 int ret = 1;
472 473
473 spin_lock(&zone_scan_mutex); 474 spin_lock(&zone_scan_lock);
474 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 475 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
475 if (zone_is_oom_locked(zone)) { 476 if (zone_is_oom_locked(zone)) {
476 ret = 0; 477 ret = 0;
@@ -480,7 +481,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
480 481
481 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 482 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
482 /* 483 /*
483 * Lock each zone in the zonelist under zone_scan_mutex so a 484 * Lock each zone in the zonelist under zone_scan_lock so a
484 * parallel invocation of try_set_zone_oom() doesn't succeed 485 * parallel invocation of try_set_zone_oom() doesn't succeed
485 * when it shouldn't. 486 * when it shouldn't.
486 */ 487 */
@@ -488,7 +489,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask)
488 } 489 }
489 490
490out: 491out:
491 spin_unlock(&zone_scan_mutex); 492 spin_unlock(&zone_scan_lock);
492 return ret; 493 return ret;
493} 494}
494 495
@@ -502,11 +503,82 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
502 struct zoneref *z; 503 struct zoneref *z;
503 struct zone *zone; 504 struct zone *zone;
504 505
505 spin_lock(&zone_scan_mutex); 506 spin_lock(&zone_scan_lock);
506 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { 507 for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
507 zone_clear_flag(zone, ZONE_OOM_LOCKED); 508 zone_clear_flag(zone, ZONE_OOM_LOCKED);
508 } 509 }
509 spin_unlock(&zone_scan_mutex); 510 spin_unlock(&zone_scan_lock);
511}
512
513/*
514 * Must be called with tasklist_lock held for read.
515 */
516static void __out_of_memory(gfp_t gfp_mask, int order)
517{
518 if (sysctl_oom_kill_allocating_task) {
519 oom_kill_process(current, gfp_mask, order, 0, NULL,
520 "Out of memory (oom_kill_allocating_task)");
521
522 } else {
523 unsigned long points;
524 struct task_struct *p;
525
526retry:
527 /*
528 * Rambo mode: Shoot down a process and hope it solves whatever
529 * issues we may have.
530 */
531 p = select_bad_process(&points, NULL);
532
533 if (PTR_ERR(p) == -1UL)
534 return;
535
536 /* Found nothing?!?! Either we hang forever, or we panic. */
537 if (!p) {
538 read_unlock(&tasklist_lock);
539 panic("Out of memory and no killable processes...\n");
540 }
541
542 if (oom_kill_process(p, gfp_mask, order, points, NULL,
543 "Out of memory"))
544 goto retry;
545 }
546}
547
548/*
549 * pagefault handler calls into here because it is out of memory but
550 * doesn't know exactly how or why.
551 */
552void pagefault_out_of_memory(void)
553{
554 unsigned long freed = 0;
555
556 blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
557 if (freed > 0)
558 /* Got some memory back in the last second. */
559 return;
560
561 /*
 562	 * If this is from memcg, the oom-killer has already been invoked,
 563	 * and it is not worth going system-wide OOM.
564 */
565 if (mem_cgroup_oom_called(current))
566 goto rest_and_return;
567
568 if (sysctl_panic_on_oom)
569 panic("out of memory from page fault. panic_on_oom is selected.\n");
570
571 read_lock(&tasklist_lock);
572 __out_of_memory(0, 0); /* unknown gfp_mask and order */
573 read_unlock(&tasklist_lock);
574
575 /*
576 * Give "p" a good chance of killing itself before we
577 * retry to allocate memory.
578 */
579rest_and_return:
580 if (!test_thread_flag(TIF_MEMDIE))
581 schedule_timeout_uninterruptible(1);
510} 582}
511 583
512/** 584/**
@@ -522,8 +594,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
522 */ 594 */
523void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) 595void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
524{ 596{
525 struct task_struct *p;
526 unsigned long points = 0;
527 unsigned long freed = 0; 597 unsigned long freed = 0;
528 enum oom_constraint constraint; 598 enum oom_constraint constraint;
529 599
@@ -544,7 +614,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
544 614
545 switch (constraint) { 615 switch (constraint) {
546 case CONSTRAINT_MEMORY_POLICY: 616 case CONSTRAINT_MEMORY_POLICY:
547 oom_kill_process(current, gfp_mask, order, points, NULL, 617 oom_kill_process(current, gfp_mask, order, 0, NULL,
548 "No available memory (MPOL_BIND)"); 618 "No available memory (MPOL_BIND)");
549 break; 619 break;
550 620
@@ -553,35 +623,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
553 panic("out of memory. panic_on_oom is selected\n"); 623 panic("out of memory. panic_on_oom is selected\n");
554 /* Fall-through */ 624 /* Fall-through */
555 case CONSTRAINT_CPUSET: 625 case CONSTRAINT_CPUSET:
556 if (sysctl_oom_kill_allocating_task) { 626 __out_of_memory(gfp_mask, order);
557 oom_kill_process(current, gfp_mask, order, points, NULL,
558 "Out of memory (oom_kill_allocating_task)");
559 break;
560 }
561retry:
562 /*
563 * Rambo mode: Shoot down a process and hope it solves whatever
564 * issues we may have.
565 */
566 p = select_bad_process(&points, NULL);
567
568 if (PTR_ERR(p) == -1UL)
569 goto out;
570
571 /* Found nothing?!?! Either we hang forever, or we panic. */
572 if (!p) {
573 read_unlock(&tasklist_lock);
574 panic("Out of memory and no killable processes...\n");
575 }
576
577 if (oom_kill_process(p, gfp_mask, order, points, NULL,
578 "Out of memory"))
579 goto retry;
580
581 break; 627 break;
582 } 628 }
583 629
584out:
585 read_unlock(&tasklist_lock); 630 read_unlock(&tasklist_lock);
586 631
587 /* 632 /*
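
The refactored __out_of_memory() and the new pagefault_out_of_memory() keep honouring the existing policy sysctls (oom_kill_allocating_task, panic_on_oom, oom_dump_tasks). A small user-space sketch, not part of the patch, for inspecting those knobs; the /proc/sys/vm paths are the standard ones and the output layout is only illustrative.

#include <stdio.h>

static int read_knob(const char *path)
{
	FILE *f = fopen(path, "r");
	int val = -1;

	if (!f) {
		perror(path);
		return -1;
	}
	if (fscanf(f, "%d", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	/* the three knobs consulted on the (page-fault) OOM paths */
	printf("vm.panic_on_oom             = %d\n",
	       read_knob("/proc/sys/vm/panic_on_oom"));
	printf("vm.oom_kill_allocating_task = %d\n",
	       read_knob("/proc/sys/vm/oom_kill_allocating_task"));
	printf("vm.oom_dump_tasks           = %d\n",
	       read_knob("/proc/sys/vm/oom_dump_tasks"));
	return 0;
}
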
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 2970e35fd03f..b493db7841dc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -69,6 +69,12 @@ static inline long sync_writeback_pages(void)
69int dirty_background_ratio = 5; 69int dirty_background_ratio = 5;
70 70
71/* 71/*
72 * dirty_background_bytes starts at 0 (disabled) so that it is a function of
73 * dirty_background_ratio * the amount of dirtyable memory
74 */
75unsigned long dirty_background_bytes;
76
77/*
72 * free highmem will not be subtracted from the total free memory 78 * free highmem will not be subtracted from the total free memory
73 * for calculating free ratios if vm_highmem_is_dirtyable is true 79 * for calculating free ratios if vm_highmem_is_dirtyable is true
74 */ 80 */
@@ -80,6 +86,12 @@ int vm_highmem_is_dirtyable;
80int vm_dirty_ratio = 10; 86int vm_dirty_ratio = 10;
81 87
82/* 88/*
89 * vm_dirty_bytes starts at 0 (disabled) so that it is a function of
90 * vm_dirty_ratio * the amount of dirtyable memory
91 */
92unsigned long vm_dirty_bytes;
93
94/*
83 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks, in jiffies
84 */ 96 */
85int dirty_writeback_interval = 5 * HZ; 97int dirty_writeback_interval = 5 * HZ;
@@ -135,23 +147,75 @@ static int calc_period_shift(void)
135{ 147{
136 unsigned long dirty_total; 148 unsigned long dirty_total;
137 149
138 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100; 150 if (vm_dirty_bytes)
151 dirty_total = vm_dirty_bytes / PAGE_SIZE;
152 else
153 dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) /
154 100;
139 return 2 + ilog2(dirty_total - 1); 155 return 2 + ilog2(dirty_total - 1);
140} 156}
141 157
142/* 158/*
143 * update the period when the dirty ratio changes. 159 * update the period when the dirty threshold changes.
144 */ 160 */
161static void update_completion_period(void)
162{
163 int shift = calc_period_shift();
164 prop_change_shift(&vm_completions, shift);
165 prop_change_shift(&vm_dirties, shift);
166}
167
168int dirty_background_ratio_handler(struct ctl_table *table, int write,
169 struct file *filp, void __user *buffer, size_t *lenp,
170 loff_t *ppos)
171{
172 int ret;
173
174 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
175 if (ret == 0 && write)
176 dirty_background_bytes = 0;
177 return ret;
178}
179
180int dirty_background_bytes_handler(struct ctl_table *table, int write,
181 struct file *filp, void __user *buffer, size_t *lenp,
182 loff_t *ppos)
183{
184 int ret;
185
186 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
187 if (ret == 0 && write)
188 dirty_background_ratio = 0;
189 return ret;
190}
191
145int dirty_ratio_handler(struct ctl_table *table, int write, 192int dirty_ratio_handler(struct ctl_table *table, int write,
146 struct file *filp, void __user *buffer, size_t *lenp, 193 struct file *filp, void __user *buffer, size_t *lenp,
147 loff_t *ppos) 194 loff_t *ppos)
148{ 195{
149 int old_ratio = vm_dirty_ratio; 196 int old_ratio = vm_dirty_ratio;
150 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); 197 int ret;
198
199 ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
151 if (ret == 0 && write && vm_dirty_ratio != old_ratio) { 200 if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
152 int shift = calc_period_shift(); 201 update_completion_period();
153 prop_change_shift(&vm_completions, shift); 202 vm_dirty_bytes = 0;
154 prop_change_shift(&vm_dirties, shift); 203 }
204 return ret;
205}
206
207
208int dirty_bytes_handler(struct ctl_table *table, int write,
209 struct file *filp, void __user *buffer, size_t *lenp,
210 loff_t *ppos)
211{
212 int old_bytes = vm_dirty_bytes;
213 int ret;
214
215 ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
216 if (ret == 0 && write && vm_dirty_bytes != old_bytes) {
217 update_completion_period();
218 vm_dirty_ratio = 0;
155 } 219 }
156 return ret; 220 return ret;
157} 221}
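
These paired handlers make the byte-based and ratio-based limits mutually exclusive: writing vm.dirty_bytes clears vm.dirty_ratio and vice versa, and likewise for the background pair. A hedged user-space sketch (not from the patch) that sets dirty_bytes and shows the ratio being zeroed; it needs root, and the 64 MB value is arbitrary.

#include <stdio.h>

static long read_long(const char *path)
{
	FILE *f = fopen(path, "r");
	long v = -1;

	if (f) {
		if (fscanf(f, "%ld", &v) != 1)
			v = -1;
		fclose(f);
	}
	return v;
}

static int write_long(const char *path, long v)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%ld\n", v);
	return fclose(f);
}

int main(void)
{
	printf("before: dirty_ratio=%ld dirty_bytes=%ld\n",
	       read_long("/proc/sys/vm/dirty_ratio"),
	       read_long("/proc/sys/vm/dirty_bytes"));

	/* setting an absolute byte limit clears the ratio (and vice versa) */
	if (write_long("/proc/sys/vm/dirty_bytes", 64L * 1024 * 1024) != 0)
		perror("write dirty_bytes (are you root?)");

	printf("after:  dirty_ratio=%ld dirty_bytes=%ld\n",
	       read_long("/proc/sys/vm/dirty_ratio"),
	       read_long("/proc/sys/vm/dirty_bytes"));
	return 0;
}
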
@@ -362,26 +426,32 @@ unsigned long determine_dirtyable_memory(void)
362} 426}
363 427
364void 428void
365get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty, 429get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
366 struct backing_dev_info *bdi) 430 unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
367{ 431{
368 int background_ratio; /* Percentages */ 432 unsigned long background;
369 int dirty_ratio; 433 unsigned long dirty;
370 long background;
371 long dirty;
372 unsigned long available_memory = determine_dirtyable_memory(); 434 unsigned long available_memory = determine_dirtyable_memory();
373 struct task_struct *tsk; 435 struct task_struct *tsk;
374 436
375 dirty_ratio = vm_dirty_ratio; 437 if (vm_dirty_bytes)
376 if (dirty_ratio < 5) 438 dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE);
377 dirty_ratio = 5; 439 else {
440 int dirty_ratio;
378 441
379 background_ratio = dirty_background_ratio; 442 dirty_ratio = vm_dirty_ratio;
380 if (background_ratio >= dirty_ratio) 443 if (dirty_ratio < 5)
381 background_ratio = dirty_ratio / 2; 444 dirty_ratio = 5;
445 dirty = (dirty_ratio * available_memory) / 100;
446 }
447
448 if (dirty_background_bytes)
449 background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE);
450 else
451 background = (dirty_background_ratio * available_memory) / 100;
382 452
383 background = (background_ratio * available_memory) / 100; 453 if (background >= dirty)
384 dirty = (dirty_ratio * available_memory) / 100; 454 background = dirty / 2;
385 tsk = current; 455 tsk = current;
386 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { 456 if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
387 background += background / 4; 457 background += background / 4;
@@ -423,9 +493,9 @@ static void balance_dirty_pages(struct address_space *mapping)
423{ 493{
424 long nr_reclaimable, bdi_nr_reclaimable; 494 long nr_reclaimable, bdi_nr_reclaimable;
425 long nr_writeback, bdi_nr_writeback; 495 long nr_writeback, bdi_nr_writeback;
426 long background_thresh; 496 unsigned long background_thresh;
427 long dirty_thresh; 497 unsigned long dirty_thresh;
428 long bdi_thresh; 498 unsigned long bdi_thresh;
429 unsigned long pages_written = 0; 499 unsigned long pages_written = 0;
430 unsigned long write_chunk = sync_writeback_pages(); 500 unsigned long write_chunk = sync_writeback_pages();
431 501
@@ -580,8 +650,8 @@ EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr);
580 650
581void throttle_vm_writeout(gfp_t gfp_mask) 651void throttle_vm_writeout(gfp_t gfp_mask)
582{ 652{
583 long background_thresh; 653 unsigned long background_thresh;
584 long dirty_thresh; 654 unsigned long dirty_thresh;
585 655
586 for ( ; ; ) { 656 for ( ; ; ) {
587 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 657 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
@@ -624,8 +694,8 @@ static void background_writeout(unsigned long _min_pages)
624 }; 694 };
625 695
626 for ( ; ; ) { 696 for ( ; ; ) {
627 long background_thresh; 697 unsigned long background_thresh;
628 long dirty_thresh; 698 unsigned long dirty_thresh;
629 699
630 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); 700 get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
631 if (global_page_state(NR_FILE_DIRTY) + 701 if (global_page_state(NR_FILE_DIRTY) +
@@ -868,9 +938,11 @@ int write_cache_pages(struct address_space *mapping,
868 int done = 0; 938 int done = 0;
869 struct pagevec pvec; 939 struct pagevec pvec;
870 int nr_pages; 940 int nr_pages;
941 pgoff_t uninitialized_var(writeback_index);
871 pgoff_t index; 942 pgoff_t index;
872 pgoff_t end; /* Inclusive */ 943 pgoff_t end; /* Inclusive */
873 int scanned = 0; 944 pgoff_t done_index;
945 int cycled;
874 int range_whole = 0; 946 int range_whole = 0;
875 long nr_to_write = wbc->nr_to_write; 947 long nr_to_write = wbc->nr_to_write;
876 948
@@ -881,83 +953,134 @@ int write_cache_pages(struct address_space *mapping,
881 953
882 pagevec_init(&pvec, 0); 954 pagevec_init(&pvec, 0);
883 if (wbc->range_cyclic) { 955 if (wbc->range_cyclic) {
884 index = mapping->writeback_index; /* Start from prev offset */ 956 writeback_index = mapping->writeback_index; /* prev offset */
957 index = writeback_index;
958 if (index == 0)
959 cycled = 1;
960 else
961 cycled = 0;
885 end = -1; 962 end = -1;
886 } else { 963 } else {
887 index = wbc->range_start >> PAGE_CACHE_SHIFT; 964 index = wbc->range_start >> PAGE_CACHE_SHIFT;
888 end = wbc->range_end >> PAGE_CACHE_SHIFT; 965 end = wbc->range_end >> PAGE_CACHE_SHIFT;
889 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) 966 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
890 range_whole = 1; 967 range_whole = 1;
891 scanned = 1; 968 cycled = 1; /* ignore range_cyclic tests */
892 } 969 }
893retry: 970retry:
894 while (!done && (index <= end) && 971 done_index = index;
895 (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, 972 while (!done && (index <= end)) {
896 PAGECACHE_TAG_DIRTY, 973 int i;
897 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { 974
898 unsigned i; 975 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
976 PAGECACHE_TAG_DIRTY,
977 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
978 if (nr_pages == 0)
979 break;
899 980
900 scanned = 1;
901 for (i = 0; i < nr_pages; i++) { 981 for (i = 0; i < nr_pages; i++) {
902 struct page *page = pvec.pages[i]; 982 struct page *page = pvec.pages[i];
903 983
904 /* 984 /*
905 * At this point we hold neither mapping->tree_lock nor 985 * At this point, the page may be truncated or
906 * lock on the page itself: the page may be truncated or 986 * invalidated (changing page->mapping to NULL), or
907 * invalidated (changing page->mapping to NULL), or even 987 * even swizzled back from swapper_space to tmpfs file
908 * swizzled back from swapper_space to tmpfs file 988 * mapping. However, page->index will not change
909 * mapping 989 * because we have a reference on the page.
910 */ 990 */
991 if (page->index > end) {
992 /*
993 * can't be range_cyclic (1st pass) because
994 * end == -1 in that case.
995 */
996 done = 1;
997 break;
998 }
999
1000 done_index = page->index + 1;
1001
911 lock_page(page); 1002 lock_page(page);
912 1003
1004 /*
1005 * Page truncated or invalidated. We can freely skip it
1006 * then, even for data integrity operations: the page
1007 * has disappeared concurrently, so there could be no
 1008			 * real expectation of this data integrity operation
1009 * even if there is now a new, dirty page at the same
1010 * pagecache address.
1011 */
913 if (unlikely(page->mapping != mapping)) { 1012 if (unlikely(page->mapping != mapping)) {
1013continue_unlock:
914 unlock_page(page); 1014 unlock_page(page);
915 continue; 1015 continue;
916 } 1016 }
917 1017
918 if (!wbc->range_cyclic && page->index > end) { 1018 if (!PageDirty(page)) {
919 done = 1; 1019 /* someone wrote it for us */
920 unlock_page(page); 1020 goto continue_unlock;
921 continue;
922 } 1021 }
923 1022
924 if (wbc->sync_mode != WB_SYNC_NONE) 1023 if (PageWriteback(page)) {
925 wait_on_page_writeback(page); 1024 if (wbc->sync_mode != WB_SYNC_NONE)
926 1025 wait_on_page_writeback(page);
927 if (PageWriteback(page) || 1026 else
928 !clear_page_dirty_for_io(page)) { 1027 goto continue_unlock;
929 unlock_page(page);
930 continue;
931 } 1028 }
932 1029
933 ret = (*writepage)(page, wbc, data); 1030 BUG_ON(PageWriteback(page));
1031 if (!clear_page_dirty_for_io(page))
1032 goto continue_unlock;
934 1033
935 if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { 1034 ret = (*writepage)(page, wbc, data);
936 unlock_page(page); 1035 if (unlikely(ret)) {
937 ret = 0; 1036 if (ret == AOP_WRITEPAGE_ACTIVATE) {
1037 unlock_page(page);
1038 ret = 0;
1039 } else {
1040 /*
1041 * done_index is set past this page,
1042 * so media errors will not choke
1043 * background writeout for the entire
1044 * file. This has consequences for
1045 * range_cyclic semantics (ie. it may
1046 * not be suitable for data integrity
1047 * writeout).
1048 */
1049 done = 1;
1050 break;
1051 }
1052 }
1053
1054 if (wbc->sync_mode == WB_SYNC_NONE) {
1055 wbc->nr_to_write--;
1056 if (wbc->nr_to_write <= 0) {
1057 done = 1;
1058 break;
1059 }
938 } 1060 }
939 if (ret || (--nr_to_write <= 0))
940 done = 1;
941 if (wbc->nonblocking && bdi_write_congested(bdi)) { 1061 if (wbc->nonblocking && bdi_write_congested(bdi)) {
942 wbc->encountered_congestion = 1; 1062 wbc->encountered_congestion = 1;
943 done = 1; 1063 done = 1;
1064 break;
944 } 1065 }
945 } 1066 }
946 pagevec_release(&pvec); 1067 pagevec_release(&pvec);
947 cond_resched(); 1068 cond_resched();
948 } 1069 }
949 if (!scanned && !done) { 1070 if (!cycled) {
950 /* 1071 /*
1072 * range_cyclic:
951 * We hit the last page and there is more work to be done: wrap 1073 * We hit the last page and there is more work to be done: wrap
952 * back to the start of the file 1074 * back to the start of the file
953 */ 1075 */
954 scanned = 1; 1076 cycled = 1;
955 index = 0; 1077 index = 0;
1078 end = writeback_index - 1;
956 goto retry; 1079 goto retry;
957 } 1080 }
958 if (!wbc->no_nrwrite_index_update) { 1081 if (!wbc->no_nrwrite_index_update) {
959 if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) 1082 if (wbc->range_cyclic || (range_whole && nr_to_write > 0))
960 mapping->writeback_index = index; 1083 mapping->writeback_index = done_index;
961 wbc->nr_to_write = nr_to_write; 1084 wbc->nr_to_write = nr_to_write;
962 } 1085 }
963 1086
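
A standalone sketch, not kernel code, of the revised write_cache_pages() scan order: start at the saved writeback_index, run to the end of the range, then wrap exactly once and stop just short of the starting point, remembering done_index for the next pass. The small dirty[] array merely stands in for pages tagged PAGECACHE_TAG_DIRTY.

#include <stdio.h>

#define NR_PAGES 16

int main(void)
{
	int dirty[NR_PAGES] = { 0 };
	unsigned long writeback_index = 10;	/* saved by the previous pass */
	unsigned long index = writeback_index;
	unsigned long end = NR_PAGES - 1;
	unsigned long done_index = index;
	int cycled = (index == 0);

	dirty[3] = dirty[7] = dirty[12] = 1;	/* toy "tagged dirty" pages */

retry:
	while (index <= end) {
		if (dirty[index]) {
			done_index = index + 1;	/* resume point if we stop here */
			printf("writing page %lu\n", index);
			dirty[index] = 0;
		}
		index++;
	}
	if (!cycled) {
		/* range_cyclic: wrap to the start of the file, exactly once */
		cycled = 1;
		index = 0;
		end = writeback_index - 1;
		goto retry;
	}

	printf("mapping->writeback_index would become %lu\n", done_index);
	return 0;
}
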
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index d8ac01474563..5675b3073854 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states);
69 69
70unsigned long totalram_pages __read_mostly; 70unsigned long totalram_pages __read_mostly;
71unsigned long totalreserve_pages __read_mostly; 71unsigned long totalreserve_pages __read_mostly;
72long nr_swap_pages; 72unsigned long highest_memmap_pfn __read_mostly;
73int percpu_pagelist_fraction; 73int percpu_pagelist_fraction;
74 74
75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE 75#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
@@ -223,19 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page)
223 223
224static void bad_page(struct page *page) 224static void bad_page(struct page *page)
225{ 225{
226 printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG 226 static unsigned long resume;
227 "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", 227 static unsigned long nr_shown;
228 current->comm, page, (int)(2*sizeof(unsigned long)), 228 static unsigned long nr_unshown;
229 (unsigned long)page->flags, page->mapping, 229
230 page_mapcount(page), page_count(page)); 230 /*
231 * Allow a burst of 60 reports, then keep quiet for that minute;
232 * or allow a steady drip of one report per second.
233 */
234 if (nr_shown == 60) {
235 if (time_before(jiffies, resume)) {
236 nr_unshown++;
237 goto out;
238 }
239 if (nr_unshown) {
240 printk(KERN_ALERT
241 "BUG: Bad page state: %lu messages suppressed\n",
242 nr_unshown);
243 nr_unshown = 0;
244 }
245 nr_shown = 0;
246 }
247 if (nr_shown++ == 0)
248 resume = jiffies + 60 * HZ;
249
250 printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n",
251 current->comm, page_to_pfn(page));
252 printk(KERN_ALERT
253 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
254 page, (void *)page->flags, page_count(page),
255 page_mapcount(page), page->mapping, page->index);
231 256
232 printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
233 KERN_EMERG "Backtrace:\n");
234 dump_stack(); 257 dump_stack();
235 page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; 258out:
236 set_page_count(page, 0); 259 /* Leave bad fields for debug, except PageBuddy could make trouble */
237 reset_page_mapcount(page); 260 __ClearPageBuddy(page);
238 page->mapping = NULL;
239 add_taint(TAINT_BAD_PAGE); 261 add_taint(TAINT_BAD_PAGE);
240} 262}
241 263
@@ -292,25 +314,31 @@ void prep_compound_gigantic_page(struct page *page, unsigned long order)
292} 314}
293#endif 315#endif
294 316
295static void destroy_compound_page(struct page *page, unsigned long order) 317static int destroy_compound_page(struct page *page, unsigned long order)
296{ 318{
297 int i; 319 int i;
298 int nr_pages = 1 << order; 320 int nr_pages = 1 << order;
321 int bad = 0;
299 322
300 if (unlikely(compound_order(page) != order)) 323 if (unlikely(compound_order(page) != order) ||
324 unlikely(!PageHead(page))) {
301 bad_page(page); 325 bad_page(page);
326 bad++;
327 }
302 328
303 if (unlikely(!PageHead(page)))
304 bad_page(page);
305 __ClearPageHead(page); 329 __ClearPageHead(page);
330
306 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
307 struct page *p = page + i; 332 struct page *p = page + i;
308 333
309 if (unlikely(!PageTail(p) | 334 if (unlikely(!PageTail(p) | (p->first_page != page))) {
310 (p->first_page != page)))
311 bad_page(page); 335 bad_page(page);
336 bad++;
337 }
312 __ClearPageTail(p); 338 __ClearPageTail(p);
313 } 339 }
340
341 return bad;
314} 342}
315 343
316static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) 344static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -430,7 +458,8 @@ static inline void __free_one_page(struct page *page,
430 int migratetype = get_pageblock_migratetype(page); 458 int migratetype = get_pageblock_migratetype(page);
431 459
432 if (unlikely(PageCompound(page))) 460 if (unlikely(PageCompound(page)))
433 destroy_compound_page(page, order); 461 if (unlikely(destroy_compound_page(page, order)))
462 return;
434 463
435 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); 464 page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
436 465
@@ -467,18 +496,13 @@ static inline int free_pages_check(struct page *page)
467 if (unlikely(page_mapcount(page) | 496 if (unlikely(page_mapcount(page) |
468 (page->mapping != NULL) | 497 (page->mapping != NULL) |
469 (page_count(page) != 0) | 498 (page_count(page) != 0) |
470 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) 499 (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) {
471 bad_page(page); 500 bad_page(page);
472 if (PageDirty(page)) 501 return 1;
473 __ClearPageDirty(page); 502 }
474 if (PageSwapBacked(page)) 503 if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
475 __ClearPageSwapBacked(page); 504 page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
476 /* 505 return 0;
477 * For now, we report if PG_reserved was found set, but do not
478 * clear it, and do not free the page. But we shall soon need
479 * to do more, for when the ZERO_PAGE count wraps negative.
480 */
481 return PageReserved(page);
482} 506}
483 507
484/* 508/*
@@ -523,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order)
523{ 547{
524 unsigned long flags; 548 unsigned long flags;
525 int i; 549 int i;
526 int reserved = 0; 550 int bad = 0;
527 551
528 for (i = 0 ; i < (1 << order) ; ++i) 552 for (i = 0 ; i < (1 << order) ; ++i)
529 reserved += free_pages_check(page + i); 553 bad += free_pages_check(page + i);
530 if (reserved) 554 if (bad)
531 return; 555 return;
532 556
533 if (!PageHighMem(page)) { 557 if (!PageHighMem(page)) {
@@ -612,23 +636,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
612 if (unlikely(page_mapcount(page) | 636 if (unlikely(page_mapcount(page) |
613 (page->mapping != NULL) | 637 (page->mapping != NULL) |
614 (page_count(page) != 0) | 638 (page_count(page) != 0) |
615 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) 639 (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) {
616 bad_page(page); 640 bad_page(page);
617
618 /*
619 * For now, we report if PG_reserved was found set, but do not
620 * clear it, and do not allocate the page: as a safety net.
621 */
622 if (PageReserved(page))
623 return 1; 641 return 1;
642 }
624 643
625 page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim |
626 1 << PG_referenced | 1 << PG_arch_1 |
627 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk
628#ifdef CONFIG_UNEVICTABLE_LRU
629 | 1 << PG_mlocked
630#endif
631 );
632 set_page_private(page, 0); 644 set_page_private(page, 0);
633 set_page_refcounted(page); 645 set_page_refcounted(page);
634 646
@@ -2609,6 +2621,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
2609 unsigned long pfn; 2621 unsigned long pfn;
2610 struct zone *z; 2622 struct zone *z;
2611 2623
2624 if (highest_memmap_pfn < end_pfn - 1)
2625 highest_memmap_pfn = end_pfn - 1;
2626
2612 z = &NODE_DATA(nid)->node_zones[zone]; 2627 z = &NODE_DATA(nid)->node_zones[zone];
2613 for (pfn = start_pfn; pfn < end_pfn; pfn++) { 2628 for (pfn = start_pfn; pfn < end_pfn; pfn++) {
2614 /* 2629 /*
@@ -3381,10 +3396,8 @@ static void __init setup_usemap(struct pglist_data *pgdat,
3381{ 3396{
3382 unsigned long usemapsize = usemap_size(zonesize); 3397 unsigned long usemapsize = usemap_size(zonesize);
3383 zone->pageblock_flags = NULL; 3398 zone->pageblock_flags = NULL;
3384 if (usemapsize) { 3399 if (usemapsize)
3385 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); 3400 zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize);
3386 memset(zone->pageblock_flags, 0, usemapsize);
3387 }
3388} 3401}
3389#else 3402#else
3390static void inline setup_usemap(struct pglist_data *pgdat, 3403static void inline setup_usemap(struct pglist_data *pgdat,
@@ -3469,9 +3482,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3469 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; 3482 PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT;
3470 if (realsize >= memmap_pages) { 3483 if (realsize >= memmap_pages) {
3471 realsize -= memmap_pages; 3484 realsize -= memmap_pages;
3472 printk(KERN_DEBUG 3485 if (memmap_pages)
3473 " %s zone: %lu pages used for memmap\n", 3486 printk(KERN_DEBUG
3474 zone_names[j], memmap_pages); 3487 " %s zone: %lu pages used for memmap\n",
3488 zone_names[j], memmap_pages);
3475 } else 3489 } else
3476 printk(KERN_WARNING 3490 printk(KERN_WARNING
3477 " %s zone: %lu pages exceeds realsize %lu\n", 3491 " %s zone: %lu pages exceeds realsize %lu\n",
@@ -3509,10 +3523,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
3509 INIT_LIST_HEAD(&zone->lru[l].list); 3523 INIT_LIST_HEAD(&zone->lru[l].list);
3510 zone->lru[l].nr_scan = 0; 3524 zone->lru[l].nr_scan = 0;
3511 } 3525 }
3512 zone->recent_rotated[0] = 0; 3526 zone->reclaim_stat.recent_rotated[0] = 0;
3513 zone->recent_rotated[1] = 0; 3527 zone->reclaim_stat.recent_rotated[1] = 0;
3514 zone->recent_scanned[0] = 0; 3528 zone->reclaim_stat.recent_scanned[0] = 0;
3515 zone->recent_scanned[1] = 0; 3529 zone->reclaim_stat.recent_scanned[1] = 0;
3516 zap_zone_vm_stats(zone); 3530 zap_zone_vm_stats(zone);
3517 zone->flags = 0; 3531 zone->flags = 0;
3518 if (!size) 3532 if (!size)
@@ -4316,7 +4330,7 @@ void setup_per_zone_pages_min(void)
4316 * 1TB 101 10GB 4330 * 1TB 101 10GB
4317 * 10TB 320 32GB 4331 * 10TB 320 32GB
4318 */ 4332 */
4319void setup_per_zone_inactive_ratio(void) 4333static void setup_per_zone_inactive_ratio(void)
4320{ 4334{
4321 struct zone *zone; 4335 struct zone *zone;
4322 4336
@@ -4573,19 +4587,6 @@ void *__init alloc_large_system_hash(const char *tablename,
4573 return table; 4587 return table;
4574} 4588}
4575 4589
4576#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
4577struct page *pfn_to_page(unsigned long pfn)
4578{
4579 return __pfn_to_page(pfn);
4580}
4581unsigned long page_to_pfn(struct page *page)
4582{
4583 return __page_to_pfn(page);
4584}
4585EXPORT_SYMBOL(pfn_to_page);
4586EXPORT_SYMBOL(page_to_pfn);
4587#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
4588
4589/* Return a pointer to the bitmap storing bits affecting a block of pages */ 4590/* Return a pointer to the bitmap storing bits affecting a block of pages */
4590static inline unsigned long *get_pageblock_bitmap(struct zone *zone, 4591static inline unsigned long *get_pageblock_bitmap(struct zone *zone,
4591 unsigned long pfn) 4592 unsigned long pfn)
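
The new bad_page() reporting is throttled: a burst of up to 60 messages, then silence for the rest of that minute with a count of what was suppressed. Below is a standalone sketch of the same burst-then-quiet logic, using time() in place of jiffies; the 100-iteration driver is only for demonstration.

#include <stdio.h>
#include <time.h>

static int report_bad_page(unsigned long pfn)
{
	static time_t resume;
	static unsigned long nr_shown;
	static unsigned long nr_unshown;
	time_t now = time(NULL);

	if (nr_shown == 60) {
		if (now < resume) {
			nr_unshown++;		/* still inside the quiet minute */
			return 0;
		}
		if (nr_unshown) {
			printf("BUG: Bad page state: %lu messages suppressed\n",
			       nr_unshown);
			nr_unshown = 0;
		}
		nr_shown = 0;
	}
	if (nr_shown++ == 0)
		resume = now + 60;		/* start of a new one-minute window */

	printf("BUG: Bad page state pfn:%05lx\n", pfn);
	return 1;
}

int main(void)
{
	unsigned long pfn;

	/* 100 back-to-back reports: the first 60 print, the rest are counted */
	for (pfn = 0; pfn < 100; pfn++)
		report_bad_page(pfn);
	return 0;
}
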
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ab27ff750519..7006a11350c8 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -8,6 +8,7 @@
8#include <linux/memory.h> 8#include <linux/memory.h>
9#include <linux/vmalloc.h> 9#include <linux/vmalloc.h>
10#include <linux/cgroup.h> 10#include <linux/cgroup.h>
11#include <linux/swapops.h>
11 12
12static void __meminit 13static void __meminit
13__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) 14__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
@@ -15,6 +16,7 @@ __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn)
15 pc->flags = 0; 16 pc->flags = 0;
16 pc->mem_cgroup = NULL; 17 pc->mem_cgroup = NULL;
17 pc->page = pfn_to_page(pfn); 18 pc->page = pfn_to_page(pfn);
19 INIT_LIST_HEAD(&pc->lru);
18} 20}
19static unsigned long total_usage; 21static unsigned long total_usage;
20 22
@@ -72,7 +74,7 @@ void __init page_cgroup_init(void)
72 74
73 int nid, fail; 75 int nid, fail;
74 76
75 if (mem_cgroup_subsys.disabled) 77 if (mem_cgroup_disabled())
76 return; 78 return;
77 79
78 for_each_online_node(nid) { 80 for_each_online_node(nid) {
@@ -101,15 +103,13 @@ struct page_cgroup *lookup_page_cgroup(struct page *page)
101} 103}
102 104
103/* __alloc_bootmem...() is protected by !slab_available() */ 105/* __alloc_bootmem...() is protected by !slab_available() */
104int __init_refok init_section_page_cgroup(unsigned long pfn) 106static int __init_refok init_section_page_cgroup(unsigned long pfn)
105{ 107{
106 struct mem_section *section; 108 struct mem_section *section = __pfn_to_section(pfn);
107 struct page_cgroup *base, *pc; 109 struct page_cgroup *base, *pc;
108 unsigned long table_size; 110 unsigned long table_size;
109 int nid, index; 111 int nid, index;
110 112
111 section = __pfn_to_section(pfn);
112
113 if (!section->page_cgroup) { 113 if (!section->page_cgroup) {
114 nid = page_to_nid(pfn_to_page(pfn)); 114 nid = page_to_nid(pfn_to_page(pfn));
115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; 115 table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION;
@@ -145,7 +145,6 @@ int __init_refok init_section_page_cgroup(unsigned long pfn)
145 __init_page_cgroup(pc, pfn + index); 145 __init_page_cgroup(pc, pfn + index);
146 } 146 }
147 147
148 section = __pfn_to_section(pfn);
149 section->page_cgroup = base - pfn; 148 section->page_cgroup = base - pfn;
150 total_usage += table_size; 149 total_usage += table_size;
151 return 0; 150 return 0;
@@ -248,7 +247,7 @@ void __init page_cgroup_init(void)
248 unsigned long pfn; 247 unsigned long pfn;
249 int fail = 0; 248 int fail = 0;
250 249
251 if (mem_cgroup_subsys.disabled) 250 if (mem_cgroup_disabled())
252 return; 251 return;
253 252
254 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { 253 for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) {
@@ -273,3 +272,199 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat)
273} 272}
274 273
275#endif 274#endif
275
276
277#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
278
279static DEFINE_MUTEX(swap_cgroup_mutex);
280struct swap_cgroup_ctrl {
281 struct page **map;
282 unsigned long length;
283};
284
285struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
286
287/*
288 * These 8 bytes seem big; maybe we can reduce this once we can use an "id"
289 * for the cgroup rather than a pointer.
290 */
291struct swap_cgroup {
292 struct mem_cgroup *val;
293};
294#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
295#define SC_POS_MASK (SC_PER_PAGE - 1)
296
297/*
298 * SwapCgroup implements "lookup" and "exchange" operations.
299 * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge
300 * against SwapCache. At swap_free(), this is accessed directly from swap.
301 *
302 * This means,
303 * - we have no race in "exchange" when we're accessed via SwapCache because
304 *   SwapCache (and its swp_entry) is under lock.
305 * - When called via swap_free(), there is no user of this entry and no race.
306 * Then, we don't need lock around "exchange".
307 *
308 * TODO: we can push these buffers out to HIGHMEM.
309 */
310
311/*
312 * allocate buffer for swap_cgroup.
313 */
314static int swap_cgroup_prepare(int type)
315{
316 struct page *page;
317 struct swap_cgroup_ctrl *ctrl;
318 unsigned long idx, max;
319
320 if (!do_swap_account)
321 return 0;
322 ctrl = &swap_cgroup_ctrl[type];
323
324 for (idx = 0; idx < ctrl->length; idx++) {
325 page = alloc_page(GFP_KERNEL | __GFP_ZERO);
326 if (!page)
327 goto not_enough_page;
328 ctrl->map[idx] = page;
329 }
330 return 0;
331not_enough_page:
332 max = idx;
333 for (idx = 0; idx < max; idx++)
334 __free_page(ctrl->map[idx]);
335
336 return -ENOMEM;
337}
338
339/**
340 * swap_cgroup_record - record mem_cgroup for this swp_entry.
341 * @ent: swap entry to be recorded into
342 * @mem: mem_cgroup to be recorded
343 *
344 * Returns old value at success, NULL at failure.
345 * (Of course, old value can be NULL.)
346 */
347struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
348{
349 int type = swp_type(ent);
350 unsigned long offset = swp_offset(ent);
351 unsigned long idx = offset / SC_PER_PAGE;
352 unsigned long pos = offset & SC_POS_MASK;
353 struct swap_cgroup_ctrl *ctrl;
354 struct page *mappage;
355 struct swap_cgroup *sc;
356 struct mem_cgroup *old;
357
358 if (!do_swap_account)
359 return NULL;
360
361 ctrl = &swap_cgroup_ctrl[type];
362
363 mappage = ctrl->map[idx];
364 sc = page_address(mappage);
365 sc += pos;
366 old = sc->val;
367 sc->val = mem;
368
369 return old;
370}
371
372/**
373 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
374 * @ent: swap entry to be looked up.
375 *
376 * Returns pointer to mem_cgroup at success. NULL at failure.
377 */
378struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
379{
380 int type = swp_type(ent);
381 unsigned long offset = swp_offset(ent);
382 unsigned long idx = offset / SC_PER_PAGE;
383 unsigned long pos = offset & SC_POS_MASK;
384 struct swap_cgroup_ctrl *ctrl;
385 struct page *mappage;
386 struct swap_cgroup *sc;
387 struct mem_cgroup *ret;
388
389 if (!do_swap_account)
390 return NULL;
391
392 ctrl = &swap_cgroup_ctrl[type];
393 mappage = ctrl->map[idx];
394 sc = page_address(mappage);
395 sc += pos;
396 ret = sc->val;
397 return ret;
398}
399
400int swap_cgroup_swapon(int type, unsigned long max_pages)
401{
402 void *array;
403 unsigned long array_size;
404 unsigned long length;
405 struct swap_cgroup_ctrl *ctrl;
406
407 if (!do_swap_account)
408 return 0;
409
410 length = ((max_pages/SC_PER_PAGE) + 1);
411 array_size = length * sizeof(void *);
412
413 array = vmalloc(array_size);
414 if (!array)
415 goto nomem;
416
417 memset(array, 0, array_size);
418 ctrl = &swap_cgroup_ctrl[type];
419 mutex_lock(&swap_cgroup_mutex);
420 ctrl->length = length;
421 ctrl->map = array;
422 if (swap_cgroup_prepare(type)) {
423 /* memory shortage */
424 ctrl->map = NULL;
425 ctrl->length = 0;
426 vfree(array);
427 mutex_unlock(&swap_cgroup_mutex);
428 goto nomem;
429 }
430 mutex_unlock(&swap_cgroup_mutex);
431
432 printk(KERN_INFO
433 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
434 " and %ld bytes to hold mem_cgroup pointers on swap\n",
435 array_size, length * PAGE_SIZE);
436 printk(KERN_INFO
437 "swap_cgroup can be disabled by noswapaccount boot option.\n");
438
439 return 0;
440nomem:
441 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
442 printk(KERN_INFO
443 "swap_cgroup can be disabled by noswapaccount boot option\n");
444 return -ENOMEM;
445}
446
447void swap_cgroup_swapoff(int type)
448{
449 int i;
450 struct swap_cgroup_ctrl *ctrl;
451
452 if (!do_swap_account)
453 return;
454
455 mutex_lock(&swap_cgroup_mutex);
456 ctrl = &swap_cgroup_ctrl[type];
457 if (ctrl->map) {
458 for (i = 0; i < ctrl->length; i++) {
459 struct page *page = ctrl->map[i];
460 if (page)
461 __free_page(page);
462 }
463 vfree(ctrl->map);
464 ctrl->map = NULL;
465 ctrl->length = 0;
466 }
467 mutex_unlock(&swap_cgroup_mutex);
468}
469
470#endif
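
The swap_cgroup map is an array of pages, each holding PAGE_SIZE / sizeof(struct swap_cgroup) entries, so a swap offset splits into a page index and an in-page slot. A standalone sketch of that indexing follows, with malloc standing in for alloc_page()/vmalloc() and an assumed 4096-byte page size.

#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE	4096UL	/* assumed here; the kernel uses the real page size */

struct swap_cgroup {
	void *val;		/* the kernel stores a struct mem_cgroup pointer */
};

#define SC_PER_PAGE	(PAGE_SIZE / sizeof(struct swap_cgroup))
#define SC_POS_MASK	(SC_PER_PAGE - 1)

struct swap_cgroup_ctrl {
	struct swap_cgroup **map;	/* one "page" of entries per slot */
	unsigned long length;
};

/* same arithmetic as swap_cgroup_record()/lookup_swap_cgroup() */
static struct swap_cgroup *sc_lookup(struct swap_cgroup_ctrl *ctrl,
				     unsigned long offset)
{
	unsigned long idx = offset / SC_PER_PAGE;	/* which backing page */
	unsigned long pos = offset & SC_POS_MASK;	/* slot within that page */

	return &ctrl->map[idx][pos];
}

int main(void)
{
	static int dummy_memcg;			/* stand-in for a mem_cgroup */
	struct swap_cgroup_ctrl ctrl;
	unsigned long max_pages = 10000, i;

	ctrl.length = max_pages / SC_PER_PAGE + 1;
	ctrl.map = calloc(ctrl.length, sizeof(*ctrl.map));
	for (i = 0; i < ctrl.length; i++)
		ctrl.map[i] = calloc(SC_PER_PAGE, sizeof(struct swap_cgroup));

	/* "record" then "lookup" an owner for swap offset 5000 */
	sc_lookup(&ctrl, 5000)->val = &dummy_memcg;
	printf("offset 5000 -> page %lu, slot %lu, owner %p\n",
	       5000 / SC_PER_PAGE, 5000UL & SC_POS_MASK,
	       sc_lookup(&ctrl, 5000)->val);

	for (i = 0; i < ctrl.length; i++)
		free(ctrl.map[i]);
	free(ctrl.map);
	return 0;
}
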
diff --git a/mm/page_io.c b/mm/page_io.c
index 065c4480eaf0..dc6ce0afbded 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -98,7 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
98 struct bio *bio; 98 struct bio *bio;
99 int ret = 0, rw = WRITE; 99 int ret = 0, rw = WRITE;
100 100
101 if (remove_exclusive_swap_page(page)) { 101 if (try_to_free_swap(page)) {
102 unlock_page(page); 102 unlock_page(page);
103 goto out; 103 goto out;
104 } 104 }
@@ -125,8 +125,8 @@ int swap_readpage(struct file *file, struct page *page)
125 struct bio *bio; 125 struct bio *bio;
126 int ret = 0; 126 int ret = 0;
127 127
128 BUG_ON(!PageLocked(page)); 128 VM_BUG_ON(!PageLocked(page));
129 BUG_ON(PageUptodate(page)); 129 VM_BUG_ON(PageUptodate(page));
130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page, 130 bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
131 end_swap_bio_read); 131 end_swap_bio_read);
132 if (bio == NULL) { 132 if (bio == NULL) {
diff --git a/mm/rmap.c b/mm/rmap.c
index 10993942d6c9..ac4af8cffbf9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -47,9 +47,9 @@
47#include <linux/rmap.h> 47#include <linux/rmap.h>
48#include <linux/rcupdate.h> 48#include <linux/rcupdate.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kallsyms.h>
51#include <linux/memcontrol.h> 50#include <linux/memcontrol.h>
52#include <linux/mmu_notifier.h> 51#include <linux/mmu_notifier.h>
52#include <linux/migrate.h>
53 53
54#include <asm/tlbflush.h> 54#include <asm/tlbflush.h>
55 55
@@ -191,7 +191,7 @@ void __init anon_vma_init(void)
191 * Getting a lock on a stable anon_vma from a page off the LRU is 191 * Getting a lock on a stable anon_vma from a page off the LRU is
192 * tricky: page_lock_anon_vma rely on RCU to guard against the races. 192 * tricky: page_lock_anon_vma rely on RCU to guard against the races.
193 */ 193 */
194struct anon_vma *page_lock_anon_vma(struct page *page) 194static struct anon_vma *page_lock_anon_vma(struct page *page)
195{ 195{
196 struct anon_vma *anon_vma; 196 struct anon_vma *anon_vma;
197 unsigned long anon_mapping; 197 unsigned long anon_mapping;
@@ -211,7 +211,7 @@ out:
211 return NULL; 211 return NULL;
212} 212}
213 213
214void page_unlock_anon_vma(struct anon_vma *anon_vma) 214static void page_unlock_anon_vma(struct anon_vma *anon_vma)
215{ 215{
216 spin_unlock(&anon_vma->lock); 216 spin_unlock(&anon_vma->lock);
217 rcu_read_unlock(); 217 rcu_read_unlock();
@@ -359,8 +359,17 @@ static int page_referenced_one(struct page *page,
359 goto out_unmap; 359 goto out_unmap;
360 } 360 }
361 361
362 if (ptep_clear_flush_young_notify(vma, address, pte)) 362 if (ptep_clear_flush_young_notify(vma, address, pte)) {
363 referenced++; 363 /*
364 * Don't treat a reference through a sequentially read
365 * mapping as such. If the page has been used in
366 * another mapping, we will catch it; if this other
367 * mapping is already gone, the unmap path will have
368 * set PG_referenced or activated the page.
369 */
370 if (likely(!VM_SequentialReadHint(vma)))
371 referenced++;
372 }
364 373
365 /* Pretend the page is referenced if the task has the 374 /* Pretend the page is referenced if the task has the
366 swap token and is in the middle of a page fault. */ 375 swap token and is in the middle of a page fault. */
@@ -661,9 +670,14 @@ void page_add_anon_rmap(struct page *page,
661void page_add_new_anon_rmap(struct page *page, 670void page_add_new_anon_rmap(struct page *page,
662 struct vm_area_struct *vma, unsigned long address) 671 struct vm_area_struct *vma, unsigned long address)
663{ 672{
664 BUG_ON(address < vma->vm_start || address >= vma->vm_end); 673 VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
665 atomic_set(&page->_mapcount, 0); /* elevate count by 1 (starts at -1) */ 674 SetPageSwapBacked(page);
675 atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */
666 __page_set_anon_rmap(page, vma, address); 676 __page_set_anon_rmap(page, vma, address);
677 if (page_evictable(page, vma))
678 lru_cache_add_lru(page, LRU_ACTIVE_ANON);
679 else
680 add_page_to_unevictable_list(page);
667} 681}
668 682
669/** 683/**
@@ -693,7 +707,6 @@ void page_add_file_rmap(struct page *page)
693 */ 707 */
694void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) 708void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address)
695{ 709{
696 BUG_ON(page_mapcount(page) == 0);
697 if (PageAnon(page)) 710 if (PageAnon(page))
698 __page_check_anon_rmap(page, vma, address); 711 __page_check_anon_rmap(page, vma, address);
699 atomic_inc(&page->_mapcount); 712 atomic_inc(&page->_mapcount);
@@ -703,28 +716,12 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long
703/** 716/**
704 * page_remove_rmap - take down pte mapping from a page 717 * page_remove_rmap - take down pte mapping from a page
705 * @page: page to remove mapping from 718 * @page: page to remove mapping from
706 * @vma: the vm area in which the mapping is removed
707 * 719 *
708 * The caller needs to hold the pte lock. 720 * The caller needs to hold the pte lock.
709 */ 721 */
710void page_remove_rmap(struct page *page, struct vm_area_struct *vma) 722void page_remove_rmap(struct page *page)
711{ 723{
712 if (atomic_add_negative(-1, &page->_mapcount)) { 724 if (atomic_add_negative(-1, &page->_mapcount)) {
713 if (unlikely(page_mapcount(page) < 0)) {
714 printk (KERN_EMERG "Eeek! page_mapcount(page) went negative! (%d)\n", page_mapcount(page));
715 printk (KERN_EMERG " page pfn = %lx\n", page_to_pfn(page));
716 printk (KERN_EMERG " page->flags = %lx\n", page->flags);
717 printk (KERN_EMERG " page->count = %x\n", page_count(page));
718 printk (KERN_EMERG " page->mapping = %p\n", page->mapping);
719 print_symbol (KERN_EMERG " vma->vm_ops = %s\n", (unsigned long)vma->vm_ops);
720 if (vma->vm_ops) {
721 print_symbol (KERN_EMERG " vma->vm_ops->fault = %s\n", (unsigned long)vma->vm_ops->fault);
722 }
723 if (vma->vm_file && vma->vm_file->f_op)
724 print_symbol (KERN_EMERG " vma->vm_file->f_op->mmap = %s\n", (unsigned long)vma->vm_file->f_op->mmap);
725 BUG();
726 }
727
728 /* 725 /*
729 * Now that the last pte has gone, s390 must transfer dirty 726 * Now that the last pte has gone, s390 must transfer dirty
730 * flag from storage key to struct page. We can usually skip 727 * flag from storage key to struct page. We can usually skip
@@ -818,8 +815,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
818 spin_unlock(&mmlist_lock); 815 spin_unlock(&mmlist_lock);
819 } 816 }
820 dec_mm_counter(mm, anon_rss); 817 dec_mm_counter(mm, anon_rss);
821#ifdef CONFIG_MIGRATION 818 } else if (PAGE_MIGRATION) {
822 } else {
823 /* 819 /*
824 * Store the pfn of the page in a special migration 820 * Store the pfn of the page in a special migration
825 * pte. do_swap_page() will wait until the migration 821 * pte. do_swap_page() will wait until the migration
@@ -827,23 +823,19 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
827 */ 823 */
828 BUG_ON(!migration); 824 BUG_ON(!migration);
829 entry = make_migration_entry(page, pte_write(pteval)); 825 entry = make_migration_entry(page, pte_write(pteval));
830#endif
831 } 826 }
832 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 827 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
833 BUG_ON(pte_file(*pte)); 828 BUG_ON(pte_file(*pte));
834 } else 829 } else if (PAGE_MIGRATION && migration) {
835#ifdef CONFIG_MIGRATION
836 if (migration) {
837 /* Establish migration entry for a file page */ 830 /* Establish migration entry for a file page */
838 swp_entry_t entry; 831 swp_entry_t entry;
839 entry = make_migration_entry(page, pte_write(pteval)); 832 entry = make_migration_entry(page, pte_write(pteval));
840 set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); 833 set_pte_at(mm, address, pte, swp_entry_to_pte(entry));
841 } else 834 } else
842#endif
843 dec_mm_counter(mm, file_rss); 835 dec_mm_counter(mm, file_rss);
844 836
845 837
846 page_remove_rmap(page, vma); 838 page_remove_rmap(page);
847 page_cache_release(page); 839 page_cache_release(page);
848 840
849out_unmap: 841out_unmap:
@@ -958,7 +950,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
958 if (pte_dirty(pteval)) 950 if (pte_dirty(pteval))
959 set_page_dirty(page); 951 set_page_dirty(page);
960 952
961 page_remove_rmap(page, vma); 953 page_remove_rmap(page);
962 page_cache_release(page); 954 page_cache_release(page);
963 dec_mm_counter(mm, file_rss); 955 dec_mm_counter(mm, file_rss);
964 (*mapcount)--; 956 (*mapcount)--;
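The try_to_unmap_one() hunks above drop the #ifdef CONFIG_MIGRATION blocks in favour of testing PAGE_MIGRATION, a 0/1 compile-time constant, so the migration branch is still parsed and type-checked in every configuration and is simply discarded as dead code when migration is off. A minimal stand-alone sketch of that pattern; the function and messages below are illustrative, not taken from the patch:

#include <stdio.h>

/* stand-in for a Kconfig-derived 0/1 constant such as PAGE_MIGRATION */
#ifdef CONFIG_MIGRATION
#define PAGE_MIGRATION 1
#else
#define PAGE_MIGRATION 0
#endif

static void unmap_one(int migration)
{
	if (PAGE_MIGRATION && migration) {
		/* migration-only branch: dead code when PAGE_MIGRATION is 0,
		 * yet still compiled, so it cannot silently bit-rot */
		printf("install migration entry\n");
	} else {
		/* common branch taken by all configurations */
		printf("drop the mapping\n");
	}
}

int main(void)
{
	unmap_one(1);
	return 0;
}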
diff --git a/mm/shmem.c b/mm/shmem.c
index f1b0d4871f3a..5d0de96c9789 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -14,31 +14,39 @@
14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net> 14 * Copyright (c) 2004, Luke Kenneth Casson Leighton <lkcl@lkcl.net>
15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com> 15 * Copyright (c) 2004 Red Hat, Inc., James Morris <jmorris@redhat.com>
16 * 16 *
17 * tiny-shmem:
18 * Copyright (c) 2004, 2008 Matt Mackall <mpm@selenic.com>
19 *
17 * This file is released under the GPL. 20 * This file is released under the GPL.
18 */ 21 */
19 22
23#include <linux/fs.h>
24#include <linux/init.h>
25#include <linux/vfs.h>
26#include <linux/mount.h>
27#include <linux/file.h>
28#include <linux/mm.h>
29#include <linux/module.h>
30#include <linux/swap.h>
31
32static struct vfsmount *shm_mnt;
33
34#ifdef CONFIG_SHMEM
20/* 35/*
21 * This virtual memory filesystem is heavily based on the ramfs. It 36 * This virtual memory filesystem is heavily based on the ramfs. It
22 * extends ramfs by the ability to use swap and honor resource limits 37 * extends ramfs by the ability to use swap and honor resource limits
23 * which makes it a completely usable filesystem. 38 * which makes it a completely usable filesystem.
24 */ 39 */
25 40
26#include <linux/module.h>
27#include <linux/init.h>
28#include <linux/fs.h>
29#include <linux/xattr.h> 41#include <linux/xattr.h>
30#include <linux/exportfs.h> 42#include <linux/exportfs.h>
31#include <linux/generic_acl.h> 43#include <linux/generic_acl.h>
32#include <linux/mm.h>
33#include <linux/mman.h> 44#include <linux/mman.h>
34#include <linux/file.h>
35#include <linux/swap.h>
36#include <linux/pagemap.h> 45#include <linux/pagemap.h>
37#include <linux/string.h> 46#include <linux/string.h>
38#include <linux/slab.h> 47#include <linux/slab.h>
39#include <linux/backing-dev.h> 48#include <linux/backing-dev.h>
40#include <linux/shmem_fs.h> 49#include <linux/shmem_fs.h>
41#include <linux/mount.h>
42#include <linux/writeback.h> 50#include <linux/writeback.h>
43#include <linux/vfs.h> 51#include <linux/vfs.h>
44#include <linux/blkdev.h> 52#include <linux/blkdev.h>
@@ -920,7 +928,11 @@ found:
920 error = 1; 928 error = 1;
921 if (!inode) 929 if (!inode)
922 goto out; 930 goto out;
923 /* Precharge page using GFP_KERNEL while we can wait */ 931 /*
932 * Charge page using GFP_KERNEL while we can wait.
 933 * Charged back to the user (not to the caller) when swap accounting is used.
934 * add_to_page_cache() will be called with GFP_NOWAIT.
935 */
924 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL); 936 error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
925 if (error) 937 if (error)
926 goto out; 938 goto out;
@@ -1312,15 +1324,19 @@ repeat:
1312 } else { 1324 } else {
1313 shmem_swp_unmap(entry); 1325 shmem_swp_unmap(entry);
1314 spin_unlock(&info->lock); 1326 spin_unlock(&info->lock);
1315 unlock_page(swappage);
1316 page_cache_release(swappage);
1317 if (error == -ENOMEM) { 1327 if (error == -ENOMEM) {
1318 /* allow reclaim from this memory cgroup */ 1328 /* allow reclaim from this memory cgroup */
1319 error = mem_cgroup_shrink_usage(current->mm, 1329 error = mem_cgroup_shrink_usage(swappage,
1330 current->mm,
1320 gfp); 1331 gfp);
1321 if (error) 1332 if (error) {
1333 unlock_page(swappage);
1334 page_cache_release(swappage);
1322 goto failed; 1335 goto failed;
1336 }
1323 } 1337 }
1338 unlock_page(swappage);
1339 page_cache_release(swappage);
1324 goto repeat; 1340 goto repeat;
1325 } 1341 }
1326 } else if (sgp == SGP_READ && !filepage) { 1342 } else if (sgp == SGP_READ && !filepage) {
@@ -1371,7 +1387,7 @@ repeat:
1371 1387
1372 /* Precharge page while we can wait, compensate after */ 1388 /* Precharge page while we can wait, compensate after */
1373 error = mem_cgroup_cache_charge(filepage, current->mm, 1389 error = mem_cgroup_cache_charge(filepage, current->mm,
1374 gfp & ~__GFP_HIGHMEM); 1390 GFP_KERNEL);
1375 if (error) { 1391 if (error) {
1376 page_cache_release(filepage); 1392 page_cache_release(filepage);
1377 shmem_unacct_blocks(info->flags, 1); 1393 shmem_unacct_blocks(info->flags, 1);
@@ -1444,7 +1460,6 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
1444 if (error) 1460 if (error)
1445 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); 1461 return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
1446 1462
1447 mark_page_accessed(vmf->page);
1448 return ret | VM_FAULT_LOCKED; 1463 return ret | VM_FAULT_LOCKED;
1449} 1464}
1450 1465
@@ -2486,7 +2501,6 @@ static struct file_system_type tmpfs_fs_type = {
2486 .get_sb = shmem_get_sb, 2501 .get_sb = shmem_get_sb,
2487 .kill_sb = kill_litter_super, 2502 .kill_sb = kill_litter_super,
2488}; 2503};
2489static struct vfsmount *shm_mnt;
2490 2504
2491static int __init init_tmpfs(void) 2505static int __init init_tmpfs(void)
2492{ 2506{
@@ -2525,7 +2539,51 @@ out4:
2525 shm_mnt = ERR_PTR(error); 2539 shm_mnt = ERR_PTR(error);
2526 return error; 2540 return error;
2527} 2541}
2528module_init(init_tmpfs) 2542
2543#else /* !CONFIG_SHMEM */
2544
2545/*
2546 * tiny-shmem: simple shmemfs and tmpfs using ramfs code
2547 *
 2548 * This is intended for small systems where the benefits of the full
 2549 * shmem code (swap-backed and resource-limited) are outweighed by
 2550 * its complexity. On systems without swap this code should be
2551 * effectively equivalent, but much lighter weight.
2552 */
2553
2554#include <linux/ramfs.h>
2555
2556static struct file_system_type tmpfs_fs_type = {
2557 .name = "tmpfs",
2558 .get_sb = ramfs_get_sb,
2559 .kill_sb = kill_litter_super,
2560};
2561
2562static int __init init_tmpfs(void)
2563{
2564 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
2565
2566 shm_mnt = kern_mount(&tmpfs_fs_type);
2567 BUG_ON(IS_ERR(shm_mnt));
2568
2569 return 0;
2570}
2571
2572int shmem_unuse(swp_entry_t entry, struct page *page)
2573{
2574 return 0;
2575}
2576
2577#define shmem_file_operations ramfs_file_operations
2578#define shmem_vm_ops generic_file_vm_ops
2579#define shmem_get_inode ramfs_get_inode
2580#define shmem_acct_size(a, b) 0
2581#define shmem_unacct_size(a, b) do {} while (0)
2582#define SHMEM_MAX_BYTES LLONG_MAX
2583
2584#endif /* CONFIG_SHMEM */
2585
2586/* common code */
2529 2587
2530/** 2588/**
2531 * shmem_file_setup - get an unlinked file living in tmpfs 2589 * shmem_file_setup - get an unlinked file living in tmpfs
@@ -2569,12 +2627,20 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
2569 if (!inode) 2627 if (!inode)
2570 goto close_file; 2628 goto close_file;
2571 2629
2630#ifdef CONFIG_SHMEM
2572 SHMEM_I(inode)->flags = flags & VM_ACCOUNT; 2631 SHMEM_I(inode)->flags = flags & VM_ACCOUNT;
2632#endif
2573 d_instantiate(dentry, inode); 2633 d_instantiate(dentry, inode);
2574 inode->i_size = size; 2634 inode->i_size = size;
2575 inode->i_nlink = 0; /* It is unlinked */ 2635 inode->i_nlink = 0; /* It is unlinked */
2576 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, 2636 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
2577 &shmem_file_operations); 2637 &shmem_file_operations);
2638
2639#ifndef CONFIG_MMU
2640 error = ramfs_nommu_expand_for_mapping(inode, size);
2641 if (error)
2642 goto close_file;
2643#endif
2578 return file; 2644 return file;
2579 2645
2580close_file: 2646close_file:
@@ -2606,3 +2672,5 @@ int shmem_zero_setup(struct vm_area_struct *vma)
2606 vma->vm_ops = &shmem_vm_ops; 2672 vma->vm_ops = &shmem_vm_ops;
2607 return 0; 2673 return 0;
2608} 2674}
2675
2676module_init(init_tmpfs)
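With tiny-shmem folded into mm/shmem.c above, shmem_file_setup() and shmem_zero_setup() present a single interface whether CONFIG_SHMEM is set or the ramfs-backed fallback is built. A hedged usage sketch, assuming kernel context; the wrapper name and file name below are made up for illustration:

#include <linux/err.h>
#include <linux/fs.h>
#include <linux/mm.h>

/* hypothetical helper: create an unlinked tmpfs-backed file of one page;
 * behaves the same with full shmem and with the ramfs fallback */
static int example_create_shm_file(struct file **filp)
{
	struct file *file;

	file = shmem_file_setup("example-shm", PAGE_SIZE, 0);
	if (IS_ERR(file))
		return PTR_ERR(file);	/* e.g. -ENOSPC or -ENFILE */

	*filp = file;			/* caller drops it with fput() */
	return 0;
}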
diff --git a/mm/slub.c b/mm/slub.c
index 509e96f411fc..f657c88814ee 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2285,7 +2285,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
2285 * Add some empty padding so that we can catch 2285 * Add some empty padding so that we can catch
2286 * overwrites from earlier objects rather than let 2286 * overwrites from earlier objects rather than let
2287 * tracking information or the free pointer be 2287 * tracking information or the free pointer be
2288 * corrupted if an user writes before the start 2288 * corrupted if a user writes before the start
2289 * of the object. 2289 * of the object.
2290 */ 2290 */
2291 size += sizeof(void *); 2291 size += sizeof(void *);
diff --git a/mm/swap.c b/mm/swap.c
index b135ec90cdeb..8adb9feb61e1 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -151,6 +151,26 @@ void rotate_reclaimable_page(struct page *page)
151 } 151 }
152} 152}
153 153
154static void update_page_reclaim_stat(struct zone *zone, struct page *page,
155 int file, int rotated)
156{
157 struct zone_reclaim_stat *reclaim_stat = &zone->reclaim_stat;
158 struct zone_reclaim_stat *memcg_reclaim_stat;
159
160 memcg_reclaim_stat = mem_cgroup_get_reclaim_stat_from_page(page);
161
162 reclaim_stat->recent_scanned[file]++;
163 if (rotated)
164 reclaim_stat->recent_rotated[file]++;
165
166 if (!memcg_reclaim_stat)
167 return;
168
169 memcg_reclaim_stat->recent_scanned[file]++;
170 if (rotated)
171 memcg_reclaim_stat->recent_rotated[file]++;
172}
173
154/* 174/*
155 * FIXME: speed this up? 175 * FIXME: speed this up?
156 */ 176 */
@@ -168,10 +188,8 @@ void activate_page(struct page *page)
168 lru += LRU_ACTIVE; 188 lru += LRU_ACTIVE;
169 add_page_to_lru_list(zone, page, lru); 189 add_page_to_lru_list(zone, page, lru);
170 __count_vm_event(PGACTIVATE); 190 __count_vm_event(PGACTIVATE);
171 mem_cgroup_move_lists(page, lru);
172 191
173 zone->recent_rotated[!!file]++; 192 update_page_reclaim_stat(zone, page, !!file, 1);
174 zone->recent_scanned[!!file]++;
175 } 193 }
176 spin_unlock_irq(&zone->lru_lock); 194 spin_unlock_irq(&zone->lru_lock);
177} 195}
@@ -246,25 +264,6 @@ void add_page_to_unevictable_list(struct page *page)
246 spin_unlock_irq(&zone->lru_lock); 264 spin_unlock_irq(&zone->lru_lock);
247} 265}
248 266
249/**
250 * lru_cache_add_active_or_unevictable
251 * @page: the page to be added to LRU
252 * @vma: vma in which page is mapped for determining reclaimability
253 *
254 * place @page on active or unevictable LRU list, depending on
255 * page_evictable(). Note that if the page is not evictable,
256 * it goes directly back onto it's zone's unevictable list. It does
257 * NOT use a per cpu pagevec.
258 */
259void lru_cache_add_active_or_unevictable(struct page *page,
260 struct vm_area_struct *vma)
261{
262 if (page_evictable(page, vma))
263 lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page));
264 else
265 add_page_to_unevictable_list(page);
266}
267
268/* 267/*
269 * Drain pages out of the cpu's pagevecs. 268 * Drain pages out of the cpu's pagevecs.
270 * Either "cpu" is the current CPU, and preemption has already been 269 * Either "cpu" is the current CPU, and preemption has already been
@@ -398,28 +397,6 @@ void __pagevec_release(struct pagevec *pvec)
398EXPORT_SYMBOL(__pagevec_release); 397EXPORT_SYMBOL(__pagevec_release);
399 398
400/* 399/*
401 * pagevec_release() for pages which are known to not be on the LRU
402 *
403 * This function reinitialises the caller's pagevec.
404 */
405void __pagevec_release_nonlru(struct pagevec *pvec)
406{
407 int i;
408 struct pagevec pages_to_free;
409
410 pagevec_init(&pages_to_free, pvec->cold);
411 for (i = 0; i < pagevec_count(pvec); i++) {
412 struct page *page = pvec->pages[i];
413
414 VM_BUG_ON(PageLRU(page));
415 if (put_page_testzero(page))
416 pagevec_add(&pages_to_free, page);
417 }
418 pagevec_free(&pages_to_free);
419 pagevec_reinit(pvec);
420}
421
422/*
423 * Add the passed pages to the LRU, then drop the caller's refcount 400 * Add the passed pages to the LRU, then drop the caller's refcount
424 * on them. Reinitialises the caller's pagevec. 401 * on them. Reinitialises the caller's pagevec.
425 */ 402 */
@@ -427,12 +404,14 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
427{ 404{
428 int i; 405 int i;
429 struct zone *zone = NULL; 406 struct zone *zone = NULL;
407
430 VM_BUG_ON(is_unevictable_lru(lru)); 408 VM_BUG_ON(is_unevictable_lru(lru));
431 409
432 for (i = 0; i < pagevec_count(pvec); i++) { 410 for (i = 0; i < pagevec_count(pvec); i++) {
433 struct page *page = pvec->pages[i]; 411 struct page *page = pvec->pages[i];
434 struct zone *pagezone = page_zone(page); 412 struct zone *pagezone = page_zone(page);
435 int file; 413 int file;
414 int active;
436 415
437 if (pagezone != zone) { 416 if (pagezone != zone) {
438 if (zone) 417 if (zone)
@@ -444,12 +423,11 @@ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru)
444 VM_BUG_ON(PageUnevictable(page)); 423 VM_BUG_ON(PageUnevictable(page));
445 VM_BUG_ON(PageLRU(page)); 424 VM_BUG_ON(PageLRU(page));
446 SetPageLRU(page); 425 SetPageLRU(page);
426 active = is_active_lru(lru);
447 file = is_file_lru(lru); 427 file = is_file_lru(lru);
448 zone->recent_scanned[file]++; 428 if (active)
449 if (is_active_lru(lru)) {
450 SetPageActive(page); 429 SetPageActive(page);
451 zone->recent_rotated[file]++; 430 update_page_reclaim_stat(zone, page, file, active);
452 }
453 add_page_to_lru_list(zone, page, lru); 431 add_page_to_lru_list(zone, page, lru);
454 } 432 }
455 if (zone) 433 if (zone)
@@ -495,8 +473,7 @@ void pagevec_swap_free(struct pagevec *pvec)
495 struct page *page = pvec->pages[i]; 473 struct page *page = pvec->pages[i];
496 474
497 if (PageSwapCache(page) && trylock_page(page)) { 475 if (PageSwapCache(page) && trylock_page(page)) {
498 if (PageSwapCache(page)) 476 try_to_free_swap(page);
499 remove_exclusive_swap_page_ref(page);
500 unlock_page(page); 477 unlock_page(page);
501 } 478 }
502 } 479 }
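update_page_reclaim_stat() above mirrors every scanned/rotated event into both the zone's reclaim statistics and, when the page belongs to a memory cgroup, the cgroup's own copy. A hedged sketch of how such counters are typically consumed; the helper below is illustrative and not part of this patch:

#include <linux/mmzone.h>

/* illustrative only: a high rotated/scanned ratio means pages on this
 * list keep getting re-referenced, so reclaim should lean on it less */
static unsigned long recent_reuse_percent(struct zone_reclaim_stat *rs,
					  int file)
{
	unsigned long scanned = rs->recent_scanned[file] + 1; /* avoid /0 */
	unsigned long rotated = rs->recent_rotated[file];

	return rotated * 100 / scanned;
}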
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 3353c9029cef..3ecea98ecb45 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
17#include <linux/backing-dev.h> 17#include <linux/backing-dev.h>
18#include <linux/pagevec.h> 18#include <linux/pagevec.h>
19#include <linux/migrate.h> 19#include <linux/migrate.h>
20#include <linux/page_cgroup.h>
20 21
21#include <asm/pgtable.h> 22#include <asm/pgtable.h>
22 23
@@ -72,10 +73,10 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
72{ 73{
73 int error; 74 int error;
74 75
75 BUG_ON(!PageLocked(page)); 76 VM_BUG_ON(!PageLocked(page));
76 BUG_ON(PageSwapCache(page)); 77 VM_BUG_ON(PageSwapCache(page));
77 BUG_ON(PagePrivate(page)); 78 VM_BUG_ON(!PageSwapBacked(page));
78 BUG_ON(!PageSwapBacked(page)); 79
79 error = radix_tree_preload(gfp_mask); 80 error = radix_tree_preload(gfp_mask);
80 if (!error) { 81 if (!error) {
81 page_cache_get(page); 82 page_cache_get(page);
@@ -108,10 +109,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
108 */ 109 */
109void __delete_from_swap_cache(struct page *page) 110void __delete_from_swap_cache(struct page *page)
110{ 111{
111 BUG_ON(!PageLocked(page)); 112 swp_entry_t ent = {.val = page_private(page)};
112 BUG_ON(!PageSwapCache(page)); 113
113 BUG_ON(PageWriteback(page)); 114 VM_BUG_ON(!PageLocked(page));
114 BUG_ON(PagePrivate(page)); 115 VM_BUG_ON(!PageSwapCache(page));
116 VM_BUG_ON(PageWriteback(page));
115 117
116 radix_tree_delete(&swapper_space.page_tree, page_private(page)); 118 radix_tree_delete(&swapper_space.page_tree, page_private(page));
117 set_page_private(page, 0); 119 set_page_private(page, 0);
@@ -119,6 +121,7 @@ void __delete_from_swap_cache(struct page *page)
119 total_swapcache_pages--; 121 total_swapcache_pages--;
120 __dec_zone_page_state(page, NR_FILE_PAGES); 122 __dec_zone_page_state(page, NR_FILE_PAGES);
121 INC_CACHE_INFO(del_total); 123 INC_CACHE_INFO(del_total);
124 mem_cgroup_uncharge_swapcache(page, ent);
122} 125}
123 126
124/** 127/**
@@ -129,13 +132,13 @@ void __delete_from_swap_cache(struct page *page)
129 * Allocate swap space for the page and add the page to the 132 * Allocate swap space for the page and add the page to the
130 * swap cache. Caller needs to hold the page lock. 133 * swap cache. Caller needs to hold the page lock.
131 */ 134 */
132int add_to_swap(struct page * page, gfp_t gfp_mask) 135int add_to_swap(struct page *page)
133{ 136{
134 swp_entry_t entry; 137 swp_entry_t entry;
135 int err; 138 int err;
136 139
137 BUG_ON(!PageLocked(page)); 140 VM_BUG_ON(!PageLocked(page));
138 BUG_ON(!PageUptodate(page)); 141 VM_BUG_ON(!PageUptodate(page));
139 142
140 for (;;) { 143 for (;;) {
141 entry = get_swap_page(); 144 entry = get_swap_page();
@@ -154,7 +157,7 @@ int add_to_swap(struct page * page, gfp_t gfp_mask)
154 * Add it to the swap cache and mark it dirty 157 * Add it to the swap cache and mark it dirty
155 */ 158 */
156 err = add_to_swap_cache(page, entry, 159 err = add_to_swap_cache(page, entry,
157 gfp_mask|__GFP_NOMEMALLOC|__GFP_NOWARN); 160 __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
158 161
159 switch (err) { 162 switch (err) {
160 case 0: /* Success */ 163 case 0: /* Success */
@@ -196,14 +199,14 @@ void delete_from_swap_cache(struct page *page)
196 * If we are the only user, then try to free up the swap cache. 199 * If we are the only user, then try to free up the swap cache.
197 * 200 *
198 * Its ok to check for PageSwapCache without the page lock 201 * Its ok to check for PageSwapCache without the page lock
199 * here because we are going to recheck again inside 202 * here because we are going to recheck again inside
200 * exclusive_swap_page() _with_ the lock. 203 * try_to_free_swap() _with_ the lock.
201 * - Marcelo 204 * - Marcelo
202 */ 205 */
203static inline void free_swap_cache(struct page *page) 206static inline void free_swap_cache(struct page *page)
204{ 207{
205 if (PageSwapCache(page) && trylock_page(page)) { 208 if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) {
206 remove_exclusive_swap_page(page); 209 try_to_free_swap(page);
207 unlock_page(page); 210 unlock_page(page);
208 } 211 }
209} 212}
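The hunks above relax unconditional BUG_ON() checks to VM_BUG_ON(), which only generates code when CONFIG_DEBUG_VM is enabled. The conventional shape of that macro, shown purely to make the trade-off explicit:

#ifdef CONFIG_DEBUG_VM
#define VM_BUG_ON(cond)	BUG_ON(cond)		/* debug kernels: trap on a broken invariant */
#else
#define VM_BUG_ON(cond)	do { } while (0)	/* production kernels: no code emitted */
#endif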
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 54a9f87e5162..da422c47e2ee 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -16,6 +16,7 @@
16#include <linux/namei.h> 16#include <linux/namei.h>
17#include <linux/shm.h> 17#include <linux/shm.h>
18#include <linux/blkdev.h> 18#include <linux/blkdev.h>
19#include <linux/random.h>
19#include <linux/writeback.h> 20#include <linux/writeback.h>
20#include <linux/proc_fs.h> 21#include <linux/proc_fs.h>
21#include <linux/seq_file.h> 22#include <linux/seq_file.h>
@@ -32,9 +33,11 @@
32#include <asm/pgtable.h> 33#include <asm/pgtable.h>
33#include <asm/tlbflush.h> 34#include <asm/tlbflush.h>
34#include <linux/swapops.h> 35#include <linux/swapops.h>
36#include <linux/page_cgroup.h>
35 37
36static DEFINE_SPINLOCK(swap_lock); 38static DEFINE_SPINLOCK(swap_lock);
37static unsigned int nr_swapfiles; 39static unsigned int nr_swapfiles;
40long nr_swap_pages;
38long total_swap_pages; 41long total_swap_pages;
39static int swap_overflow; 42static int swap_overflow;
40static int least_priority; 43static int least_priority;
@@ -83,15 +86,96 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
83 up_read(&swap_unplug_sem); 86 up_read(&swap_unplug_sem);
84} 87}
85 88
89/*
 90 * swapon tells the device that all the old swap contents can be discarded,
91 * to allow the swap device to optimize its wear-levelling.
92 */
93static int discard_swap(struct swap_info_struct *si)
94{
95 struct swap_extent *se;
96 int err = 0;
97
98 list_for_each_entry(se, &si->extent_list, list) {
99 sector_t start_block = se->start_block << (PAGE_SHIFT - 9);
100 sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
101
102 if (se->start_page == 0) {
103 /* Do not discard the swap header page! */
104 start_block += 1 << (PAGE_SHIFT - 9);
105 nr_blocks -= 1 << (PAGE_SHIFT - 9);
106 if (!nr_blocks)
107 continue;
108 }
109
110 err = blkdev_issue_discard(si->bdev, start_block,
111 nr_blocks, GFP_KERNEL);
112 if (err)
113 break;
114
115 cond_resched();
116 }
117 return err; /* That will often be -EOPNOTSUPP */
118}
119
120/*
 121 * swap allocation tells the device that a cluster of swap can now be discarded,
122 * to allow the swap device to optimize its wear-levelling.
123 */
124static void discard_swap_cluster(struct swap_info_struct *si,
125 pgoff_t start_page, pgoff_t nr_pages)
126{
127 struct swap_extent *se = si->curr_swap_extent;
128 int found_extent = 0;
129
130 while (nr_pages) {
131 struct list_head *lh;
132
133 if (se->start_page <= start_page &&
134 start_page < se->start_page + se->nr_pages) {
135 pgoff_t offset = start_page - se->start_page;
136 sector_t start_block = se->start_block + offset;
137 sector_t nr_blocks = se->nr_pages - offset;
138
139 if (nr_blocks > nr_pages)
140 nr_blocks = nr_pages;
141 start_page += nr_blocks;
142 nr_pages -= nr_blocks;
143
144 if (!found_extent++)
145 si->curr_swap_extent = se;
146
147 start_block <<= PAGE_SHIFT - 9;
148 nr_blocks <<= PAGE_SHIFT - 9;
149 if (blkdev_issue_discard(si->bdev, start_block,
150 nr_blocks, GFP_NOIO))
151 break;
152 }
153
154 lh = se->list.next;
155 if (lh == &si->extent_list)
156 lh = lh->next;
157 se = list_entry(lh, struct swap_extent, list);
158 }
159}
160
161static int wait_for_discard(void *word)
162{
163 schedule();
164 return 0;
165}
166
86#define SWAPFILE_CLUSTER 256 167#define SWAPFILE_CLUSTER 256
87#define LATENCY_LIMIT 256 168#define LATENCY_LIMIT 256
88 169
89static inline unsigned long scan_swap_map(struct swap_info_struct *si) 170static inline unsigned long scan_swap_map(struct swap_info_struct *si)
90{ 171{
91 unsigned long offset, last_in_cluster; 172 unsigned long offset;
173 unsigned long scan_base;
174 unsigned long last_in_cluster = 0;
92 int latency_ration = LATENCY_LIMIT; 175 int latency_ration = LATENCY_LIMIT;
176 int found_free_cluster = 0;
93 177
94 /* 178 /*
95 * We try to cluster swap pages by allocating them sequentially 179 * We try to cluster swap pages by allocating them sequentially
96 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this 180 * in swap. Once we've allocated SWAPFILE_CLUSTER pages this
97 * way, however, we resort to first-free allocation, starting 181 * way, however, we resort to first-free allocation, starting
@@ -99,16 +183,42 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
99 * all over the entire swap partition, so that we reduce 183 * all over the entire swap partition, so that we reduce
100 * overall disk seek times between swap pages. -- sct 184 * overall disk seek times between swap pages. -- sct
101 * But we do now try to find an empty cluster. -Andrea 185 * But we do now try to find an empty cluster. -Andrea
186 * And we let swap pages go all over an SSD partition. Hugh
102 */ 187 */
103 188
104 si->flags += SWP_SCANNING; 189 si->flags += SWP_SCANNING;
105 if (unlikely(!si->cluster_nr)) { 190 scan_base = offset = si->cluster_next;
106 si->cluster_nr = SWAPFILE_CLUSTER - 1; 191
107 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) 192 if (unlikely(!si->cluster_nr--)) {
108 goto lowest; 193 if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
194 si->cluster_nr = SWAPFILE_CLUSTER - 1;
195 goto checks;
196 }
197 if (si->flags & SWP_DISCARDABLE) {
198 /*
199 * Start range check on racing allocations, in case
200 * they overlap the cluster we eventually decide on
201 * (we scan without swap_lock to allow preemption).
202 * It's hardly conceivable that cluster_nr could be
203 * wrapped during our scan, but don't depend on it.
204 */
205 if (si->lowest_alloc)
206 goto checks;
207 si->lowest_alloc = si->max;
208 si->highest_alloc = 0;
209 }
109 spin_unlock(&swap_lock); 210 spin_unlock(&swap_lock);
110 211
111 offset = si->lowest_bit; 212 /*
213 * If seek is expensive, start searching for new cluster from
214 * start of partition, to minimize the span of allocated swap.
215 * But if seek is cheap, search from our current position, so
216 * that swap is allocated from all over the partition: if the
217 * Flash Translation Layer only remaps within limited zones,
218 * we don't want to wear out the first zone too quickly.
219 */
220 if (!(si->flags & SWP_SOLIDSTATE))
221 scan_base = offset = si->lowest_bit;
112 last_in_cluster = offset + SWAPFILE_CLUSTER - 1; 222 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
113 223
114 /* Locate the first empty (unaligned) cluster */ 224 /* Locate the first empty (unaligned) cluster */
@@ -117,43 +227,124 @@ static inline unsigned long scan_swap_map(struct swap_info_struct *si)
117 last_in_cluster = offset + SWAPFILE_CLUSTER; 227 last_in_cluster = offset + SWAPFILE_CLUSTER;
118 else if (offset == last_in_cluster) { 228 else if (offset == last_in_cluster) {
119 spin_lock(&swap_lock); 229 spin_lock(&swap_lock);
120 si->cluster_next = offset-SWAPFILE_CLUSTER+1; 230 offset -= SWAPFILE_CLUSTER - 1;
121 goto cluster; 231 si->cluster_next = offset;
232 si->cluster_nr = SWAPFILE_CLUSTER - 1;
233 found_free_cluster = 1;
234 goto checks;
122 } 235 }
123 if (unlikely(--latency_ration < 0)) { 236 if (unlikely(--latency_ration < 0)) {
124 cond_resched(); 237 cond_resched();
125 latency_ration = LATENCY_LIMIT; 238 latency_ration = LATENCY_LIMIT;
126 } 239 }
127 } 240 }
241
242 offset = si->lowest_bit;
243 last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
244
245 /* Locate the first empty (unaligned) cluster */
246 for (; last_in_cluster < scan_base; offset++) {
247 if (si->swap_map[offset])
248 last_in_cluster = offset + SWAPFILE_CLUSTER;
249 else if (offset == last_in_cluster) {
250 spin_lock(&swap_lock);
251 offset -= SWAPFILE_CLUSTER - 1;
252 si->cluster_next = offset;
253 si->cluster_nr = SWAPFILE_CLUSTER - 1;
254 found_free_cluster = 1;
255 goto checks;
256 }
257 if (unlikely(--latency_ration < 0)) {
258 cond_resched();
259 latency_ration = LATENCY_LIMIT;
260 }
261 }
262
263 offset = scan_base;
128 spin_lock(&swap_lock); 264 spin_lock(&swap_lock);
129 goto lowest; 265 si->cluster_nr = SWAPFILE_CLUSTER - 1;
266 si->lowest_alloc = 0;
130 } 267 }
131 268
132 si->cluster_nr--; 269checks:
133cluster: 270 if (!(si->flags & SWP_WRITEOK))
134 offset = si->cluster_next;
135 if (offset > si->highest_bit)
136lowest: offset = si->lowest_bit;
137checks: if (!(si->flags & SWP_WRITEOK))
138 goto no_page; 271 goto no_page;
139 if (!si->highest_bit) 272 if (!si->highest_bit)
140 goto no_page; 273 goto no_page;
141 if (!si->swap_map[offset]) { 274 if (offset > si->highest_bit)
142 if (offset == si->lowest_bit) 275 scan_base = offset = si->lowest_bit;
143 si->lowest_bit++; 276 if (si->swap_map[offset])
144 if (offset == si->highest_bit) 277 goto scan;
145 si->highest_bit--; 278
146 si->inuse_pages++; 279 if (offset == si->lowest_bit)
147 if (si->inuse_pages == si->pages) { 280 si->lowest_bit++;
148 si->lowest_bit = si->max; 281 if (offset == si->highest_bit)
149 si->highest_bit = 0; 282 si->highest_bit--;
283 si->inuse_pages++;
284 if (si->inuse_pages == si->pages) {
285 si->lowest_bit = si->max;
286 si->highest_bit = 0;
287 }
288 si->swap_map[offset] = 1;
289 si->cluster_next = offset + 1;
290 si->flags -= SWP_SCANNING;
291
292 if (si->lowest_alloc) {
293 /*
294 * Only set when SWP_DISCARDABLE, and there's a scan
295 * for a free cluster in progress or just completed.
296 */
297 if (found_free_cluster) {
298 /*
299 * To optimize wear-levelling, discard the
300 * old data of the cluster, taking care not to
301 * discard any of its pages that have already
302 * been allocated by racing tasks (offset has
303 * already stepped over any at the beginning).
304 */
305 if (offset < si->highest_alloc &&
306 si->lowest_alloc <= last_in_cluster)
307 last_in_cluster = si->lowest_alloc - 1;
308 si->flags |= SWP_DISCARDING;
309 spin_unlock(&swap_lock);
310
311 if (offset < last_in_cluster)
312 discard_swap_cluster(si, offset,
313 last_in_cluster - offset + 1);
314
315 spin_lock(&swap_lock);
316 si->lowest_alloc = 0;
317 si->flags &= ~SWP_DISCARDING;
318
319 smp_mb(); /* wake_up_bit advises this */
320 wake_up_bit(&si->flags, ilog2(SWP_DISCARDING));
321
322 } else if (si->flags & SWP_DISCARDING) {
323 /*
324 * Delay using pages allocated by racing tasks
325 * until the whole discard has been issued. We
326 * could defer that delay until swap_writepage,
327 * but it's easier to keep this self-contained.
328 */
329 spin_unlock(&swap_lock);
330 wait_on_bit(&si->flags, ilog2(SWP_DISCARDING),
331 wait_for_discard, TASK_UNINTERRUPTIBLE);
332 spin_lock(&swap_lock);
333 } else {
334 /*
335 * Note pages allocated by racing tasks while
336 * scan for a free cluster is in progress, so
337 * that its final discard can exclude them.
338 */
339 if (offset < si->lowest_alloc)
340 si->lowest_alloc = offset;
341 if (offset > si->highest_alloc)
342 si->highest_alloc = offset;
150 } 343 }
151 si->swap_map[offset] = 1;
152 si->cluster_next = offset + 1;
153 si->flags -= SWP_SCANNING;
154 return offset;
155 } 344 }
345 return offset;
156 346
347scan:
157 spin_unlock(&swap_lock); 348 spin_unlock(&swap_lock);
158 while (++offset <= si->highest_bit) { 349 while (++offset <= si->highest_bit) {
159 if (!si->swap_map[offset]) { 350 if (!si->swap_map[offset]) {
@@ -165,8 +356,18 @@ checks: if (!(si->flags & SWP_WRITEOK))
165 latency_ration = LATENCY_LIMIT; 356 latency_ration = LATENCY_LIMIT;
166 } 357 }
167 } 358 }
359 offset = si->lowest_bit;
360 while (++offset < scan_base) {
361 if (!si->swap_map[offset]) {
362 spin_lock(&swap_lock);
363 goto checks;
364 }
365 if (unlikely(--latency_ration < 0)) {
366 cond_resched();
367 latency_ration = LATENCY_LIMIT;
368 }
369 }
168 spin_lock(&swap_lock); 370 spin_lock(&swap_lock);
169 goto lowest;
170 371
171no_page: 372no_page:
172 si->flags -= SWP_SCANNING; 373 si->flags -= SWP_SCANNING;
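discard_swap() and discard_swap_cluster() above hand page ranges to blkdev_issue_discard(), which works in 512-byte sectors. A minimal sketch of the page-to-sector conversion they rely on; the helper name is illustrative:

#include <linux/blkdev.h>
#include <linux/swap.h>

/* illustrative: discard a run of swap pages on si's backing device */
static int discard_page_range(struct swap_info_struct *si,
			      pgoff_t start_page, pgoff_t nr_pages)
{
	/* PAGE_SHIFT - 9 converts pages to 512-byte sectors */
	sector_t start_block = (sector_t)start_page << (PAGE_SHIFT - 9);
	sector_t nr_blocks = (sector_t)nr_pages << (PAGE_SHIFT - 9);

	/* GFP_KERNEL is fine at swapon time; the hot path uses GFP_NOIO */
	return blkdev_issue_discard(si->bdev, start_block, nr_blocks,
				    GFP_KERNEL);
}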
@@ -268,10 +469,11 @@ bad_nofile:
268 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); 469 printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val);
269out: 470out:
270 return NULL; 471 return NULL;
271} 472}
272 473
273static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) 474static int swap_entry_free(struct swap_info_struct *p, swp_entry_t ent)
274{ 475{
476 unsigned long offset = swp_offset(ent);
275 int count = p->swap_map[offset]; 477 int count = p->swap_map[offset];
276 478
277 if (count < SWAP_MAP_MAX) { 479 if (count < SWAP_MAP_MAX) {
@@ -286,6 +488,7 @@ static int swap_entry_free(struct swap_info_struct *p, unsigned long offset)
286 swap_list.next = p - swap_info; 488 swap_list.next = p - swap_info;
287 nr_swap_pages++; 489 nr_swap_pages++;
288 p->inuse_pages--; 490 p->inuse_pages--;
491 mem_cgroup_uncharge_swap(ent);
289 } 492 }
290 } 493 }
291 return count; 494 return count;
@@ -301,7 +504,7 @@ void swap_free(swp_entry_t entry)
301 504
302 p = swap_info_get(entry); 505 p = swap_info_get(entry);
303 if (p) { 506 if (p) {
304 swap_entry_free(p, swp_offset(entry)); 507 swap_entry_free(p, entry);
305 spin_unlock(&swap_lock); 508 spin_unlock(&swap_lock);
306 } 509 }
307} 510}
@@ -326,101 +529,62 @@ static inline int page_swapcount(struct page *page)
326} 529}
327 530
328/* 531/*
329 * We can use this swap cache entry directly 532 * We can write to an anon page without COW if there are no other references
330 * if there are no other references to it. 533 * to it. And as a side-effect, free up its swap: because the old content
534 * on disk will never be read, and seeking back there to write new content
535 * later would only waste time away from clustering.
331 */ 536 */
332int can_share_swap_page(struct page *page) 537int reuse_swap_page(struct page *page)
333{ 538{
334 int count; 539 int count;
335 540
336 BUG_ON(!PageLocked(page)); 541 VM_BUG_ON(!PageLocked(page));
337 count = page_mapcount(page); 542 count = page_mapcount(page);
338 if (count <= 1 && PageSwapCache(page)) 543 if (count <= 1 && PageSwapCache(page)) {
339 count += page_swapcount(page); 544 count += page_swapcount(page);
545 if (count == 1 && !PageWriteback(page)) {
546 delete_from_swap_cache(page);
547 SetPageDirty(page);
548 }
549 }
340 return count == 1; 550 return count == 1;
341} 551}
342 552
343/* 553/*
344 * Work out if there are any other processes sharing this 554 * If swap is getting full, or if there are no more mappings of this page,
345 * swap cache page. Free it if you can. Return success. 555 * then try_to_free_swap is called to free its swap space.
346 */ 556 */
347static int remove_exclusive_swap_page_count(struct page *page, int count) 557int try_to_free_swap(struct page *page)
348{ 558{
349 int retval; 559 VM_BUG_ON(!PageLocked(page));
350 struct swap_info_struct * p;
351 swp_entry_t entry;
352
353 BUG_ON(PagePrivate(page));
354 BUG_ON(!PageLocked(page));
355 560
356 if (!PageSwapCache(page)) 561 if (!PageSwapCache(page))
357 return 0; 562 return 0;
358 if (PageWriteback(page)) 563 if (PageWriteback(page))
359 return 0; 564 return 0;
360 if (page_count(page) != count) /* us + cache + ptes */ 565 if (page_swapcount(page))
361 return 0; 566 return 0;
362 567
363 entry.val = page_private(page); 568 delete_from_swap_cache(page);
364 p = swap_info_get(entry); 569 SetPageDirty(page);
365 if (!p) 570 return 1;
366 return 0;
367
368 /* Is the only swap cache user the cache itself? */
369 retval = 0;
370 if (p->swap_map[swp_offset(entry)] == 1) {
371 /* Recheck the page count with the swapcache lock held.. */
372 spin_lock_irq(&swapper_space.tree_lock);
373 if ((page_count(page) == count) && !PageWriteback(page)) {
374 __delete_from_swap_cache(page);
375 SetPageDirty(page);
376 retval = 1;
377 }
378 spin_unlock_irq(&swapper_space.tree_lock);
379 }
380 spin_unlock(&swap_lock);
381
382 if (retval) {
383 swap_free(entry);
384 page_cache_release(page);
385 }
386
387 return retval;
388}
389
390/*
391 * Most of the time the page should have two references: one for the
392 * process and one for the swap cache.
393 */
394int remove_exclusive_swap_page(struct page *page)
395{
396 return remove_exclusive_swap_page_count(page, 2);
397}
398
399/*
400 * The pageout code holds an extra reference to the page. That raises
401 * the reference count to test for to 2 for a page that is only in the
402 * swap cache plus 1 for each process that maps the page.
403 */
404int remove_exclusive_swap_page_ref(struct page *page)
405{
406 return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page));
407} 571}
408 572
409/* 573/*
410 * Free the swap entry like above, but also try to 574 * Free the swap entry like above, but also try to
411 * free the page cache entry if it is the last user. 575 * free the page cache entry if it is the last user.
412 */ 576 */
413void free_swap_and_cache(swp_entry_t entry) 577int free_swap_and_cache(swp_entry_t entry)
414{ 578{
415 struct swap_info_struct * p; 579 struct swap_info_struct *p;
416 struct page *page = NULL; 580 struct page *page = NULL;
417 581
418 if (is_migration_entry(entry)) 582 if (is_migration_entry(entry))
419 return; 583 return 1;
420 584
421 p = swap_info_get(entry); 585 p = swap_info_get(entry);
422 if (p) { 586 if (p) {
423 if (swap_entry_free(p, swp_offset(entry)) == 1) { 587 if (swap_entry_free(p, entry) == 1) {
424 page = find_get_page(&swapper_space, entry.val); 588 page = find_get_page(&swapper_space, entry.val);
425 if (page && !trylock_page(page)) { 589 if (page && !trylock_page(page)) {
426 page_cache_release(page); 590 page_cache_release(page);
@@ -430,20 +594,19 @@ void free_swap_and_cache(swp_entry_t entry)
430 spin_unlock(&swap_lock); 594 spin_unlock(&swap_lock);
431 } 595 }
432 if (page) { 596 if (page) {
433 int one_user; 597 /*
434 598 * Not mapped elsewhere, or swap space full? Free it!
435 BUG_ON(PagePrivate(page)); 599 * Also recheck PageSwapCache now page is locked (above).
436 one_user = (page_count(page) == 2); 600 */
437 /* Only cache user (+us), or swap space full? Free it! */
438 /* Also recheck PageSwapCache after page is locked (above) */
439 if (PageSwapCache(page) && !PageWriteback(page) && 601 if (PageSwapCache(page) && !PageWriteback(page) &&
440 (one_user || vm_swap_full())) { 602 (!page_mapped(page) || vm_swap_full())) {
441 delete_from_swap_cache(page); 603 delete_from_swap_cache(page);
442 SetPageDirty(page); 604 SetPageDirty(page);
443 } 605 }
444 unlock_page(page); 606 unlock_page(page);
445 page_cache_release(page); 607 page_cache_release(page);
446 } 608 }
609 return p != NULL;
447} 610}
448 611
449#ifdef CONFIG_HIBERNATION 612#ifdef CONFIG_HIBERNATION
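reuse_swap_page() above replaces can_share_swap_page() and now also drops the stale swap copy as a side effect, while try_to_free_swap() replaces the remove_exclusive_swap_page*() family. A hedged usage sketch modelled on a write-fault path; the surrounding logic is simplified and not code from this patch:

/* reuse_swap_page() requires the page lock, hence the trylock here */
if (PageAnon(page) && trylock_page(page)) {
	int reuse = reuse_swap_page(page);	/* frees unneeded swap too */

	unlock_page(page);
	if (reuse) {
		/* sole owner: make the pte writable in place, no COW copy */
	}
}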
@@ -530,17 +693,18 @@ unsigned int count_swap_pages(int type, int free)
530static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, 693static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
531 unsigned long addr, swp_entry_t entry, struct page *page) 694 unsigned long addr, swp_entry_t entry, struct page *page)
532{ 695{
696 struct mem_cgroup *ptr = NULL;
533 spinlock_t *ptl; 697 spinlock_t *ptl;
534 pte_t *pte; 698 pte_t *pte;
535 int ret = 1; 699 int ret = 1;
536 700
537 if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) 701 if (mem_cgroup_try_charge_swapin(vma->vm_mm, page, GFP_KERNEL, &ptr))
538 ret = -ENOMEM; 702 ret = -ENOMEM;
539 703
540 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); 704 pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
541 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { 705 if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) {
542 if (ret > 0) 706 if (ret > 0)
543 mem_cgroup_uncharge_page(page); 707 mem_cgroup_cancel_charge_swapin(ptr);
544 ret = 0; 708 ret = 0;
545 goto out; 709 goto out;
546 } 710 }
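unuse_pte() above switches to the two-phase memcg charge API: a charge is reserved before the pte lock is taken and cancelled if the pte no longer matches the swap entry; the matching commit call appears in the next hunk. A condensed sketch of the protocol, assuming unuse_pte()'s locals and with error handling trimmed:

struct mem_cgroup *ptr = NULL;

if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr))
	return -ENOMEM;				/* nothing reserved yet */

pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*pte, swp_entry_to_pte(entry))) {
	mem_cgroup_cancel_charge_swapin(ptr);	/* race: undo the reservation */
	ret = 0;
} else {
	/* ... install the pte and the anon rmap ... */
	mem_cgroup_commit_charge_swapin(page, ptr);	/* make it permanent */
	ret = 1;
}
pte_unmap_unlock(pte, ptl);
return ret;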
@@ -550,6 +714,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
550 set_pte_at(vma->vm_mm, addr, pte, 714 set_pte_at(vma->vm_mm, addr, pte,
551 pte_mkold(mk_pte(page, vma->vm_page_prot))); 715 pte_mkold(mk_pte(page, vma->vm_page_prot)));
552 page_add_anon_rmap(page, vma, addr); 716 page_add_anon_rmap(page, vma, addr);
717 mem_cgroup_commit_charge_swapin(page, ptr);
553 swap_free(entry); 718 swap_free(entry);
554 /* 719 /*
555 * Move the page to the active list so it is not 720 * Move the page to the active list so it is not
@@ -776,10 +941,10 @@ static int try_to_unuse(unsigned int type)
776 break; 941 break;
777 } 942 }
778 943
779 /* 944 /*
780 * Get a page for the entry, using the existing swap 945 * Get a page for the entry, using the existing swap
781 * cache page if there is one. Otherwise, get a clean 946 * cache page if there is one. Otherwise, get a clean
782 * page and read the swap into it. 947 * page and read the swap into it.
783 */ 948 */
784 swap_map = &si->swap_map[i]; 949 swap_map = &si->swap_map[i];
785 entry = swp_entry(type, i); 950 entry = swp_entry(type, i);
@@ -930,7 +1095,16 @@ static int try_to_unuse(unsigned int type)
930 lock_page(page); 1095 lock_page(page);
931 wait_on_page_writeback(page); 1096 wait_on_page_writeback(page);
932 } 1097 }
933 if (PageSwapCache(page)) 1098
1099 /*
1100 * It is conceivable that a racing task removed this page from
1101 * swap cache just before we acquired the page lock at the top,
1102 * or while we dropped it in unuse_mm(). The page might even
1103 * be back in swap cache on another swap area: that we must not
1104 * delete, since it may not have been written out to swap yet.
1105 */
1106 if (PageSwapCache(page) &&
1107 likely(page_private(page) == entry.val))
934 delete_from_swap_cache(page); 1108 delete_from_swap_cache(page);
935 1109
936 /* 1110 /*
@@ -1203,26 +1377,6 @@ out:
1203 return ret; 1377 return ret;
1204} 1378}
1205 1379
1206#if 0 /* We don't need this yet */
1207#include <linux/backing-dev.h>
1208int page_queue_congested(struct page *page)
1209{
1210 struct backing_dev_info *bdi;
1211
1212 BUG_ON(!PageLocked(page)); /* It pins the swap_info_struct */
1213
1214 if (PageSwapCache(page)) {
1215 swp_entry_t entry = { .val = page_private(page) };
1216 struct swap_info_struct *sis;
1217
1218 sis = get_swap_info_struct(swp_type(entry));
1219 bdi = sis->bdev->bd_inode->i_mapping->backing_dev_info;
1220 } else
1221 bdi = page->mapping->backing_dev_info;
1222 return bdi_write_congested(bdi);
1223}
1224#endif
1225
1226asmlinkage long sys_swapoff(const char __user * specialfile) 1380asmlinkage long sys_swapoff(const char __user * specialfile)
1227{ 1381{
1228 struct swap_info_struct * p = NULL; 1382 struct swap_info_struct * p = NULL;
@@ -1233,7 +1387,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1233 char * pathname; 1387 char * pathname;
1234 int i, type, prev; 1388 int i, type, prev;
1235 int err; 1389 int err;
1236 1390
1237 if (!capable(CAP_SYS_ADMIN)) 1391 if (!capable(CAP_SYS_ADMIN))
1238 return -EPERM; 1392 return -EPERM;
1239 1393
@@ -1253,7 +1407,7 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1253 spin_lock(&swap_lock); 1407 spin_lock(&swap_lock);
1254 for (type = swap_list.head; type >= 0; type = swap_info[type].next) { 1408 for (type = swap_list.head; type >= 0; type = swap_info[type].next) {
1255 p = swap_info + type; 1409 p = swap_info + type;
1256 if ((p->flags & SWP_ACTIVE) == SWP_ACTIVE) { 1410 if (p->flags & SWP_WRITEOK) {
1257 if (p->swap_file->f_mapping == mapping) 1411 if (p->swap_file->f_mapping == mapping)
1258 break; 1412 break;
1259 } 1413 }
@@ -1343,6 +1497,9 @@ asmlinkage long sys_swapoff(const char __user * specialfile)
1343 spin_unlock(&swap_lock); 1497 spin_unlock(&swap_lock);
1344 mutex_unlock(&swapon_mutex); 1498 mutex_unlock(&swapon_mutex);
1345 vfree(swap_map); 1499 vfree(swap_map);
 1500 /* Destroy swap account information */
1501 swap_cgroup_swapoff(type);
1502
1346 inode = mapping->host; 1503 inode = mapping->host;
1347 if (S_ISBLK(inode->i_mode)) { 1504 if (S_ISBLK(inode->i_mode)) {
1348 struct block_device *bdev = I_BDEV(inode); 1505 struct block_device *bdev = I_BDEV(inode);
@@ -1426,12 +1583,12 @@ static int swap_show(struct seq_file *swap, void *v)
1426 file = ptr->swap_file; 1583 file = ptr->swap_file;
1427 len = seq_path(swap, &file->f_path, " \t\n\\"); 1584 len = seq_path(swap, &file->f_path, " \t\n\\");
1428 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", 1585 seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
1429 len < 40 ? 40 - len : 1, " ", 1586 len < 40 ? 40 - len : 1, " ",
1430 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? 1587 S_ISBLK(file->f_path.dentry->d_inode->i_mode) ?
1431 "partition" : "file\t", 1588 "partition" : "file\t",
1432 ptr->pages << (PAGE_SHIFT - 10), 1589 ptr->pages << (PAGE_SHIFT - 10),
1433 ptr->inuse_pages << (PAGE_SHIFT - 10), 1590 ptr->inuse_pages << (PAGE_SHIFT - 10),
1434 ptr->prio); 1591 ptr->prio);
1435 return 0; 1592 return 0;
1436} 1593}
1437 1594
@@ -1487,12 +1644,11 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1487 int i, prev; 1644 int i, prev;
1488 int error; 1645 int error;
1489 union swap_header *swap_header = NULL; 1646 union swap_header *swap_header = NULL;
1490 int swap_header_version;
1491 unsigned int nr_good_pages = 0; 1647 unsigned int nr_good_pages = 0;
1492 int nr_extents = 0; 1648 int nr_extents = 0;
1493 sector_t span; 1649 sector_t span;
1494 unsigned long maxpages = 1; 1650 unsigned long maxpages = 1;
1495 int swapfilesize; 1651 unsigned long swapfilepages;
1496 unsigned short *swap_map = NULL; 1652 unsigned short *swap_map = NULL;
1497 struct page *page = NULL; 1653 struct page *page = NULL;
1498 struct inode *inode = NULL; 1654 struct inode *inode = NULL;
@@ -1570,7 +1726,7 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1570 goto bad_swap; 1726 goto bad_swap;
1571 } 1727 }
1572 1728
1573 swapfilesize = i_size_read(inode) >> PAGE_SHIFT; 1729 swapfilepages = i_size_read(inode) >> PAGE_SHIFT;
1574 1730
1575 /* 1731 /*
1576 * Read the swap header. 1732 * Read the swap header.
@@ -1584,102 +1740,92 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1584 error = PTR_ERR(page); 1740 error = PTR_ERR(page);
1585 goto bad_swap; 1741 goto bad_swap;
1586 } 1742 }
1587 kmap(page); 1743 swap_header = kmap(page);
1588 swap_header = page_address(page);
1589 1744
1590 if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) 1745 if (memcmp("SWAPSPACE2", swap_header->magic.magic, 10)) {
1591 swap_header_version = 1;
1592 else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10))
1593 swap_header_version = 2;
1594 else {
1595 printk(KERN_ERR "Unable to find swap-space signature\n"); 1746 printk(KERN_ERR "Unable to find swap-space signature\n");
1596 error = -EINVAL; 1747 error = -EINVAL;
1597 goto bad_swap; 1748 goto bad_swap;
1598 } 1749 }
1599 1750
1600 switch (swap_header_version) { 1751 /* swap partition endianess hack... */
1601 case 1: 1752 if (swab32(swap_header->info.version) == 1) {
1602 printk(KERN_ERR "version 0 swap is no longer supported. " 1753 swab32s(&swap_header->info.version);
1603 "Use mkswap -v1 %s\n", name); 1754 swab32s(&swap_header->info.last_page);
1755 swab32s(&swap_header->info.nr_badpages);
1756 for (i = 0; i < swap_header->info.nr_badpages; i++)
1757 swab32s(&swap_header->info.badpages[i]);
1758 }
1759 /* Check the swap header's sub-version */
1760 if (swap_header->info.version != 1) {
1761 printk(KERN_WARNING
1762 "Unable to handle swap header version %d\n",
1763 swap_header->info.version);
1604 error = -EINVAL; 1764 error = -EINVAL;
1605 goto bad_swap; 1765 goto bad_swap;
1606 case 2: 1766 }
1607 /* swap partition endianess hack... */
1608 if (swab32(swap_header->info.version) == 1) {
1609 swab32s(&swap_header->info.version);
1610 swab32s(&swap_header->info.last_page);
1611 swab32s(&swap_header->info.nr_badpages);
1612 for (i = 0; i < swap_header->info.nr_badpages; i++)
1613 swab32s(&swap_header->info.badpages[i]);
1614 }
1615 /* Check the swap header's sub-version and the size of
1616 the swap file and bad block lists */
1617 if (swap_header->info.version != 1) {
1618 printk(KERN_WARNING
1619 "Unable to handle swap header version %d\n",
1620 swap_header->info.version);
1621 error = -EINVAL;
1622 goto bad_swap;
1623 }
1624 1767
1625 p->lowest_bit = 1; 1768 p->lowest_bit = 1;
1626 p->cluster_next = 1; 1769 p->cluster_next = 1;
1627 1770
1628 /* 1771 /*
1629 * Find out how many pages are allowed for a single swap 1772 * Find out how many pages are allowed for a single swap
1630 * device. There are two limiting factors: 1) the number of 1773 * device. There are two limiting factors: 1) the number of
1631 * bits for the swap offset in the swp_entry_t type and 1774 * bits for the swap offset in the swp_entry_t type and
 1632 * 2) the number of bits in a swap pte as defined by 1775 * 2) the number of bits in a swap pte as defined by
1633 * the different architectures. In order to find the 1776 * the different architectures. In order to find the
1634 * largest possible bit mask a swap entry with swap type 0 1777 * largest possible bit mask a swap entry with swap type 0
1635 * and swap offset ~0UL is created, encoded to a swap pte, 1778 * and swap offset ~0UL is created, encoded to a swap pte,
1636 * decoded to a swp_entry_t again and finally the swap 1779 * decoded to a swp_entry_t again and finally the swap
1637 * offset is extracted. This will mask all the bits from 1780 * offset is extracted. This will mask all the bits from
1638 * the initial ~0UL mask that can't be encoded in either 1781 * the initial ~0UL mask that can't be encoded in either
1639 * the swp_entry_t or the architecture definition of a 1782 * the swp_entry_t or the architecture definition of a
1640 * swap pte. 1783 * swap pte.
1641 */ 1784 */
1642 maxpages = swp_offset(pte_to_swp_entry(swp_entry_to_pte(swp_entry(0,~0UL)))) - 1; 1785 maxpages = swp_offset(pte_to_swp_entry(
1643 if (maxpages > swap_header->info.last_page) 1786 swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1;
1644 maxpages = swap_header->info.last_page; 1787 if (maxpages > swap_header->info.last_page)
1645 p->highest_bit = maxpages - 1; 1788 maxpages = swap_header->info.last_page;
1789 p->highest_bit = maxpages - 1;
1646 1790
1647 error = -EINVAL; 1791 error = -EINVAL;
1648 if (!maxpages) 1792 if (!maxpages)
1649 goto bad_swap; 1793 goto bad_swap;
1650 if (swapfilesize && maxpages > swapfilesize) { 1794 if (swapfilepages && maxpages > swapfilepages) {
1651 printk(KERN_WARNING 1795 printk(KERN_WARNING
1652 "Swap area shorter than signature indicates\n"); 1796 "Swap area shorter than signature indicates\n");
1653 goto bad_swap; 1797 goto bad_swap;
1654 } 1798 }
1655 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode)) 1799 if (swap_header->info.nr_badpages && S_ISREG(inode->i_mode))
1656 goto bad_swap; 1800 goto bad_swap;
1657 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) 1801 if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES)
1658 goto bad_swap; 1802 goto bad_swap;
1659 1803
1660 /* OK, set up the swap map and apply the bad block list */ 1804 /* OK, set up the swap map and apply the bad block list */
1661 swap_map = vmalloc(maxpages * sizeof(short)); 1805 swap_map = vmalloc(maxpages * sizeof(short));
1662 if (!swap_map) { 1806 if (!swap_map) {
1663 error = -ENOMEM; 1807 error = -ENOMEM;
1664 goto bad_swap; 1808 goto bad_swap;
1665 } 1809 }
1666 1810
1667 error = 0; 1811 memset(swap_map, 0, maxpages * sizeof(short));
1668 memset(swap_map, 0, maxpages * sizeof(short)); 1812 for (i = 0; i < swap_header->info.nr_badpages; i++) {
1669 for (i = 0; i < swap_header->info.nr_badpages; i++) { 1813 int page_nr = swap_header->info.badpages[i];
1670 int page_nr = swap_header->info.badpages[i]; 1814 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) {
1671 if (page_nr <= 0 || page_nr >= swap_header->info.last_page) 1815 error = -EINVAL;
1672 error = -EINVAL;
1673 else
1674 swap_map[page_nr] = SWAP_MAP_BAD;
1675 }
1676 nr_good_pages = swap_header->info.last_page -
1677 swap_header->info.nr_badpages -
1678 1 /* header page */;
1679 if (error)
1680 goto bad_swap; 1816 goto bad_swap;
1817 }
1818 swap_map[page_nr] = SWAP_MAP_BAD;
1681 } 1819 }
1682 1820
1821 error = swap_cgroup_swapon(type, maxpages);
1822 if (error)
1823 goto bad_swap;
1824
1825 nr_good_pages = swap_header->info.last_page -
1826 swap_header->info.nr_badpages -
1827 1 /* header page */;
1828
1683 if (nr_good_pages) { 1829 if (nr_good_pages) {
1684 swap_map[0] = SWAP_MAP_BAD; 1830 swap_map[0] = SWAP_MAP_BAD;
1685 p->max = maxpages; 1831 p->max = maxpages;
@@ -1697,6 +1843,13 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1697 goto bad_swap; 1843 goto bad_swap;
1698 } 1844 }
1699 1845
1846 if (blk_queue_nonrot(bdev_get_queue(p->bdev))) {
1847 p->flags |= SWP_SOLIDSTATE;
1848 p->cluster_next = 1 + (random32() % p->highest_bit);
1849 }
1850 if (discard_swap(p) == 0)
1851 p->flags |= SWP_DISCARDABLE;
1852
1700 mutex_lock(&swapon_mutex); 1853 mutex_lock(&swapon_mutex);
1701 spin_lock(&swap_lock); 1854 spin_lock(&swap_lock);
1702 if (swap_flags & SWAP_FLAG_PREFER) 1855 if (swap_flags & SWAP_FLAG_PREFER)
@@ -1705,14 +1858,16 @@ asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags)
1705 else 1858 else
1706 p->prio = --least_priority; 1859 p->prio = --least_priority;
1707 p->swap_map = swap_map; 1860 p->swap_map = swap_map;
1708 p->flags = SWP_ACTIVE; 1861 p->flags |= SWP_WRITEOK;
1709 nr_swap_pages += nr_good_pages; 1862 nr_swap_pages += nr_good_pages;
1710 total_swap_pages += nr_good_pages; 1863 total_swap_pages += nr_good_pages;
1711 1864
1712 printk(KERN_INFO "Adding %uk swap on %s. " 1865 printk(KERN_INFO "Adding %uk swap on %s. "
1713 "Priority:%d extents:%d across:%lluk\n", 1866 "Priority:%d extents:%d across:%lluk %s%s\n",
1714 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio, 1867 nr_good_pages<<(PAGE_SHIFT-10), name, p->prio,
1715 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10)); 1868 nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10),
1869 (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
1870 (p->flags & SWP_DISCARDABLE) ? "D" : "");
1716 1871
1717 /* insert swap space into swap_list: */ 1872 /* insert swap space into swap_list: */
1718 prev = -1; 1873 prev = -1;
@@ -1738,6 +1893,7 @@ bad_swap:
1738 bd_release(bdev); 1893 bd_release(bdev);
1739 } 1894 }
1740 destroy_swap_extents(p); 1895 destroy_swap_extents(p);
1896 swap_cgroup_swapoff(type);
1741bad_swap_2: 1897bad_swap_2:
1742 spin_lock(&swap_lock); 1898 spin_lock(&swap_lock);
1743 p->swap_file = NULL; 1899 p->swap_file = NULL;
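sys_swapon() above now flags solid-state devices via blk_queue_nonrot(), randomizes the first allocation cluster to spread wear, and probes discard support with discard_swap(). The header parsing keeps only version-2 ("SWAPSPACE2") signatures and relies on the fact that a version field of 1 can never byte-swap to 1 to spot a foreign-endian header. A small self-contained sketch of that endianness probe, with a userspace stand-in for swab32(); this is not the kernel's code:

#include <stdint.h>
#include <stdio.h>

static uint32_t swab32(uint32_t x)		/* byte-swap a 32-bit value */
{
	return (x >> 24) | ((x >> 8) & 0x0000ff00) |
	       ((x << 8) & 0x00ff0000) | (x << 24);
}

int main(void)
{
	uint32_t version = 0x01000000;	/* "1" as written by the other endianness */

	/* 1 never byte-swaps to 1, so the test is unambiguous */
	if (swab32(version) == 1) {
		version = swab32(version);	/* convert the field in place */
		printf("foreign-endian swap header, version %u\n", version);
	}
	return 0;
}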
diff --git a/mm/tiny-shmem.c b/mm/tiny-shmem.c
deleted file mode 100644
index 3e67d575ee6e..000000000000
--- a/mm/tiny-shmem.c
+++ /dev/null
@@ -1,134 +0,0 @@
1/*
2 * tiny-shmem.c: simple shmemfs and tmpfs using ramfs code
3 *
4 * Matt Mackall <mpm@selenic.com> January, 2004
5 * derived from mm/shmem.c and fs/ramfs/inode.c
6 *
7 * This is intended for small system where the benefits of the full
8 * shmem code (swap-backed and resource-limited) are outweighed by
9 * their complexity. On systems without swap this code should be
10 * effectively equivalent, but much lighter weight.
11 */
12
13#include <linux/fs.h>
14#include <linux/init.h>
15#include <linux/vfs.h>
16#include <linux/mount.h>
17#include <linux/file.h>
18#include <linux/mm.h>
19#include <linux/module.h>
20#include <linux/swap.h>
21#include <linux/ramfs.h>
22
23static struct file_system_type tmpfs_fs_type = {
24 .name = "tmpfs",
25 .get_sb = ramfs_get_sb,
26 .kill_sb = kill_litter_super,
27};
28
29static struct vfsmount *shm_mnt;
30
31static int __init init_tmpfs(void)
32{
33 BUG_ON(register_filesystem(&tmpfs_fs_type) != 0);
34
35 shm_mnt = kern_mount(&tmpfs_fs_type);
36 BUG_ON(IS_ERR(shm_mnt));
37
38 return 0;
39}
40module_init(init_tmpfs)
41
42/**
43 * shmem_file_setup - get an unlinked file living in tmpfs
44 * @name: name for dentry (to be seen in /proc/<pid>/maps
45 * @size: size to be set for the file
46 * @flags: vm_flags
47 */
48struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
49{
50 int error;
51 struct file *file;
52 struct inode *inode;
53 struct dentry *dentry, *root;
54 struct qstr this;
55
56 if (IS_ERR(shm_mnt))
57 return (void *)shm_mnt;
58
59 error = -ENOMEM;
60 this.name = name;
61 this.len = strlen(name);
62 this.hash = 0; /* will go */
63 root = shm_mnt->mnt_root;
64 dentry = d_alloc(root, &this);
65 if (!dentry)
66 goto put_memory;
67
68 error = -ENFILE;
69 file = get_empty_filp();
70 if (!file)
71 goto put_dentry;
72
73 error = -ENOSPC;
74 inode = ramfs_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
75 if (!inode)
76 goto close_file;
77
78 d_instantiate(dentry, inode);
79 inode->i_size = size;
80 inode->i_nlink = 0; /* It is unlinked */
81 init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ,
82 &ramfs_file_operations);
83
84#ifndef CONFIG_MMU
85 error = ramfs_nommu_expand_for_mapping(inode, size);
86 if (error)
87 goto close_file;
88#endif
89 return file;
90
91close_file:
92 put_filp(file);
93put_dentry:
94 dput(dentry);
95put_memory:
96 return ERR_PTR(error);
97}
98EXPORT_SYMBOL_GPL(shmem_file_setup);
99
100/**
101 * shmem_zero_setup - setup a shared anonymous mapping
102 * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
103 */
104int shmem_zero_setup(struct vm_area_struct *vma)
105{
106 struct file *file;
107 loff_t size = vma->vm_end - vma->vm_start;
108
109 file = shmem_file_setup("dev/zero", size, vma->vm_flags);
110 if (IS_ERR(file))
111 return PTR_ERR(file);
112
113 if (vma->vm_file)
114 fput(vma->vm_file);
115 vma->vm_file = file;
116 vma->vm_ops = &generic_file_vm_ops;
117 return 0;
118}
119
120int shmem_unuse(swp_entry_t entry, struct page *page)
121{
122 return 0;
123}
124
125#ifndef CONFIG_MMU
126unsigned long shmem_get_unmapped_area(struct file *file,
127 unsigned long addr,
128 unsigned long len,
129 unsigned long pgoff,
130 unsigned long flags)
131{
132 return ramfs_nommu_get_unmapped_area(file, addr, len, pgoff, flags);
133}
134#endif
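
With the tiny-shmem variant removed, shmem_file_setup() and shmem_zero_setup() are provided by mm/shmem.c on every configuration. A hedged sketch of a typical in-kernel caller of shmem_file_setup() follows; the wrapper name, buffer name and flags value are illustrative and not part of this patch:

    #include <linux/mm.h>
    #include <linux/err.h>
    #include <linux/fs.h>

    /* Sketch only: obtain an unlinked, tmpfs-backed file of a given size,
     * as the deleted kernel-doc above describes. */
    static struct file *example_alloc_shmem_buf(loff_t size)
    {
            struct file *filp = shmem_file_setup("example_buf", size, 0);

            if (IS_ERR(filp))
                    return filp;    /* -ENOMEM, -ENFILE, -ENOSPC, ... */

            /* The file is unlinked (i_nlink == 0); release it with fput()
             * when the buffer is no longer needed. */
            return filp;
    }
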
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 7465f22fec0c..c5db9a7264d9 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -14,6 +14,7 @@
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/slab.h> 15#include <linux/slab.h>
16#include <linux/spinlock.h> 16#include <linux/spinlock.h>
17#include <linux/mutex.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
18#include <linux/proc_fs.h> 19#include <linux/proc_fs.h>
19#include <linux/seq_file.h> 20#include <linux/seq_file.h>
@@ -381,8 +382,9 @@ found:
381 goto retry; 382 goto retry;
382 } 383 }
383 if (printk_ratelimit()) 384 if (printk_ratelimit())
384 printk(KERN_WARNING "vmap allocation failed: " 385 printk(KERN_WARNING
385 "use vmalloc=<size> to increase size.\n"); 386 "vmap allocation for size %lu failed: "
387 "use vmalloc=<size> to increase size.\n", size);
386 return ERR_PTR(-EBUSY); 388 return ERR_PTR(-EBUSY);
387 } 389 }
388 390
@@ -432,6 +434,27 @@ static void unmap_vmap_area(struct vmap_area *va)
432 vunmap_page_range(va->va_start, va->va_end); 434 vunmap_page_range(va->va_start, va->va_end);
433} 435}
434 436
437static void vmap_debug_free_range(unsigned long start, unsigned long end)
438{
439 /*
440 * Unmap page tables and force a TLB flush immediately if
 441 * CONFIG_DEBUG_PAGEALLOC is set. This catches use-after-free
 442 * bugs in the vmalloc area, much as the linear kernel mapping
 443 * catches them after a page has been freed.
444 *
445 * All the lazy freeing logic is still retained, in order to
446 * minimise intrusiveness of this debugging feature.
447 *
 448 * This is going to be *slow* (debugging of the linear kernel
 449 * mapping is a lot faster because it does not need a broadcast
 450 * TLB flush).
451 */
452#ifdef CONFIG_DEBUG_PAGEALLOC
453 vunmap_page_range(start, end);
454 flush_tlb_kernel_range(start, end);
455#endif
456}
457
435/* 458/*
436 * lazy_max_pages is the maximum amount of virtual address space we gather up 459 * lazy_max_pages is the maximum amount of virtual address space we gather up
437 * before attempting to purge with a TLB flush. 460 * before attempting to purge with a TLB flush.
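
The vmap_debug_free_range() helper added above trades speed for immediate detection: with CONFIG_DEBUG_PAGEALLOC the freed range is unmapped and flushed right away, so a stale access faults instead of silently hitting whatever later reuses the lazily freed area. A hedged, purely illustrative sketch of the kind of bug it catches (the surrounding function is hypothetical):

    #include <linux/mm.h>
    #include <linux/vmalloc.h>
    #include <linux/string.h>

    /* Illustrative only: a use-after-free through a stale vmap address.
     * With CONFIG_DEBUG_PAGEALLOC the final write faults immediately
     * thanks to vmap_debug_free_range(); without it, the lazily freed
     * range may still be mapped and the write corrupts a later user. */
    static void example_stale_vmap_access(struct page **pages, unsigned int nr)
    {
            void *va = vm_map_ram(pages, nr, -1, PAGE_KERNEL);

            if (!va)
                    return;
            memset(va, 0, nr * PAGE_SIZE);
            vm_unmap_ram(va, nr);

            *(int *)va = 0xdead;            /* BUG: use after free */
    }
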
@@ -472,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
472static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, 495static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
473 int sync, int force_flush) 496 int sync, int force_flush)
474{ 497{
475 static DEFINE_SPINLOCK(purge_lock); 498 static DEFINE_MUTEX(purge_lock);
476 LIST_HEAD(valist); 499 LIST_HEAD(valist);
477 struct vmap_area *va; 500 struct vmap_area *va;
478 int nr = 0; 501 int nr = 0;
@@ -483,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
483 * the case that isn't actually used at the moment anyway. 506 * the case that isn't actually used at the moment anyway.
484 */ 507 */
485 if (!sync && !force_flush) { 508 if (!sync && !force_flush) {
486 if (!spin_trylock(&purge_lock)) 509 if (!mutex_trylock(&purge_lock))
487 return; 510 return;
488 } else 511 } else
489 spin_lock(&purge_lock); 512 mutex_lock(&purge_lock);
490 513
491 rcu_read_lock(); 514 rcu_read_lock();
492 list_for_each_entry_rcu(va, &vmap_area_list, list) { 515 list_for_each_entry_rcu(va, &vmap_area_list, list) {
@@ -518,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
518 __free_vmap_area(va); 541 __free_vmap_area(va);
519 spin_unlock(&vmap_area_lock); 542 spin_unlock(&vmap_area_lock);
520 } 543 }
521 spin_unlock(&purge_lock); 544 mutex_unlock(&purge_lock);
522} 545}
523 546
524/* 547/*
@@ -912,6 +935,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
912 BUG_ON(addr & (PAGE_SIZE-1)); 935 BUG_ON(addr & (PAGE_SIZE-1));
913 936
914 debug_check_no_locks_freed(mem, size); 937 debug_check_no_locks_freed(mem, size);
938 vmap_debug_free_range(addr, addr+size);
915 939
916 if (likely(count <= VMAP_MAX_ALLOC)) 940 if (likely(count <= VMAP_MAX_ALLOC))
917 vb_free(mem, size); 941 vb_free(mem, size);
@@ -1128,6 +1152,8 @@ struct vm_struct *remove_vm_area(const void *addr)
1128 if (va && va->flags & VM_VM_AREA) { 1152 if (va && va->flags & VM_VM_AREA) {
1129 struct vm_struct *vm = va->private; 1153 struct vm_struct *vm = va->private;
1130 struct vm_struct *tmp, **p; 1154 struct vm_struct *tmp, **p;
1155
1156 vmap_debug_free_range(va->va_start, va->va_end);
1131 free_unmap_vmap_area(va); 1157 free_unmap_vmap_area(va);
1132 vm->size -= PAGE_SIZE; 1158 vm->size -= PAGE_SIZE;
1133 1159
@@ -1375,7 +1401,8 @@ void *vmalloc_user(unsigned long size)
1375 struct vm_struct *area; 1401 struct vm_struct *area;
1376 void *ret; 1402 void *ret;
1377 1403
1378 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1404 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1405 PAGE_KERNEL, -1, __builtin_return_address(0));
1379 if (ret) { 1406 if (ret) {
1380 area = find_vm_area(ret); 1407 area = find_vm_area(ret);
1381 area->flags |= VM_USERMAP; 1408 area->flags |= VM_USERMAP;
@@ -1420,7 +1447,8 @@ EXPORT_SYMBOL(vmalloc_node);
1420 1447
1421void *vmalloc_exec(unsigned long size) 1448void *vmalloc_exec(unsigned long size)
1422{ 1449{
1423 return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC); 1450 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1451 -1, __builtin_return_address(0));
1424} 1452}
1425 1453
1426#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32) 1454#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1440,7 +1468,8 @@ void *vmalloc_exec(unsigned long size)
1440 */ 1468 */
1441void *vmalloc_32(unsigned long size) 1469void *vmalloc_32(unsigned long size)
1442{ 1470{
1443 return __vmalloc(size, GFP_VMALLOC32, PAGE_KERNEL); 1471 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL,
1472 -1, __builtin_return_address(0));
1444} 1473}
1445EXPORT_SYMBOL(vmalloc_32); 1474EXPORT_SYMBOL(vmalloc_32);
1446 1475
@@ -1456,7 +1485,8 @@ void *vmalloc_32_user(unsigned long size)
1456 struct vm_struct *area; 1485 struct vm_struct *area;
1457 void *ret; 1486 void *ret;
1458 1487
1459 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1488 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1489 -1, __builtin_return_address(0));
1460 if (ret) { 1490 if (ret) {
1461 area = find_vm_area(ret); 1491 area = find_vm_area(ret);
1462 area->flags |= VM_USERMAP; 1492 area->flags |= VM_USERMAP;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d196f46c8808..9a27c44aa327 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,6 +52,9 @@ struct scan_control {
52 /* Incremented by the number of inactive pages that were scanned */ 52 /* Incremented by the number of inactive pages that were scanned */
53 unsigned long nr_scanned; 53 unsigned long nr_scanned;
54 54
55 /* Number of pages freed so far during a call to shrink_zones() */
56 unsigned long nr_reclaimed;
57
55 /* This context's GFP mask */ 58 /* This context's GFP mask */
56 gfp_t gfp_mask; 59 gfp_t gfp_mask;
57 60
@@ -122,11 +125,30 @@ static LIST_HEAD(shrinker_list);
122static DECLARE_RWSEM(shrinker_rwsem); 125static DECLARE_RWSEM(shrinker_rwsem);
123 126
124#ifdef CONFIG_CGROUP_MEM_RES_CTLR 127#ifdef CONFIG_CGROUP_MEM_RES_CTLR
125#define scan_global_lru(sc) (!(sc)->mem_cgroup) 128#define scanning_global_lru(sc) (!(sc)->mem_cgroup)
126#else 129#else
127#define scan_global_lru(sc) (1) 130#define scanning_global_lru(sc) (1)
128#endif 131#endif
129 132
133static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
134 struct scan_control *sc)
135{
136 if (!scanning_global_lru(sc))
137 return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
138
139 return &zone->reclaim_stat;
140}
141
142static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
143 enum lru_list lru)
144{
145 if (!scanning_global_lru(sc))
146 return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
147
148 return zone_page_state(zone, NR_LRU_BASE + lru);
149}
150
151
130/* 152/*
131 * Add a shrinker callback to be called from the vm 153 * Add a shrinker callback to be called from the vm
132 */ 154 */
@@ -509,7 +531,6 @@ redo:
509 lru = LRU_UNEVICTABLE; 531 lru = LRU_UNEVICTABLE;
510 add_page_to_unevictable_list(page); 532 add_page_to_unevictable_list(page);
511 } 533 }
512 mem_cgroup_move_lists(page, lru);
513 534
514 /* 535 /*
515 * page's status can change while we move it among lru. If an evictable 536 * page's status can change while we move it among lru. If an evictable
@@ -544,7 +565,6 @@ void putback_lru_page(struct page *page)
544 565
545 lru = !!TestClearPageActive(page) + page_is_file_cache(page); 566 lru = !!TestClearPageActive(page) + page_is_file_cache(page);
546 lru_cache_add_lru(page, lru); 567 lru_cache_add_lru(page, lru);
547 mem_cgroup_move_lists(page, lru);
548 put_page(page); 568 put_page(page);
549} 569}
550#endif /* CONFIG_UNEVICTABLE_LRU */ 570#endif /* CONFIG_UNEVICTABLE_LRU */
@@ -617,7 +637,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
617 referenced && page_mapping_inuse(page)) 637 referenced && page_mapping_inuse(page))
618 goto activate_locked; 638 goto activate_locked;
619 639
620#ifdef CONFIG_SWAP
621 /* 640 /*
622 * Anonymous process memory has backing store? 641 * Anonymous process memory has backing store?
623 * Try to allocate it some swap space here. 642 * Try to allocate it some swap space here.
@@ -625,20 +644,10 @@ static unsigned long shrink_page_list(struct list_head *page_list,
625 if (PageAnon(page) && !PageSwapCache(page)) { 644 if (PageAnon(page) && !PageSwapCache(page)) {
626 if (!(sc->gfp_mask & __GFP_IO)) 645 if (!(sc->gfp_mask & __GFP_IO))
627 goto keep_locked; 646 goto keep_locked;
628 switch (try_to_munlock(page)) { 647 if (!add_to_swap(page))
629 case SWAP_FAIL: /* shouldn't happen */
630 case SWAP_AGAIN:
631 goto keep_locked;
632 case SWAP_MLOCK:
633 goto cull_mlocked;
634 case SWAP_SUCCESS:
635 ; /* fall thru'; add to swap cache */
636 }
637 if (!add_to_swap(page, GFP_ATOMIC))
638 goto activate_locked; 648 goto activate_locked;
639 may_enter_fs = 1; 649 may_enter_fs = 1;
640 } 650 }
641#endif /* CONFIG_SWAP */
642 651
643 mapping = page_mapping(page); 652 mapping = page_mapping(page);
644 653
@@ -752,6 +761,8 @@ free_it:
752 continue; 761 continue;
753 762
754cull_mlocked: 763cull_mlocked:
764 if (PageSwapCache(page))
765 try_to_free_swap(page);
755 unlock_page(page); 766 unlock_page(page);
756 putback_lru_page(page); 767 putback_lru_page(page);
757 continue; 768 continue;
@@ -759,7 +770,7 @@ cull_mlocked:
759activate_locked: 770activate_locked:
760 /* Not a candidate for swapping, so reclaim swap space. */ 771 /* Not a candidate for swapping, so reclaim swap space. */
761 if (PageSwapCache(page) && vm_swap_full()) 772 if (PageSwapCache(page) && vm_swap_full())
762 remove_exclusive_swap_page_ref(page); 773 try_to_free_swap(page);
763 VM_BUG_ON(PageActive(page)); 774 VM_BUG_ON(PageActive(page));
764 SetPageActive(page); 775 SetPageActive(page);
765 pgactivate++; 776 pgactivate++;
@@ -819,6 +830,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
819 return ret; 830 return ret;
820 831
821 ret = -EBUSY; 832 ret = -EBUSY;
833
822 if (likely(get_page_unless_zero(page))) { 834 if (likely(get_page_unless_zero(page))) {
823 /* 835 /*
824 * Be careful not to clear PageLRU until after we're 836 * Be careful not to clear PageLRU until after we're
@@ -827,6 +839,7 @@ int __isolate_lru_page(struct page *page, int mode, int file)
827 */ 839 */
828 ClearPageLRU(page); 840 ClearPageLRU(page);
829 ret = 0; 841 ret = 0;
842 mem_cgroup_del_lru(page);
830 } 843 }
831 844
832 return ret; 845 return ret;
@@ -1035,6 +1048,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1035 struct pagevec pvec; 1048 struct pagevec pvec;
1036 unsigned long nr_scanned = 0; 1049 unsigned long nr_scanned = 0;
1037 unsigned long nr_reclaimed = 0; 1050 unsigned long nr_reclaimed = 0;
1051 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1038 1052
1039 pagevec_init(&pvec, 1); 1053 pagevec_init(&pvec, 1);
1040 1054
@@ -1076,13 +1090,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1076 __mod_zone_page_state(zone, NR_INACTIVE_ANON, 1090 __mod_zone_page_state(zone, NR_INACTIVE_ANON,
1077 -count[LRU_INACTIVE_ANON]); 1091 -count[LRU_INACTIVE_ANON]);
1078 1092
1079 if (scan_global_lru(sc)) { 1093 if (scanning_global_lru(sc))
1080 zone->pages_scanned += nr_scan; 1094 zone->pages_scanned += nr_scan;
1081 zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; 1095
1082 zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; 1096 reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
1083 zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; 1097 reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
1084 zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; 1098 reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
1085 } 1099 reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
1100
1086 spin_unlock_irq(&zone->lru_lock); 1101 spin_unlock_irq(&zone->lru_lock);
1087 1102
1088 nr_scanned += nr_scan; 1103 nr_scanned += nr_scan;
@@ -1114,7 +1129,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1114 if (current_is_kswapd()) { 1129 if (current_is_kswapd()) {
1115 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); 1130 __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
1116 __count_vm_events(KSWAPD_STEAL, nr_freed); 1131 __count_vm_events(KSWAPD_STEAL, nr_freed);
1117 } else if (scan_global_lru(sc)) 1132 } else if (scanning_global_lru(sc))
1118 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); 1133 __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
1119 1134
1120 __count_zone_vm_events(PGSTEAL, zone, nr_freed); 1135 __count_zone_vm_events(PGSTEAL, zone, nr_freed);
@@ -1140,10 +1155,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
1140 SetPageLRU(page); 1155 SetPageLRU(page);
1141 lru = page_lru(page); 1156 lru = page_lru(page);
1142 add_page_to_lru_list(zone, page, lru); 1157 add_page_to_lru_list(zone, page, lru);
1143 mem_cgroup_move_lists(page, lru); 1158 if (PageActive(page)) {
1144 if (PageActive(page) && scan_global_lru(sc)) {
1145 int file = !!page_is_file_cache(page); 1159 int file = !!page_is_file_cache(page);
1146 zone->recent_rotated[file]++; 1160 reclaim_stat->recent_rotated[file]++;
1147 } 1161 }
1148 if (!pagevec_add(&pvec, page)) { 1162 if (!pagevec_add(&pvec, page)) {
1149 spin_unlock_irq(&zone->lru_lock); 1163 spin_unlock_irq(&zone->lru_lock);
@@ -1173,11 +1187,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority)
1173 zone->prev_priority = priority; 1187 zone->prev_priority = priority;
1174} 1188}
1175 1189
1176static inline int zone_is_near_oom(struct zone *zone)
1177{
1178 return zone->pages_scanned >= (zone_lru_pages(zone) * 3);
1179}
1180
1181/* 1190/*
1182 * This moves pages from the active list to the inactive list. 1191 * This moves pages from the active list to the inactive list.
1183 * 1192 *
@@ -1208,6 +1217,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1208 struct page *page; 1217 struct page *page;
1209 struct pagevec pvec; 1218 struct pagevec pvec;
1210 enum lru_list lru; 1219 enum lru_list lru;
1220 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1211 1221
1212 lru_add_drain(); 1222 lru_add_drain();
1213 spin_lock_irq(&zone->lru_lock); 1223 spin_lock_irq(&zone->lru_lock);
@@ -1218,10 +1228,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
 1218 * zone->pages_scanned is used to detect zone's oom 1228 * zone->pages_scanned is used to detect zone's oom
1219 * mem_cgroup remembers nr_scan by itself. 1229 * mem_cgroup remembers nr_scan by itself.
1220 */ 1230 */
1221 if (scan_global_lru(sc)) { 1231 if (scanning_global_lru(sc)) {
1222 zone->pages_scanned += pgscanned; 1232 zone->pages_scanned += pgscanned;
1223 zone->recent_scanned[!!file] += pgmoved;
1224 } 1233 }
1234 reclaim_stat->recent_scanned[!!file] += pgmoved;
1225 1235
1226 if (file) 1236 if (file)
1227 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); 1237 __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved);
@@ -1248,6 +1258,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1248 list_add(&page->lru, &l_inactive); 1258 list_add(&page->lru, &l_inactive);
1249 } 1259 }
1250 1260
1261 /*
1262 * Move the pages to the [file or anon] inactive list.
1263 */
1264 pagevec_init(&pvec, 1);
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267
1251 spin_lock_irq(&zone->lru_lock); 1268 spin_lock_irq(&zone->lru_lock);
1252 /* 1269 /*
1253 * Count referenced pages from currently used mappings as 1270 * Count referenced pages from currently used mappings as
@@ -1255,15 +1272,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1255 * This helps balance scan pressure between file and anonymous 1272 * This helps balance scan pressure between file and anonymous
1256 * pages in get_scan_ratio. 1273 * pages in get_scan_ratio.
1257 */ 1274 */
1258 zone->recent_rotated[!!file] += pgmoved; 1275 reclaim_stat->recent_rotated[!!file] += pgmoved;
1259 1276
1260 /*
1261 * Move the pages to the [file or anon] inactive list.
1262 */
1263 pagevec_init(&pvec, 1);
1264
1265 pgmoved = 0;
1266 lru = LRU_BASE + file * LRU_FILE;
1267 while (!list_empty(&l_inactive)) { 1277 while (!list_empty(&l_inactive)) {
1268 page = lru_to_page(&l_inactive); 1278 page = lru_to_page(&l_inactive);
1269 prefetchw_prev_lru_page(page, &l_inactive, flags); 1279 prefetchw_prev_lru_page(page, &l_inactive, flags);
@@ -1273,7 +1283,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1273 ClearPageActive(page); 1283 ClearPageActive(page);
1274 1284
1275 list_move(&page->lru, &zone->lru[lru].list); 1285 list_move(&page->lru, &zone->lru[lru].list);
1276 mem_cgroup_move_lists(page, lru); 1286 mem_cgroup_add_lru_list(page, lru);
1277 pgmoved++; 1287 pgmoved++;
1278 if (!pagevec_add(&pvec, page)) { 1288 if (!pagevec_add(&pvec, page)) {
1279 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1289 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
@@ -1302,6 +1312,38 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1302 pagevec_release(&pvec); 1312 pagevec_release(&pvec);
1303} 1313}
1304 1314
1315static int inactive_anon_is_low_global(struct zone *zone)
1316{
1317 unsigned long active, inactive;
1318
1319 active = zone_page_state(zone, NR_ACTIVE_ANON);
1320 inactive = zone_page_state(zone, NR_INACTIVE_ANON);
1321
1322 if (inactive * zone->inactive_ratio < active)
1323 return 1;
1324
1325 return 0;
1326}
1327
1328/**
1329 * inactive_anon_is_low - check if anonymous pages need to be deactivated
1330 * @zone: zone to check
1331 * @sc: scan control of this context
1332 *
1333 * Returns true if the zone does not have enough inactive anon pages,
1334 * meaning some active anon pages need to be deactivated.
1335 */
1336static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc)
1337{
1338 int low;
1339
1340 if (scanning_global_lru(sc))
1341 low = inactive_anon_is_low_global(zone);
1342 else
1343 low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup);
1344 return low;
1345}
1346
1305static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, 1347static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1306 struct zone *zone, struct scan_control *sc, int priority) 1348 struct zone *zone, struct scan_control *sc, int priority)
1307{ 1349{
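
A worked example of the inactive_anon_is_low_global() test added above, with made-up numbers: if zone->inactive_ratio is 3 and the zone holds 400000 active and 100000 inactive anonymous pages, then

    inactive * inactive_ratio = 100000 * 3 = 300000 < 400000 = active

so the function returns 1 and shrink_list() responds by moving some active anon pages to the inactive list rather than reclaiming immediately.
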
@@ -1312,8 +1354,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
1312 return 0; 1354 return 0;
1313 } 1355 }
1314 1356
1315 if (lru == LRU_ACTIVE_ANON && 1357 if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
1316 (!scan_global_lru(sc) || inactive_anon_is_low(zone))) {
1317 shrink_active_list(nr_to_scan, zone, sc, priority, file); 1358 shrink_active_list(nr_to_scan, zone, sc, priority, file);
1318 return 0; 1359 return 0;
1319 } 1360 }
@@ -1335,12 +1376,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1335 unsigned long anon, file, free; 1376 unsigned long anon, file, free;
1336 unsigned long anon_prio, file_prio; 1377 unsigned long anon_prio, file_prio;
1337 unsigned long ap, fp; 1378 unsigned long ap, fp;
1338 1379 struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
1339 anon = zone_page_state(zone, NR_ACTIVE_ANON) +
1340 zone_page_state(zone, NR_INACTIVE_ANON);
1341 file = zone_page_state(zone, NR_ACTIVE_FILE) +
1342 zone_page_state(zone, NR_INACTIVE_FILE);
1343 free = zone_page_state(zone, NR_FREE_PAGES);
1344 1380
1345 /* If we have no swap space, do not bother scanning anon pages. */ 1381 /* If we have no swap space, do not bother scanning anon pages. */
1346 if (nr_swap_pages <= 0) { 1382 if (nr_swap_pages <= 0) {
@@ -1349,11 +1385,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1349 return; 1385 return;
1350 } 1386 }
1351 1387
1352 /* If we have very few page cache pages, force-scan anon pages. */ 1388 anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) +
1353 if (unlikely(file + free <= zone->pages_high)) { 1389 zone_nr_pages(zone, sc, LRU_INACTIVE_ANON);
1354 percent[0] = 100; 1390 file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) +
1355 percent[1] = 0; 1391 zone_nr_pages(zone, sc, LRU_INACTIVE_FILE);
1356 return; 1392
1393 if (scanning_global_lru(sc)) {
1394 free = zone_page_state(zone, NR_FREE_PAGES);
1395 /* If we have very few page cache pages,
1396 force-scan anon pages. */
1397 if (unlikely(file + free <= zone->pages_high)) {
1398 percent[0] = 100;
1399 percent[1] = 0;
1400 return;
1401 }
1357 } 1402 }
1358 1403
1359 /* 1404 /*
@@ -1367,17 +1412,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1367 * 1412 *
1368 * anon in [0], file in [1] 1413 * anon in [0], file in [1]
1369 */ 1414 */
1370 if (unlikely(zone->recent_scanned[0] > anon / 4)) { 1415 if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
1371 spin_lock_irq(&zone->lru_lock); 1416 spin_lock_irq(&zone->lru_lock);
1372 zone->recent_scanned[0] /= 2; 1417 reclaim_stat->recent_scanned[0] /= 2;
1373 zone->recent_rotated[0] /= 2; 1418 reclaim_stat->recent_rotated[0] /= 2;
1374 spin_unlock_irq(&zone->lru_lock); 1419 spin_unlock_irq(&zone->lru_lock);
1375 } 1420 }
1376 1421
1377 if (unlikely(zone->recent_scanned[1] > file / 4)) { 1422 if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
1378 spin_lock_irq(&zone->lru_lock); 1423 spin_lock_irq(&zone->lru_lock);
1379 zone->recent_scanned[1] /= 2; 1424 reclaim_stat->recent_scanned[1] /= 2;
1380 zone->recent_rotated[1] /= 2; 1425 reclaim_stat->recent_rotated[1] /= 2;
1381 spin_unlock_irq(&zone->lru_lock); 1426 spin_unlock_irq(&zone->lru_lock);
1382 } 1427 }
1383 1428
@@ -1393,11 +1438,11 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1393 * proportional to the fraction of recently scanned pages on 1438 * proportional to the fraction of recently scanned pages on
1394 * each list that were recently referenced and in active use. 1439 * each list that were recently referenced and in active use.
1395 */ 1440 */
1396 ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); 1441 ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1);
1397 ap /= zone->recent_rotated[0] + 1; 1442 ap /= reclaim_stat->recent_rotated[0] + 1;
1398 1443
1399 fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); 1444 fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1);
1400 fp /= zone->recent_rotated[1] + 1; 1445 fp /= reclaim_stat->recent_rotated[1] + 1;
1401 1446
1402 /* Normalize to percentages */ 1447 /* Normalize to percentages */
1403 percent[0] = 100 * ap / (ap + fp + 1); 1448 percent[0] = 100 * ap / (ap + fp + 1);
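
To make the ratio concrete, a worked example with illustrative numbers, assuming the default swappiness of 60 so that the unchanged lines earlier in this function set anon_prio = 60 and file_prio = 140: with recent_scanned = {1000, 2000} and recent_rotated = {100, 1500},

    ap = (60 + 1) * (1000 + 1) / (100 + 1)   = 604
    fp = (140 + 1) * (2000 + 1) / (1500 + 1) = 187
    percent[0] = 100 * 604 / (604 + 187 + 1) = 76

so roughly 76% of the scanning pressure goes to the anon lists and the remaining 24% to the file lists, whether the counters come from the zone or, after this change, from the per-memcg reclaim_stat.
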
@@ -1408,69 +1453,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
1408/* 1453/*
1409 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. 1454 * This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
1410 */ 1455 */
1411static unsigned long shrink_zone(int priority, struct zone *zone, 1456static void shrink_zone(int priority, struct zone *zone,
1412 struct scan_control *sc) 1457 struct scan_control *sc)
1413{ 1458{
1414 unsigned long nr[NR_LRU_LISTS]; 1459 unsigned long nr[NR_LRU_LISTS];
1415 unsigned long nr_to_scan; 1460 unsigned long nr_to_scan;
1416 unsigned long nr_reclaimed = 0;
1417 unsigned long percent[2]; /* anon @ 0; file @ 1 */ 1461 unsigned long percent[2]; /* anon @ 0; file @ 1 */
1418 enum lru_list l; 1462 enum lru_list l;
1463 unsigned long nr_reclaimed = sc->nr_reclaimed;
1464 unsigned long swap_cluster_max = sc->swap_cluster_max;
1419 1465
1420 get_scan_ratio(zone, sc, percent); 1466 get_scan_ratio(zone, sc, percent);
1421 1467
1422 for_each_evictable_lru(l) { 1468 for_each_evictable_lru(l) {
1423 if (scan_global_lru(sc)) { 1469 int file = is_file_lru(l);
1424 int file = is_file_lru(l); 1470 int scan;
1425 int scan; 1471
1426 1472 scan = zone_page_state(zone, NR_LRU_BASE + l);
1427 scan = zone_page_state(zone, NR_LRU_BASE + l); 1473 if (priority) {
1428 if (priority) { 1474 scan >>= priority;
1429 scan >>= priority; 1475 scan = (scan * percent[file]) / 100;
1430 scan = (scan * percent[file]) / 100; 1476 }
1431 } 1477 if (scanning_global_lru(sc)) {
1432 zone->lru[l].nr_scan += scan; 1478 zone->lru[l].nr_scan += scan;
1433 nr[l] = zone->lru[l].nr_scan; 1479 nr[l] = zone->lru[l].nr_scan;
1434 if (nr[l] >= sc->swap_cluster_max) 1480 if (nr[l] >= swap_cluster_max)
1435 zone->lru[l].nr_scan = 0; 1481 zone->lru[l].nr_scan = 0;
1436 else 1482 else
1437 nr[l] = 0; 1483 nr[l] = 0;
1438 } else { 1484 } else
1439 /* 1485 nr[l] = scan;
1440 * This reclaim occurs not because zone memory shortage
1441 * but because memory controller hits its limit.
1442 * Don't modify zone reclaim related data.
1443 */
1444 nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone,
1445 priority, l);
1446 }
1447 } 1486 }
1448 1487
1449 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || 1488 while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
1450 nr[LRU_INACTIVE_FILE]) { 1489 nr[LRU_INACTIVE_FILE]) {
1451 for_each_evictable_lru(l) { 1490 for_each_evictable_lru(l) {
1452 if (nr[l]) { 1491 if (nr[l]) {
1453 nr_to_scan = min(nr[l], 1492 nr_to_scan = min(nr[l], swap_cluster_max);
1454 (unsigned long)sc->swap_cluster_max);
1455 nr[l] -= nr_to_scan; 1493 nr[l] -= nr_to_scan;
1456 1494
1457 nr_reclaimed += shrink_list(l, nr_to_scan, 1495 nr_reclaimed += shrink_list(l, nr_to_scan,
1458 zone, sc, priority); 1496 zone, sc, priority);
1459 } 1497 }
1460 } 1498 }
1499 /*
1500 * On large memory systems, scan >> priority can become
1501 * really large. This is fine for the starting priority;
1502 * we want to put equal scanning pressure on each zone.
1503 * However, if the VM has a harder time of freeing pages,
1504 * with multiple processes reclaiming pages, the total
1505 * freeing target can get unreasonably large.
1506 */
1507 if (nr_reclaimed > swap_cluster_max &&
1508 priority < DEF_PRIORITY && !current_is_kswapd())
1509 break;
1461 } 1510 }
1462 1511
1512 sc->nr_reclaimed = nr_reclaimed;
1513
1463 /* 1514 /*
1464 * Even if we did not try to evict anon pages at all, we want to 1515 * Even if we did not try to evict anon pages at all, we want to
1465 * rebalance the anon lru active/inactive ratio. 1516 * rebalance the anon lru active/inactive ratio.
1466 */ 1517 */
1467 if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) 1518 if (inactive_anon_is_low(zone, sc))
1468 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1469 else if (!scan_global_lru(sc))
1470 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); 1519 shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
1471 1520
1472 throttle_vm_writeout(sc->gfp_mask); 1521 throttle_vm_writeout(sc->gfp_mask);
1473 return nr_reclaimed;
1474} 1522}
1475 1523
1476/* 1524/*
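
The reworked loop above sizes each list's scan target the same way for global and memcg reclaim. A worked example with illustrative numbers: for a zone with 1048576 inactive file pages at the starting priority of 12 and percent[1] = 24,

    scan = 1048576 >> 12   = 256
    scan = 256 * 24 / 100  = 61

Since 61 >= swap_cluster_max (normally SWAP_CLUSTER_MAX, i.e. 32), nr[LRU_INACTIVE_FILE] becomes 61 and the per-zone accumulator is reset; shrink_list() is then invoked in chunks of at most 32 pages. The new early break keeps a direct reclaimer from overshooting once sc->nr_reclaimed already exceeds that chunk size at priorities below the default.
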
@@ -1484,16 +1532,13 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
1484 * b) The zones may be over pages_high but they must go *over* pages_high to 1532 * b) The zones may be over pages_high but they must go *over* pages_high to
1485 * satisfy the `incremental min' zone defense algorithm. 1533 * satisfy the `incremental min' zone defense algorithm.
1486 * 1534 *
1487 * Returns the number of reclaimed pages.
1488 *
1489 * If a zone is deemed to be full of pinned pages then just give it a light 1535 * If a zone is deemed to be full of pinned pages then just give it a light
1490 * scan then give up on it. 1536 * scan then give up on it.
1491 */ 1537 */
1492static unsigned long shrink_zones(int priority, struct zonelist *zonelist, 1538static void shrink_zones(int priority, struct zonelist *zonelist,
1493 struct scan_control *sc) 1539 struct scan_control *sc)
1494{ 1540{
1495 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); 1541 enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
1496 unsigned long nr_reclaimed = 0;
1497 struct zoneref *z; 1542 struct zoneref *z;
1498 struct zone *zone; 1543 struct zone *zone;
1499 1544
@@ -1505,7 +1550,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1505 * Take care memory controller reclaiming has small influence 1550 * Take care memory controller reclaiming has small influence
1506 * to global LRU. 1551 * to global LRU.
1507 */ 1552 */
1508 if (scan_global_lru(sc)) { 1553 if (scanning_global_lru(sc)) {
1509 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1554 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
1510 continue; 1555 continue;
1511 note_zone_scanning_priority(zone, priority); 1556 note_zone_scanning_priority(zone, priority);
@@ -1524,10 +1569,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
1524 priority); 1569 priority);
1525 } 1570 }
1526 1571
1527 nr_reclaimed += shrink_zone(priority, zone, sc); 1572 shrink_zone(priority, zone, sc);
1528 } 1573 }
1529
1530 return nr_reclaimed;
1531} 1574}
1532 1575
1533/* 1576/*
@@ -1552,7 +1595,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1552 int priority; 1595 int priority;
1553 unsigned long ret = 0; 1596 unsigned long ret = 0;
1554 unsigned long total_scanned = 0; 1597 unsigned long total_scanned = 0;
1555 unsigned long nr_reclaimed = 0;
1556 struct reclaim_state *reclaim_state = current->reclaim_state; 1598 struct reclaim_state *reclaim_state = current->reclaim_state;
1557 unsigned long lru_pages = 0; 1599 unsigned long lru_pages = 0;
1558 struct zoneref *z; 1600 struct zoneref *z;
@@ -1561,12 +1603,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1561 1603
1562 delayacct_freepages_start(); 1604 delayacct_freepages_start();
1563 1605
1564 if (scan_global_lru(sc)) 1606 if (scanning_global_lru(sc))
1565 count_vm_event(ALLOCSTALL); 1607 count_vm_event(ALLOCSTALL);
1566 /* 1608 /*
1567 * mem_cgroup will not do shrink_slab. 1609 * mem_cgroup will not do shrink_slab.
1568 */ 1610 */
1569 if (scan_global_lru(sc)) { 1611 if (scanning_global_lru(sc)) {
1570 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1612 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1571 1613
1572 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1614 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1580,21 +1622,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1580 sc->nr_scanned = 0; 1622 sc->nr_scanned = 0;
1581 if (!priority) 1623 if (!priority)
1582 disable_swap_token(); 1624 disable_swap_token();
1583 nr_reclaimed += shrink_zones(priority, zonelist, sc); 1625 shrink_zones(priority, zonelist, sc);
1584 /* 1626 /*
1585 * Don't shrink slabs when reclaiming memory from 1627 * Don't shrink slabs when reclaiming memory from
1586 * over limit cgroups 1628 * over limit cgroups
1587 */ 1629 */
1588 if (scan_global_lru(sc)) { 1630 if (scanning_global_lru(sc)) {
1589 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); 1631 shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
1590 if (reclaim_state) { 1632 if (reclaim_state) {
1591 nr_reclaimed += reclaim_state->reclaimed_slab; 1633 sc->nr_reclaimed += reclaim_state->reclaimed_slab;
1592 reclaim_state->reclaimed_slab = 0; 1634 reclaim_state->reclaimed_slab = 0;
1593 } 1635 }
1594 } 1636 }
1595 total_scanned += sc->nr_scanned; 1637 total_scanned += sc->nr_scanned;
1596 if (nr_reclaimed >= sc->swap_cluster_max) { 1638 if (sc->nr_reclaimed >= sc->swap_cluster_max) {
1597 ret = nr_reclaimed; 1639 ret = sc->nr_reclaimed;
1598 goto out; 1640 goto out;
1599 } 1641 }
1600 1642
@@ -1616,8 +1658,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
1616 congestion_wait(WRITE, HZ/10); 1658 congestion_wait(WRITE, HZ/10);
1617 } 1659 }
1618 /* top priority shrink_zones still had more to do? don't OOM, then */ 1660 /* top priority shrink_zones still had more to do? don't OOM, then */
1619 if (!sc->all_unreclaimable && scan_global_lru(sc)) 1661 if (!sc->all_unreclaimable && scanning_global_lru(sc))
1620 ret = nr_reclaimed; 1662 ret = sc->nr_reclaimed;
1621out: 1663out:
1622 /* 1664 /*
1623 * Now that we've scanned all the zones at this priority level, note 1665 * Now that we've scanned all the zones at this priority level, note
@@ -1629,7 +1671,7 @@ out:
1629 if (priority < 0) 1671 if (priority < 0)
1630 priority = 0; 1672 priority = 0;
1631 1673
1632 if (scan_global_lru(sc)) { 1674 if (scanning_global_lru(sc)) {
1633 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1675 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
1634 1676
1635 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) 1677 if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
@@ -1665,19 +1707,24 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1665#ifdef CONFIG_CGROUP_MEM_RES_CTLR 1707#ifdef CONFIG_CGROUP_MEM_RES_CTLR
1666 1708
1667unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, 1709unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1668 gfp_t gfp_mask) 1710 gfp_t gfp_mask,
1711 bool noswap,
1712 unsigned int swappiness)
1669{ 1713{
1670 struct scan_control sc = { 1714 struct scan_control sc = {
1671 .may_writepage = !laptop_mode, 1715 .may_writepage = !laptop_mode,
1672 .may_swap = 1, 1716 .may_swap = 1,
1673 .swap_cluster_max = SWAP_CLUSTER_MAX, 1717 .swap_cluster_max = SWAP_CLUSTER_MAX,
1674 .swappiness = vm_swappiness, 1718 .swappiness = swappiness,
1675 .order = 0, 1719 .order = 0,
1676 .mem_cgroup = mem_cont, 1720 .mem_cgroup = mem_cont,
1677 .isolate_pages = mem_cgroup_isolate_pages, 1721 .isolate_pages = mem_cgroup_isolate_pages,
1678 }; 1722 };
1679 struct zonelist *zonelist; 1723 struct zonelist *zonelist;
1680 1724
1725 if (noswap)
1726 sc.may_swap = 0;
1727
1681 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1682 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
1683 zonelist = NODE_DATA(numa_node_id())->node_zonelists; 1730 zonelist = NODE_DATA(numa_node_id())->node_zonelists;
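
The widened interface above lets the memory controller pass its own swappiness and ask for reclaim without touching swap, which is useful when the mem+swap limit rather than the memory limit has been hit. A hedged sketch of a caller; the wrapper name and the use of the global vm_swappiness are illustrative, while the memcontrol.c callers can pass a per-cgroup value:

    #include <linux/swap.h>
    #include <linux/gfp.h>

    /* Sketch only: drive memcg reclaim through the interface above. */
    static unsigned long example_memcg_reclaim(struct mem_cgroup *memcg,
                                               gfp_t gfp_mask, bool noswap)
    {
            /* noswap == true: swapping pages out cannot relieve the
             * pressure (e.g. the mem+swap limit is what was exceeded). */
            return try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap,
                                                vm_swappiness);
    }
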
@@ -1712,7 +1759,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1712 int priority; 1759 int priority;
1713 int i; 1760 int i;
1714 unsigned long total_scanned; 1761 unsigned long total_scanned;
1715 unsigned long nr_reclaimed;
1716 struct reclaim_state *reclaim_state = current->reclaim_state; 1762 struct reclaim_state *reclaim_state = current->reclaim_state;
1717 struct scan_control sc = { 1763 struct scan_control sc = {
1718 .gfp_mask = GFP_KERNEL, 1764 .gfp_mask = GFP_KERNEL,
@@ -1731,7 +1777,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1731 1777
1732loop_again: 1778loop_again:
1733 total_scanned = 0; 1779 total_scanned = 0;
1734 nr_reclaimed = 0; 1780 sc.nr_reclaimed = 0;
1735 sc.may_writepage = !laptop_mode; 1781 sc.may_writepage = !laptop_mode;
1736 count_vm_event(PAGEOUTRUN); 1782 count_vm_event(PAGEOUTRUN);
1737 1783
@@ -1766,7 +1812,7 @@ loop_again:
1766 * Do some background aging of the anon list, to give 1812 * Do some background aging of the anon list, to give
1767 * pages a chance to be referenced before reclaiming. 1813 * pages a chance to be referenced before reclaiming.
1768 */ 1814 */
1769 if (inactive_anon_is_low(zone)) 1815 if (inactive_anon_is_low(zone, &sc))
1770 shrink_active_list(SWAP_CLUSTER_MAX, zone, 1816 shrink_active_list(SWAP_CLUSTER_MAX, zone,
1771 &sc, priority, 0); 1817 &sc, priority, 0);
1772 1818
@@ -1817,11 +1863,11 @@ loop_again:
1817 */ 1863 */
1818 if (!zone_watermark_ok(zone, order, 8*zone->pages_high, 1864 if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
1819 end_zone, 0)) 1865 end_zone, 0))
1820 nr_reclaimed += shrink_zone(priority, zone, &sc); 1866 shrink_zone(priority, zone, &sc);
1821 reclaim_state->reclaimed_slab = 0; 1867 reclaim_state->reclaimed_slab = 0;
1822 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, 1868 nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
1823 lru_pages); 1869 lru_pages);
1824 nr_reclaimed += reclaim_state->reclaimed_slab; 1870 sc.nr_reclaimed += reclaim_state->reclaimed_slab;
1825 total_scanned += sc.nr_scanned; 1871 total_scanned += sc.nr_scanned;
1826 if (zone_is_all_unreclaimable(zone)) 1872 if (zone_is_all_unreclaimable(zone))
1827 continue; 1873 continue;
@@ -1835,7 +1881,7 @@ loop_again:
1835 * even in laptop mode 1881 * even in laptop mode
1836 */ 1882 */
1837 if (total_scanned > SWAP_CLUSTER_MAX * 2 && 1883 if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
1838 total_scanned > nr_reclaimed + nr_reclaimed / 2) 1884 total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
1839 sc.may_writepage = 1; 1885 sc.may_writepage = 1;
1840 } 1886 }
1841 if (all_zones_ok) 1887 if (all_zones_ok)
@@ -1853,7 +1899,7 @@ loop_again:
1853 * matches the direct reclaim path behaviour in terms of impact 1899 * matches the direct reclaim path behaviour in terms of impact
1854 * on zone->*_priority. 1900 * on zone->*_priority.
1855 */ 1901 */
1856 if (nr_reclaimed >= SWAP_CLUSTER_MAX) 1902 if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX)
1857 break; 1903 break;
1858 } 1904 }
1859out: 1905out:
@@ -1872,10 +1918,27 @@ out:
1872 1918
1873 try_to_freeze(); 1919 try_to_freeze();
1874 1920
1921 /*
1922 * Fragmentation may mean that the system cannot be
1923 * rebalanced for high-order allocations in all zones.
1924 * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX,
1925 * it means the zones have been fully scanned and are still
1926 * not balanced. For high-order allocations, there is
1927 * little point trying all over again as kswapd may
 1928 * loop forever.
1929 *
1930 * Instead, recheck all watermarks at order-0 as they
1931 * are the most important. If watermarks are ok, kswapd will go
1932 * back to sleep. High-order users can still perform direct
1933 * reclaim if they wish.
1934 */
1935 if (sc.nr_reclaimed < SWAP_CLUSTER_MAX)
1936 order = sc.order = 0;
1937
1875 goto loop_again; 1938 goto loop_again;
1876 } 1939 }
1877 1940
1878 return nr_reclaimed; 1941 return sc.nr_reclaimed;
1879} 1942}
1880 1943
1881/* 1944/*
@@ -2227,7 +2290,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2227 struct task_struct *p = current; 2290 struct task_struct *p = current;
2228 struct reclaim_state reclaim_state; 2291 struct reclaim_state reclaim_state;
2229 int priority; 2292 int priority;
2230 unsigned long nr_reclaimed = 0;
2231 struct scan_control sc = { 2293 struct scan_control sc = {
2232 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2294 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2233 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2295 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
@@ -2260,9 +2322,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2260 priority = ZONE_RECLAIM_PRIORITY; 2322 priority = ZONE_RECLAIM_PRIORITY;
2261 do { 2323 do {
2262 note_zone_scanning_priority(zone, priority); 2324 note_zone_scanning_priority(zone, priority);
2263 nr_reclaimed += shrink_zone(priority, zone, &sc); 2325 shrink_zone(priority, zone, &sc);
2264 priority--; 2326 priority--;
2265 } while (priority >= 0 && nr_reclaimed < nr_pages); 2327 } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
2266 } 2328 }
2267 2329
2268 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2330 slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
@@ -2286,13 +2348,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2286 * Update nr_reclaimed by the number of slab pages we 2348 * Update nr_reclaimed by the number of slab pages we
2287 * reclaimed from this zone. 2349 * reclaimed from this zone.
2288 */ 2350 */
2289 nr_reclaimed += slab_reclaimable - 2351 sc.nr_reclaimed += slab_reclaimable -
2290 zone_page_state(zone, NR_SLAB_RECLAIMABLE); 2352 zone_page_state(zone, NR_SLAB_RECLAIMABLE);
2291 } 2353 }
2292 2354
2293 p->reclaim_state = NULL; 2355 p->reclaim_state = NULL;
2294 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); 2356 current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
2295 return nr_reclaimed >= nr_pages; 2357 return sc.nr_reclaimed >= nr_pages;
2296} 2358}
2297 2359
2298int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) 2360int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
@@ -2393,6 +2455,7 @@ retry:
2393 2455
2394 __dec_zone_state(zone, NR_UNEVICTABLE); 2456 __dec_zone_state(zone, NR_UNEVICTABLE);
2395 list_move(&page->lru, &zone->lru[l].list); 2457 list_move(&page->lru, &zone->lru[l].list);
2458 mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
2396 __inc_zone_state(zone, NR_INACTIVE_ANON + l); 2459 __inc_zone_state(zone, NR_INACTIVE_ANON + l);
2397 __count_vm_event(UNEVICTABLE_PGRESCUED); 2460 __count_vm_event(UNEVICTABLE_PGRESCUED);
2398 } else { 2461 } else {
@@ -2401,6 +2464,7 @@ retry:
2401 */ 2464 */
2402 SetPageUnevictable(page); 2465 SetPageUnevictable(page);
2403 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); 2466 list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
2467 mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
2404 if (page_evictable(page, NULL)) 2468 if (page_evictable(page, NULL))
2405 goto retry; 2469 goto retry;
2406 } 2470 }
@@ -2472,7 +2536,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
2472 * back onto @zone's unevictable list. 2536 * back onto @zone's unevictable list.
2473 */ 2537 */
2474#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ 2538#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
2475void scan_zone_unevictable_pages(struct zone *zone) 2539static void scan_zone_unevictable_pages(struct zone *zone)
2476{ 2540{
2477 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; 2541 struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
2478 unsigned long scan; 2542 unsigned long scan;
@@ -2514,7 +2578,7 @@ void scan_zone_unevictable_pages(struct zone *zone)
2514 * that has possibly/probably made some previously unevictable pages 2578 * that has possibly/probably made some previously unevictable pages
2515 * evictable. 2579 * evictable.
2516 */ 2580 */
2517void scan_all_zones_unevictable_pages(void) 2581static void scan_all_zones_unevictable_pages(void)
2518{ 2582{
2519 struct zone *zone; 2583 struct zone *zone;
2520 2584