Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig            |   9
-rw-r--r--  mm/Kconfig.debug      |  26
-rw-r--r--  mm/Makefile           |   1
-rw-r--r--  mm/debug-pagealloc.c  | 129
-rw-r--r--  mm/filemap.c          |  21
-rw-r--r--  mm/filemap_xip.c      |   4
-rw-r--r--  mm/highmem.c          |  45
-rw-r--r--  mm/hugetlb.c          |   6
-rw-r--r--  mm/internal.h         |   8
-rw-r--r--  mm/memcontrol.c       | 687
-rw-r--r--  mm/memory.c           |  33
-rw-r--r--  mm/migrate.c          |  10
-rw-r--r--  mm/mmap.c             |   3
-rw-r--r--  mm/nommu.c            |  52
-rw-r--r--  mm/oom_kill.c         |  13
-rw-r--r--  mm/page-writeback.c   |  42
-rw-r--r--  mm/page_alloc.c       |  31
-rw-r--r--  mm/page_cgroup.c      |  37
-rw-r--r--  mm/readahead.c        |  40
-rw-r--r--  mm/shmem.c            |   3
-rw-r--r--  mm/slab.c             |   3
-rw-r--r--  mm/sparse.c           |   4
-rw-r--r--  mm/swap.c             |  27
-rw-r--r--  mm/truncate.c         |  10
-rw-r--r--  mm/util.c             |  30
-rw-r--r--  mm/vmalloc.c          |  19
-rw-r--r--  mm/vmscan.c           | 107
-rw-r--r--  mm/vmstat.c           |  16
28 files changed, 933 insertions, 483 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index a5b77811fdf2..b53427ad30a3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -206,7 +206,6 @@ config VIRT_TO_BUS
206config UNEVICTABLE_LRU 206config UNEVICTABLE_LRU
207 bool "Add LRU list to track non-evictable pages" 207 bool "Add LRU list to track non-evictable pages"
208 default y 208 default y
209 depends on MMU
210 help 209 help
211 Keeps unevictable pages off of the active and inactive pageout 210 Keeps unevictable pages off of the active and inactive pageout
212 lists, so kswapd will not waste CPU time or have its balancing 211 lists, so kswapd will not waste CPU time or have its balancing
@@ -214,5 +213,13 @@ config UNEVICTABLE_LRU
214 will use one page flag and increase the code size a little, 213 will use one page flag and increase the code size a little,
215 say Y unless you know what you are doing. 214 say Y unless you know what you are doing.
216 215
216config HAVE_MLOCK
217 bool
218 default y if MMU=y
219
220config HAVE_MLOCKED_PAGE_BIT
221 bool
222 default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y
223
217config MMU_NOTIFIER 224config MMU_NOTIFIER
218 bool 225 bool
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
new file mode 100644
index 000000000000..bb01e298f260
--- /dev/null
+++ b/mm/Kconfig.debug
@@ -0,0 +1,26 @@
1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC
5 ---help---
6 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types
8 of memory corruptions.
9
10config WANT_PAGE_DEBUG_FLAGS
11 bool
12
13config PAGE_POISONING
14 bool "Debug page memory allocations"
15 depends on DEBUG_KERNEL && !ARCH_SUPPORTS_DEBUG_PAGEALLOC
16 depends on !HIBERNATION
17 select DEBUG_PAGEALLOC
18 select WANT_PAGE_DEBUG_FLAGS
19 help
20 Fill the pages with poison patterns after free_pages() and verify
21 the patterns before alloc_pages(). This results in a large slowdown,
22 but helps to find certain types of memory corruptions.
23
24 This option cannot be enabled with hibernation. Otherwise, it will
25 report spurious memory corruption because the free pages are not
26 saved to the suspend image.
diff --git a/mm/Makefile b/mm/Makefile
index 818569b68f46..ec73c68b6015 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o 24obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
25obj-$(CONFIG_SLOB) += slob.o 25obj-$(CONFIG_SLOB) += slob.o
26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o 26obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
27obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
27obj-$(CONFIG_SLAB) += slab.o 28obj-$(CONFIG_SLAB) += slab.o
28obj-$(CONFIG_SLUB) += slub.o 29obj-$(CONFIG_SLUB) += slub.o
29obj-$(CONFIG_FAILSLAB) += failslab.o 30obj-$(CONFIG_FAILSLAB) += failslab.o
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
new file mode 100644
index 000000000000..a1e3324de2b5
--- /dev/null
+++ b/mm/debug-pagealloc.c
@@ -0,0 +1,129 @@
1#include <linux/kernel.h>
2#include <linux/mm.h>
3#include <linux/page-debug-flags.h>
4#include <linux/poison.h>
5
6static inline void set_page_poison(struct page *page)
7{
8 __set_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
9}
10
11static inline void clear_page_poison(struct page *page)
12{
13 __clear_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
14}
15
16static inline bool page_poison(struct page *page)
17{
18 return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
19}
20
21static void poison_highpage(struct page *page)
22{
23 /*
24 * Page poisoning for highmem pages is not implemented.
25 *
26 * This can be called from interrupt contexts.
27 * So we need to create a new kmap_atomic slot for this
28 * application and it will need interrupt protection.
29 */
30}
31
32static void poison_page(struct page *page)
33{
34 void *addr;
35
36 if (PageHighMem(page)) {
37 poison_highpage(page);
38 return;
39 }
40 set_page_poison(page);
41 addr = page_address(page);
42 memset(addr, PAGE_POISON, PAGE_SIZE);
43}
44
45static void poison_pages(struct page *page, int n)
46{
47 int i;
48
49 for (i = 0; i < n; i++)
50 poison_page(page + i);
51}
52
53static bool single_bit_flip(unsigned char a, unsigned char b)
54{
55 unsigned char error = a ^ b;
56
57 return error && !(error & (error - 1));
58}
59
60static void check_poison_mem(unsigned char *mem, size_t bytes)
61{
62 unsigned char *start;
63 unsigned char *end;
64
65 for (start = mem; start < mem + bytes; start++) {
66 if (*start != PAGE_POISON)
67 break;
68 }
69 if (start == mem + bytes)
70 return;
71
72 for (end = mem + bytes - 1; end > start; end--) {
73 if (*end != PAGE_POISON)
74 break;
75 }
76
77 if (!printk_ratelimit())
78 return;
79 else if (start == end && single_bit_flip(*start, PAGE_POISON))
80 printk(KERN_ERR "pagealloc: single bit error\n");
81 else
82 printk(KERN_ERR "pagealloc: memory corruption\n");
83
84 print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, start,
85 end - start + 1, 1);
86 dump_stack();
87}
88
89static void unpoison_highpage(struct page *page)
90{
91 /*
92 * See comment in poison_highpage().
93 * Highmem pages should not be poisoned for now
94 */
95 BUG_ON(page_poison(page));
96}
97
98static void unpoison_page(struct page *page)
99{
100 if (PageHighMem(page)) {
101 unpoison_highpage(page);
102 return;
103 }
104 if (page_poison(page)) {
105 void *addr = page_address(page);
106
107 check_poison_mem(addr, PAGE_SIZE);
108 clear_page_poison(page);
109 }
110}
111
112static void unpoison_pages(struct page *page, int n)
113{
114 int i;
115
116 for (i = 0; i < n; i++)
117 unpoison_page(page + i);
118}
119
120void kernel_map_pages(struct page *page, int numpages, int enable)
121{
122 if (!debug_pagealloc_enabled)
123 return;
124
125 if (enable)
126 unpoison_pages(page, numpages);
127 else
128 poison_pages(page, numpages);
129}
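
Aside (not part of the patch): the most interesting piece of the new checker is single_bit_flip(), which classifies corruption by testing whether the XOR of the found byte and the poison pattern is a nonzero power of two, i.e. exactly one bit differs. Below is a minimal stand-alone C sketch of the reporting logic in check_poison_mem(); the poison byte is an arbitrary stand-in for the kernel's PAGE_POISON from <linux/poison.h>, and plain printfs replace the ratelimited printk and hex dump.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define POISON 0xaa	/* stand-in for the kernel's PAGE_POISON pattern */

/* True when a and b differ in exactly one bit: a ^ b is a power of two. */
static bool single_bit_flip(unsigned char a, unsigned char b)
{
	unsigned char error = a ^ b;

	return error && !(error & (error - 1));
}

static void check_poison_mem(const unsigned char *mem, size_t bytes)
{
	const unsigned char *start, *end;

	/* Find the first byte that no longer carries the poison pattern. */
	for (start = mem; start < mem + bytes; start++)
		if (*start != POISON)
			break;
	if (start == mem + bytes) {
		puts("pattern intact");
		return;
	}

	/* Find the last damaged byte, so the report covers the whole range. */
	for (end = mem + bytes - 1; end > start; end--)
		if (*end != POISON)
			break;

	if (start == end && single_bit_flip(*start, POISON))
		puts("single bit error");
	else
		printf("memory corruption (%zu byte range)\n",
		       (size_t)(end - start) + 1);
}

int main(void)
{
	unsigned char buf[64];

	memset(buf, POISON, sizeof(buf));
	check_poison_mem(buf, sizeof(buf));	/* pattern intact */

	buf[10] ^= 0x04;			/* flip a single bit */
	check_poison_mem(buf, sizeof(buf));	/* single bit error */

	buf[20] = 0x00;				/* clobber a whole byte */
	check_poison_mem(buf, sizeof(buf));	/* memory corruption (11 byte range) */
	return 0;
}

The (error & (error - 1)) test clears the lowest set bit, so it is zero only when a single bit was set; that lets the report distinguish a likely hardware bit flip from software scribbling over freed pages.
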
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..fc11974f2bee 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -565,6 +565,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
565EXPORT_SYMBOL(wait_on_page_bit); 565EXPORT_SYMBOL(wait_on_page_bit);
566 566
567/** 567/**
568 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
569 * @page - Page defining the wait queue of interest
570 * @waiter - Waiter to add to the queue
571 *
572 * Add an arbitrary @waiter to the wait queue for the nominated @page.
573 */
574void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
575{
576 wait_queue_head_t *q = page_waitqueue(page);
577 unsigned long flags;
578
579 spin_lock_irqsave(&q->lock, flags);
580 __add_wait_queue(q, waiter);
581 spin_unlock_irqrestore(&q->lock, flags);
582}
583EXPORT_SYMBOL_GPL(add_page_wait_queue);
584
585/**
568 * unlock_page - unlock a locked page 586 * unlock_page - unlock a locked page
569 * @page: the page 587 * @page: the page
570 * 588 *
@@ -2463,6 +2481,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
2463 * (presumably at page->private). If the release was successful, return `1'. 2481 * (presumably at page->private). If the release was successful, return `1'.
2464 * Otherwise return zero. 2482 * Otherwise return zero.
2465 * 2483 *
2484 * This may also be called if PG_fscache is set on a page, indicating that the
2485 * page is known to the local caching routines.
2486 *
2466 * The @gfp_mask argument specifies whether I/O may be performed to release 2487 * The @gfp_mask argument specifies whether I/O may be performed to release
2467 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2488 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2468 * 2489 *
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0c04615651b7..427dfe3ce78c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping,
89 } 89 }
90 } 90 }
91 nr = nr - offset; 91 nr = nr - offset;
92 if (nr > len) 92 if (nr > len - copied)
93 nr = len; 93 nr = len - copied;
94 94
95 error = mapping->a_ops->get_xip_mem(mapping, index, 0, 95 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
96 &xip_mem, &xip_pfn); 96 &xip_mem, &xip_pfn);
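
Aside (not part of the patch): the clamp fix above matters because nr is the size of the current chunk while len is the size of the whole request; clamping nr against len instead of the remaining len - copied lets a read that spans several chunks copy more than the caller asked for. A small stand-alone C sketch of the loop shape, with the chunk size shrunk to 8 bytes and all names illustrative rather than the kernel's:

#include <stdio.h>
#include <stddef.h>

#define CHUNK 8	/* stand-in for the per-page amount of data */

static size_t read_in_chunks(size_t len, int buggy)
{
	size_t copied = 0;

	while (copied < len) {
		size_t nr = CHUNK;		/* bytes available in this chunk */

		if (buggy) {
			if (nr > len)		/* old test: clamp to the total length */
				nr = len;
		} else {
			if (nr > len - copied)	/* fixed: clamp to what is still wanted */
				nr = len - copied;
		}
		copied += nr;			/* pretend we copied nr bytes out */
	}
	return copied;
}

int main(void)
{
	size_t len = 12;	/* ask for one and a half chunks */

	printf("requested %zu, old code copies %zu, fixed code copies %zu\n",
	       len, read_in_chunks(len, 1), read_in_chunks(len, 0));
	/* requested 12, old code copies 16, fixed code copies 12 */
	return 0;
}

With the old test the final pass still copies a full chunk, overrunning the request (and the caller's buffer) by CHUNK minus the remainder; with the fix the last chunk is trimmed to exactly what is left.
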
diff --git a/mm/highmem.c b/mm/highmem.c
index 910198037bf5..68eb1d9b63fa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -422,3 +422,48 @@ void __init page_address_init(void)
422} 422}
423 423
424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ 424#endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */
425
426#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT)
427
428void debug_kmap_atomic(enum km_type type)
429{
430 static unsigned warn_count = 10;
431
432 if (unlikely(warn_count == 0))
433 return;
434
435 if (unlikely(in_interrupt())) {
436 if (in_irq()) {
437 if (type != KM_IRQ0 && type != KM_IRQ1 &&
438 type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ &&
439 type != KM_BOUNCE_READ) {
440 WARN_ON(1);
441 warn_count--;
442 }
443 } else if (!irqs_disabled()) { /* softirq */
444 if (type != KM_IRQ0 && type != KM_IRQ1 &&
445 type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 &&
446 type != KM_SKB_SUNRPC_DATA &&
447 type != KM_SKB_DATA_SOFTIRQ &&
448 type != KM_BOUNCE_READ) {
449 WARN_ON(1);
450 warn_count--;
451 }
452 }
453 }
454
455 if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ ||
456 type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) {
457 if (!irqs_disabled()) {
458 WARN_ON(1);
459 warn_count--;
460 }
461 } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) {
462 if (irq_count() == 0 && !irqs_disabled()) {
463 WARN_ON(1);
464 warn_count--;
465 }
466 }
467}
468
469#endif
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 107da3d809a8..28c655ba9353 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -918,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h,
918 * an instantiated the change should be committed via vma_commit_reservation. 918 * an instantiated the change should be committed via vma_commit_reservation.
919 * No action is required on failure. 919 * No action is required on failure.
920 */ 920 */
921static int vma_needs_reservation(struct hstate *h, 921static long vma_needs_reservation(struct hstate *h,
922 struct vm_area_struct *vma, unsigned long addr) 922 struct vm_area_struct *vma, unsigned long addr)
923{ 923{
924 struct address_space *mapping = vma->vm_file->f_mapping; 924 struct address_space *mapping = vma->vm_file->f_mapping;
@@ -933,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h,
933 return 1; 933 return 1;
934 934
935 } else { 935 } else {
936 int err; 936 long err;
937 pgoff_t idx = vma_hugecache_offset(h, vma, addr); 937 pgoff_t idx = vma_hugecache_offset(h, vma, addr);
938 struct resv_map *reservations = vma_resv_map(vma); 938 struct resv_map *reservations = vma_resv_map(vma);
939 939
@@ -969,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma,
969 struct page *page; 969 struct page *page;
970 struct address_space *mapping = vma->vm_file->f_mapping; 970 struct address_space *mapping = vma->vm_file->f_mapping;
971 struct inode *inode = mapping->host; 971 struct inode *inode = mapping->host;
972 unsigned int chg; 972 long chg;
973 973
974 /* 974 /*
975 * Processes that did not create the mapping will have no reserves and 975 * Processes that did not create the mapping will have no reserves and
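
Aside (not part of the patch): the int-to-long changes above look like an error-propagation fix: vma_needs_reservation() has a failure path, and a negative error stored in an unsigned variable can never satisfy a later "is it negative?" check, so the failure would be treated as a huge positive reservation count. A minimal stand-alone C illustration of that failure mode; the helper name and error value are stand-ins, not the kernel's code:

#include <stdio.h>

/* Stand-in for vma_needs_reservation(): a count on success, an error when not. */
static long needs_reservation(int fail)
{
	return fail ? -12 : 1;	/* -12 plays the role of -ENOMEM */
}

int main(void)
{
	unsigned int chg_old = needs_reservation(1);	/* -12 wraps to a huge value */
	long chg_new = needs_reservation(1);

	/* With an unsigned type this comparison is always false (compilers warn). */
	if (chg_old < 0)
		puts("old type: error detected");
	else
		printf("old type: error missed, chg looks like %u\n", chg_old);

	if (chg_new < 0)
		puts("new type: error detected");
	return 0;
}
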
diff --git a/mm/internal.h b/mm/internal.h
index 478223b73a2a..987bb03fbdd8 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -63,6 +63,7 @@ static inline unsigned long page_order(struct page *page)
63 return page_private(page); 63 return page_private(page);
64} 64}
65 65
66#ifdef CONFIG_HAVE_MLOCK
66extern long mlock_vma_pages_range(struct vm_area_struct *vma, 67extern long mlock_vma_pages_range(struct vm_area_struct *vma,
67 unsigned long start, unsigned long end); 68 unsigned long start, unsigned long end);
68extern void munlock_vma_pages_range(struct vm_area_struct *vma, 69extern void munlock_vma_pages_range(struct vm_area_struct *vma,
@@ -71,6 +72,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
71{ 72{
72 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); 73 munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
73} 74}
75#endif
74 76
75#ifdef CONFIG_UNEVICTABLE_LRU 77#ifdef CONFIG_UNEVICTABLE_LRU
76/* 78/*
@@ -90,7 +92,7 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old)
90} 92}
91#endif 93#endif
92 94
93#ifdef CONFIG_UNEVICTABLE_LRU 95#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT
94/* 96/*
95 * Called only in fault path via page_evictable() for a new page 97 * Called only in fault path via page_evictable() for a new page
96 * to determine if it's being mapped into a LOCKED vma. 98 * to determine if it's being mapped into a LOCKED vma.
@@ -165,7 +167,7 @@ static inline void free_page_mlock(struct page *page)
165 } 167 }
166} 168}
167 169
168#else /* CONFIG_UNEVICTABLE_LRU */ 170#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
169static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) 171static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p)
170{ 172{
171 return 0; 173 return 0;
@@ -175,7 +177,7 @@ static inline void mlock_vma_page(struct page *page) { }
175static inline void mlock_migrate_page(struct page *new, struct page *old) { } 177static inline void mlock_migrate_page(struct page *new, struct page *old) { }
176static inline void free_page_mlock(struct page *page) { } 178static inline void free_page_mlock(struct page *page) { }
177 179
178#endif /* CONFIG_UNEVICTABLE_LRU */ 180#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */
179 181
180/* 182/*
181 * Return the mem_map entry representing the 'offset' subpage within 183 * Return the mem_map entry representing the 'offset' subpage within
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..2fc6d6c48238 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h>
30#include <linux/mutex.h> 31#include <linux/mutex.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/swap.h> 33#include <linux/swap.h>
@@ -95,6 +96,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
95 return ret; 96 return ret;
96} 97}
97 98
99static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
100{
101 s64 ret;
102
103 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
104 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
105 return ret;
106}
107
98/* 108/*
99 * per-zone information in memory controller. 109 * per-zone information in memory controller.
100 */ 110 */
@@ -154,9 +164,9 @@ struct mem_cgroup {
154 164
155 /* 165 /*
156 * While reclaiming in a hiearchy, we cache the last child we 166 * While reclaiming in a hiearchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex 167 * reclaimed from.
158 */ 168 */
159 struct mem_cgroup *last_scanned_child; 169 int last_scanned_child;
160 /* 170 /*
161 * Should the accounting and control be hierarchical, per subtree? 171 * Should the accounting and control be hierarchical, per subtree?
162 */ 172 */
@@ -247,7 +257,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
247 return mem_cgroup_zoneinfo(mem, nid, zid); 257 return mem_cgroup_zoneinfo(mem, nid, zid);
248} 258}
249 259
250static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 260static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
251 enum lru_list idx) 261 enum lru_list idx)
252{ 262{
253 int nid, zid; 263 int nid, zid;
@@ -286,6 +296,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 296static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
287{ 297{
288 struct mem_cgroup *mem = NULL; 298 struct mem_cgroup *mem = NULL;
299
300 if (!mm)
301 return NULL;
289 /* 302 /*
290 * Because we have no locks, mm->owner's may be being moved to other 303 * Because we have no locks, mm->owner's may be being moved to other
291 * cgroup. We use css_tryget() here even if this looks 304 * cgroup. We use css_tryget() here even if this looks
@@ -308,6 +321,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
308 return css_is_removed(&mem->css); 321 return css_is_removed(&mem->css);
309} 322}
310 323
324
325/*
326 * Call callback function against all cgroup under hierarchy tree.
327 */
328static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
329 int (*func)(struct mem_cgroup *, void *))
330{
331 int found, ret, nextid;
332 struct cgroup_subsys_state *css;
333 struct mem_cgroup *mem;
334
335 if (!root->use_hierarchy)
336 return (*func)(root, data);
337
338 nextid = 1;
339 do {
340 ret = 0;
341 mem = NULL;
342
343 rcu_read_lock();
344 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
345 &found);
346 if (css && css_tryget(css))
347 mem = container_of(css, struct mem_cgroup, css);
348 rcu_read_unlock();
349
350 if (mem) {
351 ret = (*func)(mem, data);
352 css_put(&mem->css);
353 }
354 nextid = found + 1;
355 } while (!ret && css);
356
357 return ret;
358}
359
311/* 360/*
312 * Following LRU functions are allowed to be used without PCG_LOCK. 361 * Following LRU functions are allowed to be used without PCG_LOCK.
313 * Operations are called by routine of global LRU independently from memcg. 362 * Operations are called by routine of global LRU independently from memcg.
@@ -441,31 +490,24 @@ void mem_cgroup_move_lists(struct page *page,
441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 490int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
442{ 491{
443 int ret; 492 int ret;
493 struct mem_cgroup *curr = NULL;
444 494
445 task_lock(task); 495 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem); 496 rcu_read_lock();
497 curr = try_get_mem_cgroup_from_mm(task->mm);
498 rcu_read_unlock();
447 task_unlock(task); 499 task_unlock(task);
500 if (!curr)
501 return 0;
502 if (curr->use_hierarchy)
503 ret = css_is_ancestor(&curr->css, &mem->css);
504 else
505 ret = (curr == mem);
506 css_put(&curr->css);
448 return ret; 507 return ret;
449} 508}
450 509
451/* 510/*
452 * Calculate mapped_ratio under memory controller. This will be used in
453 * vmscan.c for deteremining we have to reclaim mapped pages.
454 */
455int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
456{
457 long total, rss;
458
459 /*
460 * usage is recorded in bytes. But, here, we assume the number of
461 * physical pages can be represented by "long" on any arch.
462 */
463 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
464 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
465 return (int)((rss * 100L) / total);
466}
467
468/*
469 * prev_priority control...this will be used in memory reclaim path. 511 * prev_priority control...this will be used in memory reclaim path.
470 */ 512 */
471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 513int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -501,8 +543,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
501 unsigned long gb; 543 unsigned long gb;
502 unsigned long inactive_ratio; 544 unsigned long inactive_ratio;
503 545
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 546 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 547 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
506 548
507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 549 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb) 550 if (gb)
@@ -629,172 +671,202 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
629#define mem_cgroup_from_res_counter(counter, member) \ 671#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member) 672 container_of(counter, struct mem_cgroup, member)
631 673
632/* 674static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
633 * This routine finds the DFS walk successor. This routine should be
634 * called with hierarchy_mutex held
635 */
636static struct mem_cgroup *
637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638{ 675{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 676 if (do_swap_account) {
640 677 if (res_counter_check_under_limit(&mem->res) &&
641 curr_cgroup = curr->css.cgroup; 678 res_counter_check_under_limit(&mem->memsw))
642 root_cgroup = root_mem->css.cgroup; 679 return true;
680 } else
681 if (res_counter_check_under_limit(&mem->res))
682 return true;
683 return false;
684}
643 685
644 if (!list_empty(&curr_cgroup->children)) { 686static unsigned int get_swappiness(struct mem_cgroup *memcg)
645 /* 687{
646 * Walk down to children 688 struct cgroup *cgrp = memcg->css.cgroup;
647 */ 689 unsigned int swappiness;
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653 690
654visit_parent: 691 /* root ? */
655 if (curr_cgroup == root_cgroup) { 692 if (cgrp->parent == NULL)
656 /* caller handles NULL case */ 693 return vm_swappiness;
657 curr = NULL;
658 goto done;
659 }
660 694
661 /* 695 spin_lock(&memcg->reclaim_param_lock);
662 * Goto next sibling 696 swappiness = memcg->swappiness;
663 */ 697 spin_unlock(&memcg->reclaim_param_lock);
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670 698
671 /* 699 return swappiness;
672 * Go up to next parent and next parent's sibling if need be 700}
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676 701
677done: 702static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
678 return curr; 703{
704 int *val = data;
705 (*val)++;
706 return 0;
679} 707}
680 708
681/* 709/**
682 * Visit the first child (need not be the first child as per the ordering 710 * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
683 * of the cgroup list, since we track last_scanned_child) of @mem and use 711 * @memcg: The memory cgroup that went over limit
684 * that to reclaim free pages from. 712 * @p: Task that is going to be killed
713 *
714 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
715 * enabled
685 */ 716 */
686static struct mem_cgroup * 717void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{ 718{
689 struct cgroup *cgroup; 719 struct cgroup *task_cgrp;
690 struct mem_cgroup *orig, *next; 720 struct cgroup *mem_cgrp;
691 bool obsolete;
692
693 /* 721 /*
694 * Scan all children under the mem_cgroup mem 722 * Need a buffer in BSS, can't rely on allocations. The code relies
723 * on the assumption that OOM is serialized for memory controller.
724 * If this assumption is broken, revisit this code.
695 */ 725 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 726 static char memcg_name[PATH_MAX];
727 int ret;
728
729 if (!memcg)
730 return;
697 731
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700 732
701 if (list_empty(&root_mem->css.cgroup->children)) { 733 rcu_read_lock();
734
735 mem_cgrp = memcg->css.cgroup;
736 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
737
738 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
739 if (ret < 0) {
702 /* 740 /*
703 * root_mem might have children before and last_scanned_child 741 * Unfortunately, we are unable to convert to a useful name
704 * may point to one of them. We put it later. 742 * But we'll still print out the usage information
705 */ 743 */
706 if (orig) 744 rcu_read_unlock();
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done; 745 goto done;
710 } 746 }
747 rcu_read_unlock();
711 748
712 if (!orig || obsolete) { 749 printk(KERN_INFO "Task in %s killed", memcg_name);
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718 750
751 rcu_read_lock();
752 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
753 if (ret < 0) {
754 rcu_read_unlock();
755 goto done;
756 }
757 rcu_read_unlock();
758
759 /*
760 * Continues from above, so we don't need a KERN_ level
761 */
762 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
719done: 763done:
720 if (next) 764
721 mem_cgroup_get(next); 765 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
722 root_mem->last_scanned_child = next; 766 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
723 if (orig) 767 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
724 mem_cgroup_put(orig); 768 res_counter_read_u64(&memcg->res, RES_FAILCNT));
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 769 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
726 return (next) ? next : root_mem; 770 "failcnt %llu\n",
771 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
772 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
773 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
727} 774}
728 775
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 776/*
777 * This function returns the number of memcg under hierarchy tree. Returns
778 * 1(self count) if no children.
779 */
780static int mem_cgroup_count_children(struct mem_cgroup *mem)
730{ 781{
731 if (do_swap_account) { 782 int num = 0;
732 if (res_counter_check_under_limit(&mem->res) && 783 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
733 res_counter_check_under_limit(&mem->memsw)) 784 return num;
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739} 785}
740 786
741static unsigned int get_swappiness(struct mem_cgroup *memcg) 787/*
788 * Visit the first child (need not be the first child as per the ordering
789 * of the cgroup list, since we track last_scanned_child) of @mem and use
790 * that to reclaim free pages from.
791 */
792static struct mem_cgroup *
793mem_cgroup_select_victim(struct mem_cgroup *root_mem)
742{ 794{
743 struct cgroup *cgrp = memcg->css.cgroup; 795 struct mem_cgroup *ret = NULL;
744 unsigned int swappiness; 796 struct cgroup_subsys_state *css;
797 int nextid, found;
745 798
746 /* root ? */ 799 if (!root_mem->use_hierarchy) {
747 if (cgrp->parent == NULL) 800 css_get(&root_mem->css);
748 return vm_swappiness; 801 ret = root_mem;
802 }
749 803
750 spin_lock(&memcg->reclaim_param_lock); 804 while (!ret) {
751 swappiness = memcg->swappiness; 805 rcu_read_lock();
752 spin_unlock(&memcg->reclaim_param_lock); 806 nextid = root_mem->last_scanned_child + 1;
807 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
808 &found);
809 if (css && css_tryget(css))
810 ret = container_of(css, struct mem_cgroup, css);
811
812 rcu_read_unlock();
813 /* Updates scanning parameter */
814 spin_lock(&root_mem->reclaim_param_lock);
815 if (!css) {
816 /* this means start scan from ID:1 */
817 root_mem->last_scanned_child = 0;
818 } else
819 root_mem->last_scanned_child = found;
820 spin_unlock(&root_mem->reclaim_param_lock);
821 }
753 822
754 return swappiness; 823 return ret;
755} 824}
756 825
757/* 826/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the 827 * Scan the hierarchy if needed to reclaim memory. We remember the last child
759 * last child we reclaimed from, so that we don't end up penalizing 828 * we reclaimed from, so that we don't end up penalizing one child extensively
760 * one child extensively based on its position in the children list. 829 * based on its position in the children list.
761 * 830 *
762 * root_mem is the original ancestor that we've been reclaim from. 831 * root_mem is the original ancestor that we've been reclaim from.
832 *
833 * We give up and return to the caller when we visit root_mem twice.
834 * (other groups can be removed while we're walking....)
835 *
836 * If shrink==true, to avoid freeing too much, this returns immediately.
763 */ 837 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 838static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap) 839 gfp_t gfp_mask, bool noswap, bool shrink)
766{ 840{
767 struct mem_cgroup *next_mem; 841 struct mem_cgroup *victim;
768 int ret = 0; 842 int ret, total = 0;
769 843 int loop = 0;
770 /* 844
771 * Reclaim unconditionally and don't check for return value. 845 while (loop < 2) {
772 * We need to reclaim in the current group and down the tree. 846 victim = mem_cgroup_select_victim(root_mem);
773 * One might think about checking for children before reclaiming, 847 if (victim == root_mem)
774 * but there might be left over accounting, even after children 848 loop++;
775 * have left. 849 if (!mem_cgroup_local_usage(&victim->stat)) {
776 */ 850 /* this cgroup's local usage == 0 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 851 css_put(&victim->css);
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue; 852 continue;
790 } 853 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 854 /* we use swappiness of local cgroup */
792 get_swappiness(next_mem)); 855 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
856 get_swappiness(victim));
857 css_put(&victim->css);
858 /*
859 * At shrinking usage, we can't check we should stop here or
860 * reclaim more. That depends on the caller. last_scanned_child
861 * will work enough for keeping fairness under tree.
862 */
863 if (shrink)
864 return ret;
865 total += ret;
793 if (mem_cgroup_check_under_limit(root_mem)) 866 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */ 867 return 1 + total;
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 } 868 }
797 return ret; 869 return total;
798} 870}
799 871
800bool mem_cgroup_oom_called(struct task_struct *task) 872bool mem_cgroup_oom_called(struct task_struct *task)
@@ -813,6 +885,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
813 rcu_read_unlock(); 885 rcu_read_unlock();
814 return ret; 886 return ret;
815} 887}
888
889static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
890{
891 mem->last_oom_jiffies = jiffies;
892 return 0;
893}
894
895static void record_last_oom(struct mem_cgroup *mem)
896{
897 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
898}
899
900
816/* 901/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true, 902 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked. 903 * oom-killer can be invoked.
@@ -875,7 +960,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
875 goto nomem; 960 goto nomem;
876 961
877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 962 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 noswap); 963 noswap, false);
879 if (ret) 964 if (ret)
880 continue; 965 continue;
881 966
@@ -895,7 +980,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
895 mutex_lock(&memcg_tasklist); 980 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 981 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist); 982 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies; 983 record_last_oom(mem_over_limit);
899 } 984 }
900 goto nomem; 985 goto nomem;
901 } 986 }
@@ -906,20 +991,55 @@ nomem:
906 return -ENOMEM; 991 return -ENOMEM;
907} 992}
908 993
994
995/*
996 * A helper function to get mem_cgroup from ID. must be called under
997 * rcu_read_lock(). The caller must check css_is_removed() or some if
998 * it's concern. (dropping refcnt from swap can be called against removed
999 * memcg.)
1000 */
1001static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1002{
1003 struct cgroup_subsys_state *css;
1004
1005 /* ID 0 is unused ID */
1006 if (!id)
1007 return NULL;
1008 css = css_lookup(&mem_cgroup_subsys, id);
1009 if (!css)
1010 return NULL;
1011 return container_of(css, struct mem_cgroup, css);
1012}
1013
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1014static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{ 1015{
911 struct mem_cgroup *mem; 1016 struct mem_cgroup *mem;
1017 struct page_cgroup *pc;
1018 unsigned short id;
912 swp_entry_t ent; 1019 swp_entry_t ent;
913 1020
1021 VM_BUG_ON(!PageLocked(page));
1022
914 if (!PageSwapCache(page)) 1023 if (!PageSwapCache(page))
915 return NULL; 1024 return NULL;
916 1025
917 ent.val = page_private(page); 1026 pc = lookup_page_cgroup(page);
918 mem = lookup_swap_cgroup(ent); 1027 /*
919 if (!mem) 1028 * Used bit of swapcache is solid under page lock.
920 return NULL; 1029 */
921 if (!css_tryget(&mem->css)) 1030 if (PageCgroupUsed(pc)) {
922 return NULL; 1031 mem = pc->mem_cgroup;
1032 if (mem && !css_tryget(&mem->css))
1033 mem = NULL;
1034 } else {
1035 ent.val = page_private(page);
1036 id = lookup_swap_cgroup(ent);
1037 rcu_read_lock();
1038 mem = mem_cgroup_lookup(id);
1039 if (mem && !css_tryget(&mem->css))
1040 mem = NULL;
1041 rcu_read_unlock();
1042 }
923 return mem; 1043 return mem;
924} 1044}
925 1045
@@ -1118,6 +1238,10 @@ int mem_cgroup_newpage_charge(struct page *page,
1118 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1238 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1119} 1239}
1120 1240
1241static void
1242__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1243 enum charge_type ctype);
1244
1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1245int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1122 gfp_t gfp_mask) 1246 gfp_t gfp_mask)
1123{ 1247{
@@ -1154,16 +1278,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1154 unlock_page_cgroup(pc); 1278 unlock_page_cgroup(pc);
1155 } 1279 }
1156 1280
1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem)) 1281 if (unlikely(!mm && !mem))
1168 mm = &init_mm; 1282 mm = &init_mm;
1169 1283
@@ -1171,22 +1285,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1171 return mem_cgroup_charge_common(page, mm, gfp_mask, 1285 return mem_cgroup_charge_common(page, mm, gfp_mask,
1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1286 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1173 1287
1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1288 /* shmem */
1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1289 if (PageSwapCache(page)) {
1176 if (mem) 1290 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1177 css_put(&mem->css); 1291 if (!ret)
1178 if (PageSwapCache(page)) 1292 __mem_cgroup_commit_charge_swapin(page, mem,
1179 mem_cgroup_lru_add_after_commit_swapcache(page); 1293 MEM_CGROUP_CHARGE_TYPE_SHMEM);
1294 } else
1295 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1296 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1180 1297
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret; 1298 return ret;
1191} 1299}
1192 1300
@@ -1229,7 +1337,9 @@ charge_cur_mm:
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1337 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230} 1338}
1231 1339
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1340static void
1341__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1342 enum charge_type ctype)
1233{ 1343{
1234 struct page_cgroup *pc; 1344 struct page_cgroup *pc;
1235 1345
@@ -1239,7 +1349,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1239 return; 1349 return;
1240 pc = lookup_page_cgroup(page); 1350 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page); 1351 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1352 __mem_cgroup_commit_charge(ptr, pc, ctype);
1243 mem_cgroup_lru_add_after_commit_swapcache(page); 1353 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /* 1354 /*
1245 * Now swap is on-memory. This means this page may be 1355 * Now swap is on-memory. This means this page may be
@@ -1250,18 +1360,32 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1250 */ 1360 */
1251 if (do_swap_account && PageSwapCache(page)) { 1361 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)}; 1362 swp_entry_t ent = {.val = page_private(page)};
1363 unsigned short id;
1253 struct mem_cgroup *memcg; 1364 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL); 1365
1366 id = swap_cgroup_record(ent, 0);
1367 rcu_read_lock();
1368 memcg = mem_cgroup_lookup(id);
1255 if (memcg) { 1369 if (memcg) {
1370 /*
1371 * This recorded memcg can be obsolete one. So, avoid
1372 * calling css_tryget
1373 */
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1374 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 mem_cgroup_put(memcg); 1375 mem_cgroup_put(memcg);
1258 } 1376 }
1259 1377 rcu_read_unlock();
1260 } 1378 }
1261 /* add this page(page_cgroup) to the LRU we want. */ 1379 /* add this page(page_cgroup) to the LRU we want. */
1262 1380
1263} 1381}
1264 1382
1383void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1384{
1385 __mem_cgroup_commit_charge_swapin(page, ptr,
1386 MEM_CGROUP_CHARGE_TYPE_MAPPED);
1387}
1388
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1389void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266{ 1390{
1267 if (mem_cgroup_disabled()) 1391 if (mem_cgroup_disabled())
@@ -1324,8 +1448,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1448 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1449 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1450 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false); 1451 mem_cgroup_charge_statistics(mem, pc, false);
1452
1329 ClearPageCgroupUsed(pc); 1453 ClearPageCgroupUsed(pc);
1330 /* 1454 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1455 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -1377,7 +1501,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT); 1501 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */ 1502 /* record memcg information */
1379 if (do_swap_account && memcg) { 1503 if (do_swap_account && memcg) {
1380 swap_cgroup_record(ent, memcg); 1504 swap_cgroup_record(ent, css_id(&memcg->css));
1381 mem_cgroup_get(memcg); 1505 mem_cgroup_get(memcg);
1382 } 1506 }
1383 if (memcg) 1507 if (memcg)
@@ -1392,15 +1516,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1392void mem_cgroup_uncharge_swap(swp_entry_t ent) 1516void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{ 1517{
1394 struct mem_cgroup *memcg; 1518 struct mem_cgroup *memcg;
1519 unsigned short id;
1395 1520
1396 if (!do_swap_account) 1521 if (!do_swap_account)
1397 return; 1522 return;
1398 1523
1399 memcg = swap_cgroup_record(ent, NULL); 1524 id = swap_cgroup_record(ent, 0);
1525 rcu_read_lock();
1526 memcg = mem_cgroup_lookup(id);
1400 if (memcg) { 1527 if (memcg) {
1528 /*
1529 * We uncharge this because swap is freed.
1530 * This memcg can be obsolete one. We avoid calling css_tryget
1531 */
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1532 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 mem_cgroup_put(memcg); 1533 mem_cgroup_put(memcg);
1403 } 1534 }
1535 rcu_read_unlock();
1404} 1536}
1405#endif 1537#endif
1406 1538
@@ -1508,7 +1640,8 @@ int mem_cgroup_shrink_usage(struct page *page,
1508 return 0; 1640 return 0;
1509 1641
1510 do { 1642 do {
1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); 1643 progress = mem_cgroup_hierarchical_reclaim(mem,
1644 gfp_mask, true, false);
1512 progress += mem_cgroup_check_under_limit(mem); 1645 progress += mem_cgroup_check_under_limit(mem);
1513 } while (!progress && --retry); 1646 } while (!progress && --retry);
1514 1647
@@ -1523,11 +1656,21 @@ static DEFINE_MUTEX(set_limit_mutex);
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 1656static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val) 1657 unsigned long long val)
1525{ 1658{
1526 1659 int retry_count;
1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1528 int progress; 1660 int progress;
1529 u64 memswlimit; 1661 u64 memswlimit;
1530 int ret = 0; 1662 int ret = 0;
1663 int children = mem_cgroup_count_children(memcg);
1664 u64 curusage, oldusage;
1665
1666 /*
1667 * For keeping hierarchical_reclaim simple, how long we should retry
1668 * depends on the caller. We set our retry count to be a function
1669 * of the number of children we should visit in this loop.
1670 */
1671 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
1672
1673 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1531 1674
1532 while (retry_count) { 1675 while (retry_count) {
1533 if (signal_pending(current)) { 1676 if (signal_pending(current)) {
@@ -1553,8 +1696,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1553 break; 1696 break;
1554 1697
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 1698 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 false); 1699 false, true);
1557 if (!progress) retry_count--; 1700 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1701 /* Usage is reduced ? */
1702 if (curusage >= oldusage)
1703 retry_count--;
1704 else
1705 oldusage = curusage;
1558 } 1706 }
1559 1707
1560 return ret; 1708 return ret;
@@ -1563,13 +1711,16 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1711int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val) 1712 unsigned long long val)
1565{ 1713{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1714 int retry_count;
1567 u64 memlimit, oldusage, curusage; 1715 u64 memlimit, oldusage, curusage;
1568 int ret; 1716 int children = mem_cgroup_count_children(memcg);
1717 int ret = -EBUSY;
1569 1718
1570 if (!do_swap_account) 1719 if (!do_swap_account)
1571 return -EINVAL; 1720 return -EINVAL;
1572 1721 /* see mem_cgroup_resize_res_limit */
1722 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1723 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1573 while (retry_count) { 1724 while (retry_count) {
1574 if (signal_pending(current)) { 1725 if (signal_pending(current)) {
1575 ret = -EINTR; 1726 ret = -EINTR;
@@ -1593,11 +1744,13 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1593 if (!ret) 1744 if (!ret)
1594 break; 1745 break;
1595 1746
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1747 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1748 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1749 /* Usage is reduced ? */
1599 if (curusage >= oldusage) 1750 if (curusage >= oldusage)
1600 retry_count--; 1751 retry_count--;
1752 else
1753 oldusage = curusage;
1601 } 1754 }
1602 return ret; 1755 return ret;
1603} 1756}
@@ -1893,54 +2046,90 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1893 return 0; 2046 return 0;
1894} 2047}
1895 2048
1896static const struct mem_cgroup_stat_desc { 2049
1897 const char *msg; 2050/* For read statistics */
1898 u64 unit; 2051enum {
1899} mem_cgroup_stat_desc[] = { 2052 MCS_CACHE,
1900 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 2053 MCS_RSS,
1901 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 2054 MCS_PGPGIN,
1902 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 2055 MCS_PGPGOUT,
1903 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 2056 MCS_INACTIVE_ANON,
2057 MCS_ACTIVE_ANON,
2058 MCS_INACTIVE_FILE,
2059 MCS_ACTIVE_FILE,
2060 MCS_UNEVICTABLE,
2061 NR_MCS_STAT,
2062};
2063
2064struct mcs_total_stat {
2065 s64 stat[NR_MCS_STAT];
2066};
2067
2068struct {
2069 char *local_name;
2070 char *total_name;
2071} memcg_stat_strings[NR_MCS_STAT] = {
2072 {"cache", "total_cache"},
2073 {"rss", "total_rss"},
2074 {"pgpgin", "total_pgpgin"},
2075 {"pgpgout", "total_pgpgout"},
2076 {"inactive_anon", "total_inactive_anon"},
2077 {"active_anon", "total_active_anon"},
2078 {"inactive_file", "total_inactive_file"},
2079 {"active_file", "total_active_file"},
2080 {"unevictable", "total_unevictable"}
1904}; 2081};
1905 2082
2083
2084static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2085{
2086 struct mcs_total_stat *s = data;
2087 s64 val;
2088
2089 /* per cpu stat */
2090 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2091 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2092 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2093 s->stat[MCS_RSS] += val * PAGE_SIZE;
2094 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2095 s->stat[MCS_PGPGIN] += val;
2096 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2097 s->stat[MCS_PGPGOUT] += val;
2098
2099 /* per zone stat */
2100 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2101 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2102 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2103 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2104 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2105 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2106 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2107 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2108 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2109 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2110 return 0;
2111}
2112
2113static void
2114mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2115{
2116 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2117}
2118
1906static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2119static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1907 struct cgroup_map_cb *cb) 2120 struct cgroup_map_cb *cb)
1908{ 2121{
1909 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2122 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1910 struct mem_cgroup_stat *stat = &mem_cont->stat; 2123 struct mcs_total_stat mystat;
1911 int i; 2124 int i;
1912 2125
1913 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 2126 memset(&mystat, 0, sizeof(mystat));
1914 s64 val; 2127 mem_cgroup_get_local_stat(mem_cont, &mystat);
1915 2128
1916 val = mem_cgroup_read_stat(stat, i); 2129 for (i = 0; i < NR_MCS_STAT; i++)
1917 val *= mem_cgroup_stat_desc[i].unit; 2130 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
1918 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1919 }
1920 /* showing # of active pages */
1921 {
1922 unsigned long active_anon, inactive_anon;
1923 unsigned long active_file, inactive_file;
1924 unsigned long unevictable;
1925
1926 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1927 LRU_INACTIVE_ANON);
1928 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1929 LRU_ACTIVE_ANON);
1930 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1931 LRU_INACTIVE_FILE);
1932 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1933 LRU_ACTIVE_FILE);
1934 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1935 LRU_UNEVICTABLE);
1936
1937 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1938 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1939 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1940 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1942 2131
1943 } 2132 /* Hierarchical information */
1944 { 2133 {
1945 unsigned long long limit, memsw_limit; 2134 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2135 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
@@ -1949,6 +2138,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2138 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 } 2139 }
1951 2140
2141 memset(&mystat, 0, sizeof(mystat));
2142 mem_cgroup_get_total_stat(mem_cont, &mystat);
2143 for (i = 0; i < NR_MCS_STAT; i++)
2144 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2145
2146
1952#ifdef CONFIG_DEBUG_VM 2147#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2148 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954 2149
@@ -2178,6 +2373,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2178{ 2373{
2179 int node; 2374 int node;
2180 2375
2376 free_css_id(&mem_cgroup_subsys, &mem->css);
2377
2181 for_each_node_state(node, N_POSSIBLE) 2378 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node); 2379 free_mem_cgroup_per_zone_info(mem, node);
2183 2380
@@ -2228,11 +2425,12 @@ static struct cgroup_subsys_state * __ref
2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2425mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229{ 2426{
2230 struct mem_cgroup *mem, *parent; 2427 struct mem_cgroup *mem, *parent;
2428 long error = -ENOMEM;
2231 int node; 2429 int node;
2232 2430
2233 mem = mem_cgroup_alloc(); 2431 mem = mem_cgroup_alloc();
2234 if (!mem) 2432 if (!mem)
2235 return ERR_PTR(-ENOMEM); 2433 return ERR_PTR(error);
2236 2434
2237 for_each_node_state(node, N_POSSIBLE) 2435 for_each_node_state(node, N_POSSIBLE)
2238 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2436 if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -2260,7 +2458,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2260 res_counter_init(&mem->res, NULL); 2458 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL); 2459 res_counter_init(&mem->memsw, NULL);
2262 } 2460 }
2263 mem->last_scanned_child = NULL; 2461 mem->last_scanned_child = 0;
2264 spin_lock_init(&mem->reclaim_param_lock); 2462 spin_lock_init(&mem->reclaim_param_lock);
2265 2463
2266 if (parent) 2464 if (parent)
@@ -2269,26 +2467,22 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2269 return &mem->css; 2467 return &mem->css;
2270free_out: 2468free_out:
2271 __mem_cgroup_free(mem); 2469 __mem_cgroup_free(mem);
2272 return ERR_PTR(-ENOMEM); 2470 return ERR_PTR(error);
2273} 2471}
2274 2472
2275static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2473static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 struct cgroup *cont) 2474 struct cgroup *cont)
2277{ 2475{
2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2476 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 mem_cgroup_force_empty(mem, false); 2477
2478 return mem_cgroup_force_empty(mem, false);
2280} 2479}
2281 2480
2282static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2481static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2283 struct cgroup *cont) 2482 struct cgroup *cont)
2284{ 2483{
2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2484 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287 2485
2288 if (last_scanned_child) {
2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 mem_cgroup_put(last_scanned_child);
2291 }
2292 mem_cgroup_put(mem); 2486 mem_cgroup_put(mem);
2293} 2487}
2294 2488
@@ -2327,6 +2521,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
2327 .populate = mem_cgroup_populate, 2521 .populate = mem_cgroup_populate,
2328 .attach = mem_cgroup_move_task, 2522 .attach = mem_cgroup_move_task,
2329 .early_init = 0, 2523 .early_init = 0,
2524 .use_id = 1,
2330}; 2525};
2331 2526
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2527#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
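
Aside (not part of the patch): the rewritten reclaim walk in this file drops the pointer-based DFS (and its reliance on hierarchy_mutex) in favour of css IDs. last_scanned_child is now just an integer cursor, and mem_cgroup_select_victim() asks css_get_next() for the next live descendant at or after cursor + 1, resetting the cursor and retrying when the scan runs off the end. A toy stand-alone C model of that round robin, ignoring locking, refcounting and the non-hierarchical case; the names and IDs here are illustrative only:

#include <stdio.h>
#include <stdbool.h>

#define MAX_ID 8

/* Which group IDs currently exist.  ID 1 is the root and is always present,
 * so the retry loop below always terminates; ID 2 models a removed child. */
static bool present[MAX_ID + 1] = {
	[1] = true,
	[3] = true,
	[5] = true,
};

static int last_scanned_child;	/* 0 means "start the next scan from ID 1" */

/* Stand-in for css_get_next(): the smallest present ID >= nextid, or 0. */
static int get_next(int nextid)
{
	for (int id = nextid; id <= MAX_ID; id++)
		if (present[id])
			return id;
	return 0;
}

static int select_victim(void)
{
	int id = 0;

	while (!id) {
		id = get_next(last_scanned_child + 1);
		/* 0 resets the cursor so the retry restarts from ID 1;
		 * otherwise remember where this scan stopped. */
		last_scanned_child = id;
	}
	return id;
}

int main(void)
{
	for (int i = 0; i < 6; i++)
		printf("reclaim from group %d\n", select_victim());
	/* prints groups 1, 3, 5, 1, 3, 5: pressure rotates over live groups */
	return 0;
}

Because the cursor is an ID rather than a pointer to a child that may already have been freed, groups can be created or removed while reclaim is in progress and the rotation still spreads pressure across whatever exists at the time.
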
diff --git a/mm/memory.c b/mm/memory.c
index 2032ad2fc34b..cf6873e91c6a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1151,6 +1151,11 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
1151 if ((flags & FOLL_WRITE) && 1151 if ((flags & FOLL_WRITE) &&
1152 !pte_dirty(pte) && !PageDirty(page)) 1152 !pte_dirty(pte) && !PageDirty(page))
1153 set_page_dirty(page); 1153 set_page_dirty(page);
1154 /*
1155 * pte_mkyoung() would be more correct here, but atomic care
1156 * is needed to avoid losing the dirty bit: it is easier to use
1157 * mark_page_accessed().
1158 */
1154 mark_page_accessed(page); 1159 mark_page_accessed(page);
1155 } 1160 }
1156unlock: 1161unlock:
@@ -1940,6 +1945,15 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1940 * get_user_pages(.write=1, .force=1). 1945 * get_user_pages(.write=1, .force=1).
1941 */ 1946 */
1942 if (vma->vm_ops && vma->vm_ops->page_mkwrite) { 1947 if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
1948 struct vm_fault vmf;
1949 int tmp;
1950
1951 vmf.virtual_address = (void __user *)(address &
1952 PAGE_MASK);
1953 vmf.pgoff = old_page->index;
1954 vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
1955 vmf.page = old_page;
1956
1943 /* 1957 /*
1944 * Notify the address space that the page is about to 1958 * Notify the address space that the page is about to
1945 * become writable so that it can prohibit this or wait 1959 * become writable so that it can prohibit this or wait
@@ -1951,8 +1965,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
1951 page_cache_get(old_page); 1965 page_cache_get(old_page);
1952 pte_unmap_unlock(page_table, ptl); 1966 pte_unmap_unlock(page_table, ptl);
1953 1967
1954 if (vma->vm_ops->page_mkwrite(vma, old_page) < 0) 1968 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
1969 if (unlikely(tmp &
1970 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
1971 ret = tmp;
1955 goto unwritable_page; 1972 goto unwritable_page;
1973 }
1956 1974
1957 /* 1975 /*
1958 * Since we dropped the lock we need to revalidate 1976 * Since we dropped the lock we need to revalidate
@@ -2101,7 +2119,7 @@ oom:
2101 2119
2102unwritable_page: 2120unwritable_page:
2103 page_cache_release(old_page); 2121 page_cache_release(old_page);
2104 return VM_FAULT_SIGBUS; 2122 return ret;
2105} 2123}
2106 2124
2107/* 2125/*
@@ -2435,8 +2453,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
2435 count_vm_event(PGMAJFAULT); 2453 count_vm_event(PGMAJFAULT);
2436 } 2454 }
2437 2455
2438 mark_page_accessed(page);
2439
2440 lock_page(page); 2456 lock_page(page);
2441 delayacct_clear_flag(DELAYACCT_PF_SWAPIN); 2457 delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
2442 2458
@@ -2645,9 +2661,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
2645 * to become writable 2661 * to become writable
2646 */ 2662 */
2647 if (vma->vm_ops->page_mkwrite) { 2663 if (vma->vm_ops->page_mkwrite) {
2664 int tmp;
2665
2648 unlock_page(page); 2666 unlock_page(page);
2649 if (vma->vm_ops->page_mkwrite(vma, page) < 0) { 2667 vmf.flags |= FAULT_FLAG_MKWRITE;
2650 ret = VM_FAULT_SIGBUS; 2668 tmp = vma->vm_ops->page_mkwrite(vma, &vmf);
2669 if (unlikely(tmp &
2670 (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) {
2671 ret = tmp;
2651 anon = 1; /* no anon but release vmf.page */ 2672 anon = 1; /* no anon but release vmf.page */
2652 goto out_unlocked; 2673 goto out_unlocked;
2653 } 2674 }
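
The mm/memory.c hunks above move ->page_mkwrite() to the struct vm_fault calling convention: the handler now receives the faulting address, pgoff, FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE and the page through vmf, and reports failure via VM_FAULT_* bits (checked against VM_FAULT_ERROR | VM_FAULT_NOPAGE) instead of a bare negative return. A minimal sketch of a handler under the new convention; the name and placeholder body are illustrative assumptions, not taken from this patch:

	static int example_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
	{
		struct page *page = vmf->page;	/* page about to become writable */

		lock_page(page);
		/* filesystem-specific work: reserve blocks, start a transaction,
		 * account the page as dirtyable, etc. */
		unlock_page(page);

		return 0;	/* or VM_FAULT_SIGBUS / VM_FAULT_NOPAGE on failure */
	}
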
diff --git a/mm/migrate.c b/mm/migrate.c
index a9eff3f092f6..068655d8f883 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -250,7 +250,7 @@ out:
250 * The number of remaining references must be: 250 * The number of remaining references must be:
251 * 1 for anonymous pages without a mapping 251 * 1 for anonymous pages without a mapping
252 * 2 for pages with a mapping 252 * 2 for pages with a mapping
253 * 3 for pages with a mapping and PagePrivate set. 253 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
254 */ 254 */
255static int migrate_page_move_mapping(struct address_space *mapping, 255static int migrate_page_move_mapping(struct address_space *mapping,
256 struct page *newpage, struct page *page) 256 struct page *newpage, struct page *page)
@@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 270 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 271 page_index(page));
272 272
273 expected_count = 2 + !!PagePrivate(page); 273 expected_count = 2 + !!page_has_private(page);
274 if (page_count(page) != expected_count || 274 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 275 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 276 spin_unlock_irq(&mapping->tree_lock);
@@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page);
386 386
387/* 387/*
388 * Common logic to directly migrate a single page suitable for 388 * Common logic to directly migrate a single page suitable for
389 * pages that do not use PagePrivate. 389 * pages that do not use PagePrivate/PagePrivate2.
390 * 390 *
391 * Pages are locked upon entry and exit. 391 * Pages are locked upon entry and exit.
392 */ 392 */
@@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping,
522 * Buffers may be managed in a filesystem specific way. 522 * Buffers may be managed in a filesystem specific way.
523 * We must have no buffers or drop them. 523 * We must have no buffers or drop them.
524 */ 524 */
525 if (PagePrivate(page) && 525 if (page_has_private(page) &&
526 !try_to_release_page(page, GFP_KERNEL)) 526 !try_to_release_page(page, GFP_KERNEL))
527 return -EAGAIN; 527 return -EAGAIN;
528 528
@@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
655 * free the metadata, so the page can be freed. 655 * free the metadata, so the page can be freed.
656 */ 656 */
657 if (!page->mapping) { 657 if (!page->mapping) {
658 if (!PageAnon(page) && PagePrivate(page)) { 658 if (!PageAnon(page) && page_has_private(page)) {
659 /* 659 /*
660 * Go direct to try_to_free_buffers() here because 660 * Go direct to try_to_free_buffers() here because
661 * a) that's what try_to_release_page() would do anyway 661 * a) that's what try_to_release_page() would do anyway
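
These migrate.c hunks switch PagePrivate() tests to page_has_private() so that pages whose private state is flagged with the newer PG_private_2 bit (used by FS-Cache, hence the PG_fscache mention later in readahead.c) are handled like PG_private pages. As an assumption about the helper rather than a quote of its definition, the test behaves roughly like:

	/* sketch only; the real helper lives in the page-flags headers */
	static inline int page_has_private_sketch(struct page *page)
	{
		return PagePrivate(page) || PagePrivate2(page);
	}
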
diff --git a/mm/mmap.c b/mm/mmap.c
index 1abb9185a686..4a3841186c11 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm)
2481 */ 2481 */
2482void __init mmap_init(void) 2482void __init mmap_init(void)
2483{ 2483{
2484 vm_area_cachep = kmem_cache_create("vm_area_struct",
2485 sizeof(struct vm_area_struct), 0,
2486 SLAB_PANIC, NULL);
2487} 2484}
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fcf47d449b4..72eda4aee2cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ 69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72atomic_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
73 73
74EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
75EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
@@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
463 */ 463 */
464void __init mmap_init(void) 464void __init mmap_init(void)
465{ 465{
466 vm_region_jar = kmem_cache_create("vm_region_jar", 466 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
467 sizeof(struct vm_region), 0,
468 SLAB_PANIC, NULL);
469 vm_area_cachep = kmem_cache_create("vm_area_struct",
470 sizeof(struct vm_area_struct), 0,
471 SLAB_PANIC, NULL);
472} 467}
473 468
474/* 469/*
@@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void)
486 return; 481 return;
487 482
488 last = rb_entry(lastp, struct vm_region, vm_rb); 483 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start)) 484 BUG_ON(unlikely(last->vm_end <= last->vm_start));
490 BUG(); 485 BUG_ON(unlikely(last->vm_top < last->vm_end));
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493 486
494 while ((p = rb_next(lastp))) { 487 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb); 488 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb); 489 last = rb_entry(lastp, struct vm_region, vm_rb);
497 490
498 if (unlikely(region->vm_end <= region->vm_start)) 491 BUG_ON(unlikely(region->vm_end <= region->vm_start));
499 BUG(); 492 BUG_ON(unlikely(region->vm_top < region->vm_end));
500 if (unlikely(region->vm_top < region->vm_end)) 493 BUG_ON(unlikely(region->vm_start < last->vm_top));
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504 494
505 lastp = p; 495 lastp = p;
506 } 496 }
507} 497}
508#else 498#else
509#define validate_nommu_regions() do {} while(0) 499static void validate_nommu_regions(void)
500{
501}
510#endif 502#endif
511 503
512/* 504/*
@@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to)
563 struct page *page = virt_to_page(from); 555 struct page *page = virt_to_page(from);
564 556
565 kdebug("- free %lx", from); 557 kdebug("- free %lx", from);
566 atomic_dec(&mmap_pages_allocated); 558 atomic_long_dec(&mmap_pages_allocated);
567 if (page_count(page) != 1) 559 if (page_count(page) != 1)
568 kdebug("free page %p [%d]", page, page_count(page)); 560 kdebug("free page %p: refcount not one: %d",
561 page, page_count(page));
569 put_page(page); 562 put_page(page);
570 } 563 }
571} 564}
572 565
573/* 566/*
574 * release a reference to a region 567 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases 568 * - the caller must hold the region semaphore for writing, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top 569 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start 570 * will equal vm_start
578 */ 571 */
@@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1096 goto enomem; 1089 goto enomem;
1097 1090
1098 total = 1 << order; 1091 total = 1 << order;
1099 atomic_add(total, &mmap_pages_allocated); 1092 atomic_long_add(total, &mmap_pages_allocated);
1100 1093
1101 point = rlen >> PAGE_SHIFT; 1094 point = rlen >> PAGE_SHIFT;
1102 1095
@@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1107 order = ilog2(total - point); 1100 order = ilog2(total - point);
1108 n = 1 << order; 1101 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1102 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated); 1103 atomic_long_sub(n, &mmap_pages_allocated);
1111 total -= n; 1104 total -= n;
1112 set_page_refcounted(pages + total); 1105 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order); 1106 __free_pages(pages + total, order);
@@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1536 /* find the first potentially overlapping VMA */ 1529 /* find the first potentially overlapping VMA */
1537 vma = find_vma(mm, start); 1530 vma = find_vma(mm, start);
1538 if (!vma) { 1531 if (!vma) {
1539 printk(KERN_WARNING 1532 static int limit = 0;
1540 "munmap of memory not mmapped by process %d (%s):" 1533 if (limit < 5) {
1541 " 0x%lx-0x%lx\n", 1534 printk(KERN_WARNING
1542 current->pid, current->comm, start, start + len - 1); 1535 "munmap of memory not mmapped by process %d"
1536 " (%s): 0x%lx-0x%lx\n",
1537 current->pid, current->comm,
1538 start, start + len - 1);
1539 limit++;
1540 }
1543 return -EINVAL; 1541 return -EINVAL;
1544 } 1542 }
1545 1543
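
In the nommu.c hunks above, mmap_init() switches to the KMEM_CACHE() convenience macro and validate_nommu_regions() uses BUG_ON() directly. As an assumption about the macro's expansion (not part of this patch), the line vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); is roughly equivalent to:

	vm_region_jar = kmem_cache_create("vm_region",
					  sizeof(struct vm_region),
					  __alignof__(struct vm_region),
					  SLAB_PANIC, NULL);
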
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 40ba05061a4f..2f3166e308d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -55,7 +55,7 @@ static DEFINE_SPINLOCK(zone_scan_lock);
55 55
56unsigned long badness(struct task_struct *p, unsigned long uptime) 56unsigned long badness(struct task_struct *p, unsigned long uptime)
57{ 57{
58 unsigned long points, cpu_time, run_time, s; 58 unsigned long points, cpu_time, run_time;
59 struct mm_struct *mm; 59 struct mm_struct *mm;
60 struct task_struct *child; 60 struct task_struct *child;
61 61
@@ -110,12 +110,10 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
110 else 110 else
111 run_time = 0; 111 run_time = 0;
112 112
113 s = int_sqrt(cpu_time); 113 if (cpu_time)
114 if (s) 114 points /= int_sqrt(cpu_time);
115 points /= s; 115 if (run_time)
116 s = int_sqrt(int_sqrt(run_time)); 116 points /= int_sqrt(int_sqrt(run_time));
117 if (s)
118 points /= s;
119 117
120 /* 118 /*
121 * Niced processes are most likely less important, so double 119 * Niced processes are most likely less important, so double
@@ -396,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
396 cpuset_print_task_mems_allowed(current); 394 cpuset_print_task_mems_allowed(current);
397 task_unlock(current); 395 task_unlock(current);
398 dump_stack(); 396 dump_stack();
397 mem_cgroup_print_oom_info(mem, current);
399 show_mem(); 398 show_mem();
400 if (sysctl_oom_dump_tasks) 399 if (sysctl_oom_dump_tasks)
401 dump_tasks(mem); 400 dump_tasks(mem);
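
The badness() cleanup above drops the temporary and divides the score directly by int_sqrt(cpu_time) and int_sqrt(int_sqrt(run_time)), so long-running, CPU-hungry tasks are scaled down the most. A quick worked example with illustrative numbers (not from the patch):

	unsigned long points   = 1000000;
	unsigned long cpu_time = 10000;		/* int_sqrt(10000) == 100 */
	unsigned long run_time = 65536;		/* int_sqrt(int_sqrt(65536)) == 16 */

	points /= int_sqrt(cpu_time);		/* 1000000 / 100 -> 10000 */
	points /= int_sqrt(int_sqrt(run_time));	/* 10000 / 16 -> 625 */
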
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 40ca7cdb653e..30351f0063ac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 20;
92unsigned long vm_dirty_bytes; 92unsigned long vm_dirty_bytes;
93 93
94/* 94/*
95 * The interval between `kupdate'-style writebacks, in jiffies 95 * The interval between `kupdate'-style writebacks
96 */ 96 */
97int dirty_writeback_interval = 5 * HZ; 97unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
98 98
99/* 99/*
100 * The longest number of jiffies for which data is allowed to remain dirty 100 * The longest time for which data is allowed to remain dirty
101 */ 101 */
102int dirty_expire_interval = 30 * HZ; 102unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */
103 103
104/* 104/*
105 * Flag that makes the machine dump writes/reads and block dirtyings. 105 * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
770 770
771 sync_supers(); 771 sync_supers();
772 772
773 oldest_jif = jiffies - dirty_expire_interval; 773 oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
774 start_jif = jiffies; 774 start_jif = jiffies;
775 next_jif = start_jif + dirty_writeback_interval; 775 next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
776 nr_to_write = global_page_state(NR_FILE_DIRTY) + 776 nr_to_write = global_page_state(NR_FILE_DIRTY) +
777 global_page_state(NR_UNSTABLE_NFS) + 777 global_page_state(NR_UNSTABLE_NFS) +
778 (inodes_stat.nr_inodes - inodes_stat.nr_unused); 778 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
801int dirty_writeback_centisecs_handler(ctl_table *table, int write, 801int dirty_writeback_centisecs_handler(ctl_table *table, int write,
802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos) 802 struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
803{ 803{
804 proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos); 804 proc_dointvec(table, write, file, buffer, length, ppos);
805 if (dirty_writeback_interval) 805 if (dirty_writeback_interval)
806 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 806 mod_timer(&wb_timer, jiffies +
807 msecs_to_jiffies(dirty_writeback_interval * 10));
807 else 808 else
808 del_timer(&wb_timer); 809 del_timer(&wb_timer);
809 return 0; 810 return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
905{ 906{
906 int shift; 907 int shift;
907 908
908 mod_timer(&wb_timer, jiffies + dirty_writeback_interval); 909 mod_timer(&wb_timer,
910 jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
909 writeback_set_ratelimit(); 911 writeback_set_ratelimit();
910 register_cpu_notifier(&ratelimit_nb); 912 register_cpu_notifier(&ratelimit_nb);
911 913
@@ -1198,6 +1200,20 @@ int __set_page_dirty_no_writeback(struct page *page)
1198} 1200}
1199 1201
1200/* 1202/*
1203 * Helper function for set_page_dirty family.
1204 * NOTE: This relies on being atomic wrt interrupts.
1205 */
1206void account_page_dirtied(struct page *page, struct address_space *mapping)
1207{
1208 if (mapping_cap_account_dirty(mapping)) {
1209 __inc_zone_page_state(page, NR_FILE_DIRTY);
1210 __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
1211 task_dirty_inc(current);
1212 task_io_account_write(PAGE_CACHE_SIZE);
1213 }
1214}
1215
1216/*
1201 * For address_spaces which do not use buffers. Just tag the page as dirty in 1217 * For address_spaces which do not use buffers. Just tag the page as dirty in
1202 * its radix tree. 1218 * its radix tree.
1203 * 1219 *
@@ -1226,13 +1242,7 @@ int __set_page_dirty_nobuffers(struct page *page)
1226 if (mapping2) { /* Race with truncate? */ 1242 if (mapping2) { /* Race with truncate? */
1227 BUG_ON(mapping2 != mapping); 1243 BUG_ON(mapping2 != mapping);
1228 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page)); 1244 WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
1229 if (mapping_cap_account_dirty(mapping)) { 1245 account_page_dirtied(page, mapping);
1230 __inc_zone_page_state(page, NR_FILE_DIRTY);
1231 __inc_bdi_stat(mapping->backing_dev_info,
1232 BDI_RECLAIMABLE);
1233 task_dirty_inc(current);
1234 task_io_account_write(PAGE_CACHE_SIZE);
1235 }
1236 radix_tree_tag_set(&mapping->page_tree, 1246 radix_tree_tag_set(&mapping->page_tree,
1237 page_index(page), PAGECACHE_TAG_DIRTY); 1247 page_index(page), PAGECACHE_TAG_DIRTY);
1238 } 1248 }
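
These page-writeback.c hunks change dirty_writeback_interval and dirty_expire_interval from jiffies to centiseconds, so the stored values no longer depend on HZ and the sysctl handler can use plain proc_dointvec(); each user converts at the point of use via centiseconds -> milliseconds -> jiffies. A small sketch of the conversion, matching the pattern used above:

	/* 5 * 100 centiseconds == 5000 ms == 5 s, independent of CONFIG_HZ */
	unsigned int dirty_writeback_interval = 5 * 100;	/* centiseconds */
	unsigned long delay = msecs_to_jiffies(dirty_writeback_interval * 10);

	mod_timer(&wb_timer, jiffies + delay);
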
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f87e0d8df5a7..e2f26991fff1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -331,7 +331,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
331 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
332 struct page *p = page + i; 332 struct page *p = page + i;
333 333
334 if (unlikely(!PageTail(p) | (p->first_page != page))) { 334 if (unlikely(!PageTail(p) || (p->first_page != page))) {
335 bad_page(page); 335 bad_page(page);
336 bad++; 336 bad++;
337 } 337 }
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
922 unsigned long flags; 922 unsigned long flags;
923 struct zone *zone; 923 struct zone *zone;
924 924
925 for_each_zone(zone) { 925 for_each_populated_zone(zone) {
926 struct per_cpu_pageset *pset; 926 struct per_cpu_pageset *pset;
927 struct per_cpu_pages *pcp; 927 struct per_cpu_pages *pcp;
928 928
929 if (!populated_zone(zone))
930 continue;
931
932 pset = zone_pcp(zone, cpu); 929 pset = zone_pcp(zone, cpu);
933 930
934 pcp = &pset->pcp; 931 pcp = &pset->pcp;
@@ -1585,7 +1582,8 @@ nofail_alloc:
1585 reclaim_state.reclaimed_slab = 0; 1582 reclaim_state.reclaimed_slab = 0;
1586 p->reclaim_state = &reclaim_state; 1583 p->reclaim_state = &reclaim_state;
1587 1584
1588 did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); 1585 did_some_progress = try_to_free_pages(zonelist, order,
1586 gfp_mask, nodemask);
1589 1587
1590 p->reclaim_state = NULL; 1588 p->reclaim_state = NULL;
1591 lockdep_clear_current_reclaim_state(); 1589 lockdep_clear_current_reclaim_state();
@@ -1879,10 +1877,7 @@ void show_free_areas(void)
1879 int cpu; 1877 int cpu;
1880 struct zone *zone; 1878 struct zone *zone;
1881 1879
1882 for_each_zone(zone) { 1880 for_each_populated_zone(zone) {
1883 if (!populated_zone(zone))
1884 continue;
1885
1886 show_node(zone); 1881 show_node(zone);
1887 printk("%s per-cpu:\n", zone->name); 1882 printk("%s per-cpu:\n", zone->name);
1888 1883
@@ -1922,12 +1917,9 @@ void show_free_areas(void)
1922 global_page_state(NR_PAGETABLE), 1917 global_page_state(NR_PAGETABLE),
1923 global_page_state(NR_BOUNCE)); 1918 global_page_state(NR_BOUNCE));
1924 1919
1925 for_each_zone(zone) { 1920 for_each_populated_zone(zone) {
1926 int i; 1921 int i;
1927 1922
1928 if (!populated_zone(zone))
1929 continue;
1930
1931 show_node(zone); 1923 show_node(zone);
1932 printk("%s" 1924 printk("%s"
1933 " free:%lukB" 1925 " free:%lukB"
@@ -1967,12 +1959,9 @@ void show_free_areas(void)
1967 printk("\n"); 1959 printk("\n");
1968 } 1960 }
1969 1961
1970 for_each_zone(zone) { 1962 for_each_populated_zone(zone) {
1971 unsigned long nr[MAX_ORDER], flags, order, total = 0; 1963 unsigned long nr[MAX_ORDER], flags, order, total = 0;
1972 1964
1973 if (!populated_zone(zone))
1974 continue;
1975
1976 show_node(zone); 1965 show_node(zone);
1977 printk("%s: ", zone->name); 1966 printk("%s: ", zone->name);
1978 1967
@@ -2784,11 +2773,7 @@ static int __cpuinit process_zones(int cpu)
2784 2773
2785 node_set_state(node, N_CPU); /* this node has a cpu */ 2774 node_set_state(node, N_CPU); /* this node has a cpu */
2786 2775
2787 for_each_zone(zone) { 2776 for_each_populated_zone(zone) {
2788
2789 if (!populated_zone(zone))
2790 continue;
2791
2792 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), 2777 zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
2793 GFP_KERNEL, node); 2778 GFP_KERNEL, node);
2794 if (!zone_pcp(zone, cpu)) 2779 if (!zone_pcp(zone, cpu))
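
The page_alloc.c hunks above fold the repeated for_each_zone()/populated_zone() pattern into for_each_populated_zone(). As an assumption about how that iterator is built (for illustration; the real definition lives in the zone headers), it is roughly:

	#define for_each_populated_zone(zone)		\
		for_each_zone(zone)			\
			if (!populated_zone(zone))	\
				; /* skip empty zones */\
			else
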
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ceecfbb143fa..791905c991df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -285,12 +285,8 @@ struct swap_cgroup_ctrl {
285 285
286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
287 287
288/*
289 * This 8bytes seems big..maybe we can reduce this when we can use "id" for
290 * cgroup rather than pointer.
291 */
292struct swap_cgroup { 288struct swap_cgroup {
293 struct mem_cgroup *val; 289 unsigned short id;
294}; 290};
295#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 291#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
296#define SC_POS_MASK (SC_PER_PAGE - 1) 292#define SC_POS_MASK (SC_PER_PAGE - 1)
@@ -342,10 +338,10 @@ not_enough_page:
342 * @ent: swap entry to be recorded into 338 * @ent: swap entry to be recorded into
343 * @mem: mem_cgroup to be recorded 339 * @mem: mem_cgroup to be recorded
344 * 340 *
345 * Returns old value at success, NULL at failure. 341 * Returns old value at success, 0 at failure.
346 * (Of course, old value can be NULL.) 342 * (Of course, old value can be 0.)
347 */ 343 */
348struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) 344unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
349{ 345{
350 int type = swp_type(ent); 346 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent); 347 unsigned long offset = swp_offset(ent);
@@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
354 struct swap_cgroup_ctrl *ctrl; 350 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage; 351 struct page *mappage;
356 struct swap_cgroup *sc; 352 struct swap_cgroup *sc;
357 struct mem_cgroup *old; 353 unsigned short old;
358 354
359 if (!do_swap_account) 355 if (!do_swap_account)
360 return NULL; 356 return 0;
361 357
362 ctrl = &swap_cgroup_ctrl[type]; 358 ctrl = &swap_cgroup_ctrl[type];
363 359
364 mappage = ctrl->map[idx]; 360 mappage = ctrl->map[idx];
365 sc = page_address(mappage); 361 sc = page_address(mappage);
366 sc += pos; 362 sc += pos;
367 old = sc->val; 363 old = sc->id;
368 sc->val = mem; 364 sc->id = id;
369 365
370 return old; 366 return old;
371} 367}
@@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
374 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 370 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
375 * @ent: swap entry to be looked up. 371 * @ent: swap entry to be looked up.
376 * 372 *
377 * Returns pointer to mem_cgroup at success. NULL at failure. 373 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
378 */ 374 */
379struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) 375unsigned short lookup_swap_cgroup(swp_entry_t ent)
380{ 376{
381 int type = swp_type(ent); 377 int type = swp_type(ent);
382 unsigned long offset = swp_offset(ent); 378 unsigned long offset = swp_offset(ent);
@@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
385 struct swap_cgroup_ctrl *ctrl; 381 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage; 382 struct page *mappage;
387 struct swap_cgroup *sc; 383 struct swap_cgroup *sc;
388 struct mem_cgroup *ret; 384 unsigned short ret;
389 385
390 if (!do_swap_account) 386 if (!do_swap_account)
391 return NULL; 387 return 0;
392 388
393 ctrl = &swap_cgroup_ctrl[type]; 389 ctrl = &swap_cgroup_ctrl[type];
394 mappage = ctrl->map[idx]; 390 mappage = ctrl->map[idx];
395 sc = page_address(mappage); 391 sc = page_address(mappage);
396 sc += pos; 392 sc += pos;
397 ret = sc->val; 393 ret = sc->id;
398 return ret; 394 return ret;
399} 395}
400 396
@@ -430,13 +426,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
430 } 426 }
431 mutex_unlock(&swap_cgroup_mutex); 427 mutex_unlock(&swap_cgroup_mutex);
432 428
433 printk(KERN_INFO
434 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
435 " and %ld bytes to hold mem_cgroup pointers on swap\n",
436 array_size, length * PAGE_SIZE);
437 printk(KERN_INFO
438 "swap_cgroup can be disabled by noswapaccount boot option.\n");
439
440 return 0; 429 return 0;
441nomem: 430nomem:
442 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 431 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
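
With the swap_cgroup entry reduced from a mem_cgroup pointer to an unsigned short CSS ID, the per-entry cost drops from pointer size to two bytes. Assuming 4 KiB pages and 8-byte pointers, SC_PER_PAGE grows accordingly:

	/* entries per 4 KiB map page (illustrative arithmetic, 64-bit kernel) */
	/* old: PAGE_SIZE / sizeof(struct mem_cgroup *) == 4096 / 8 ==  512    */
	/* new: PAGE_SIZE / sizeof(unsigned short)      == 4096 / 2 == 2048    */
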
diff --git a/mm/readahead.c b/mm/readahead.c
index 9ce303d4b810..133b6d525513 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -31,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
31 31
32#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 32#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
33 33
34/*
35 * see if a page needs releasing upon read_cache_pages() failure
36 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
37 * before calling, such as the NFS fs marking pages that are cached locally
38 * on disk, thus we need to give the fs a chance to clean up in the event of
39 * an error
40 */
41static void read_cache_pages_invalidate_page(struct address_space *mapping,
42 struct page *page)
43{
44 if (page_has_private(page)) {
45 if (!trylock_page(page))
46 BUG();
47 page->mapping = mapping;
48 do_invalidatepage(page, 0);
49 page->mapping = NULL;
50 unlock_page(page);
51 }
52 page_cache_release(page);
53}
54
55/*
56 * release a list of pages, invalidating them first if need be
57 */
58static void read_cache_pages_invalidate_pages(struct address_space *mapping,
59 struct list_head *pages)
60{
61 struct page *victim;
62
63 while (!list_empty(pages)) {
64 victim = list_to_page(pages);
65 list_del(&victim->lru);
66 read_cache_pages_invalidate_page(mapping, victim);
67 }
68}
69
34/** 70/**
35 * read_cache_pages - populate an address space with some pages & start reads against them 71 * read_cache_pages - populate an address space with some pages & start reads against them
36 * @mapping: the address_space 72 * @mapping: the address_space
@@ -52,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
52 list_del(&page->lru); 88 list_del(&page->lru);
53 if (add_to_page_cache_lru(page, mapping, 89 if (add_to_page_cache_lru(page, mapping,
54 page->index, GFP_KERNEL)) { 90 page->index, GFP_KERNEL)) {
55 page_cache_release(page); 91 read_cache_pages_invalidate_page(mapping, page);
56 continue; 92 continue;
57 } 93 }
58 page_cache_release(page); 94 page_cache_release(page);
59 95
60 ret = filler(data, page); 96 ret = filler(data, page);
61 if (unlikely(ret)) { 97 if (unlikely(ret)) {
62 put_pages_list(pages); 98 read_cache_pages_invalidate_pages(mapping, pages);
63 break; 99 break;
64 } 100 }
65 task_io_account_read(PAGE_CACHE_SIZE); 101 task_io_account_read(PAGE_CACHE_SIZE);
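
The readahead.c hunks teach read_cache_pages() to run do_invalidatepage() on pages that still carry PG_private or PG_fscache before releasing them, both when add_to_page_cache_lru() fails and when the filler aborts the batch, so a filesystem such as NFS with local caching gets a chance to clean up. Callers are unchanged; a hypothetical caller (names and the filler body are assumptions, not from this patch) looks roughly like:

	static int example_filler(void *data, struct page *page)
	{
		struct file *file = data;

		/* start the actual read for this page; unlocked at I/O completion */
		return file->f_mapping->a_ops->readpage(file, page);
	}

	/* page_list: freshly allocated pages, one per index to be read */
	err = read_cache_pages(mapping, &page_list, example_filler, file);
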
diff --git a/mm/shmem.c b/mm/shmem.c
index 7ec78e24a30d..d94d2e9146bc 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1068,8 +1068,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
1068 swap_duplicate(swap); 1068 swap_duplicate(swap);
1069 BUG_ON(page_mapped(page)); 1069 BUG_ON(page_mapped(page));
1070 page_cache_release(page); /* pagecache ref */ 1070 page_cache_release(page); /* pagecache ref */
1071 set_page_dirty(page); 1071 swap_writepage(page, wbc);
1072 unlock_page(page);
1073 if (inode) { 1072 if (inode) {
1074 mutex_lock(&shmem_swaplist_mutex); 1073 mutex_lock(&shmem_swaplist_mutex);
1075 /* move instead of add in case we're racing */ 1074 /* move instead of add in case we're racing */
diff --git a/mm/slab.c b/mm/slab.c
index 59839d7ee5b3..d7d1414a5285 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3992,8 +3992,7 @@ static void cache_reap(struct work_struct *w)
3992 struct kmem_cache *searchp; 3992 struct kmem_cache *searchp;
3993 struct kmem_list3 *l3; 3993 struct kmem_list3 *l3;
3994 int node = numa_node_id(); 3994 int node = numa_node_id();
3995 struct delayed_work *work = 3995 struct delayed_work *work = to_delayed_work(w);
3996 container_of(w, struct delayed_work, work);
3997 3996
3998 if (!mutex_trylock(&cache_chain_mutex)) 3997 if (!mutex_trylock(&cache_chain_mutex))
3999 /* Give up. Setup the next iteration. */ 3998 /* Give up. Setup the next iteration. */
diff --git a/mm/sparse.c b/mm/sparse.c
index 083f5b63e7a8..da432d9f0ae8 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -164,9 +164,7 @@ void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn,
164 WARN_ON_ONCE(1); 164 WARN_ON_ONCE(1);
165 *start_pfn = max_sparsemem_pfn; 165 *start_pfn = max_sparsemem_pfn;
166 *end_pfn = max_sparsemem_pfn; 166 *end_pfn = max_sparsemem_pfn;
167 } 167 } else if (*end_pfn > max_sparsemem_pfn) {
168
169 if (*end_pfn > max_sparsemem_pfn) {
170 mminit_dprintk(MMINIT_WARNING, "pfnvalidation", 168 mminit_dprintk(MMINIT_WARNING, "pfnvalidation",
171 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n", 169 "End of range %lu -> %lu exceeds SPARSEMEM max %lu\n",
172 *start_pfn, *end_pfn, max_sparsemem_pfn); 170 *start_pfn, *end_pfn, max_sparsemem_pfn);
diff --git a/mm/swap.c b/mm/swap.c
index 8adb9feb61e1..bede23ce64ea 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec)
448 for (i = 0; i < pagevec_count(pvec); i++) { 448 for (i = 0; i < pagevec_count(pvec); i++) {
449 struct page *page = pvec->pages[i]; 449 struct page *page = pvec->pages[i];
450 450
451 if (PagePrivate(page) && trylock_page(page)) { 451 if (page_has_private(page) && trylock_page(page)) {
452 if (PagePrivate(page)) 452 if (page_has_private(page))
453 try_to_release_page(page, 0); 453 try_to_release_page(page, 0);
454 unlock_page(page); 454 unlock_page(page);
455 } 455 }
@@ -457,29 +457,6 @@ void pagevec_strip(struct pagevec *pvec)
457} 457}
458 458
459/** 459/**
460 * pagevec_swap_free - try to free swap space from the pages in a pagevec
461 * @pvec: pagevec with swapcache pages to free the swap space of
462 *
463 * The caller needs to hold an extra reference to each page and
464 * not hold the page lock on the pages. This function uses a
465 * trylock on the page lock so it may not always free the swap
466 * space associated with a page.
467 */
468void pagevec_swap_free(struct pagevec *pvec)
469{
470 int i;
471
472 for (i = 0; i < pagevec_count(pvec); i++) {
473 struct page *page = pvec->pages[i];
474
475 if (PageSwapCache(page) && trylock_page(page)) {
476 try_to_free_swap(page);
477 unlock_page(page);
478 }
479 }
480}
481
482/**
483 * pagevec_lookup - gang pagecache lookup 460 * pagevec_lookup - gang pagecache lookup
484 * @pvec: Where the resulting pages are placed 461 * @pvec: Where the resulting pages are placed
485 * @mapping: The address_space to search 462 * @mapping: The address_space to search
diff --git a/mm/truncate.c b/mm/truncate.c
index 1229211104f8..55206fab7b99 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
50static inline void truncate_partial_page(struct page *page, unsigned partial) 50static inline void truncate_partial_page(struct page *page, unsigned partial)
51{ 51{
52 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 52 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
53 if (PagePrivate(page)) 53 if (page_has_private(page))
54 do_invalidatepage(page, partial); 54 do_invalidatepage(page, partial);
55} 55}
56 56
@@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return;
101 101
102 if (PagePrivate(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
104 104
105 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
126 if (page->mapping != mapping) 126 if (page->mapping != mapping)
127 return 0; 127 return 0;
128 128
129 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (page_has_private(page) && !try_to_release_page(page, 0))
130 return 0; 130 return 0;
131 131
132 clear_page_mlock(page); 132 clear_page_mlock(page);
@@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
348 if (page->mapping != mapping) 348 if (page->mapping != mapping)
349 return 0; 349 return 0;
350 350
351 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 351 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
352 return 0; 352 return 0;
353 353
354 spin_lock_irq(&mapping->tree_lock); 354 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
356 goto failed; 356 goto failed;
357 357
358 clear_page_mlock(page); 358 clear_page_mlock(page);
359 BUG_ON(PagePrivate(page)); 359 BUG_ON(page_has_private(page));
360 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
361 spin_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
362 page_cache_release(page); /* pagecache ref */ 362 page_cache_release(page); /* pagecache ref */
diff --git a/mm/util.c b/mm/util.c
index 37eaccdf3054..7c122e49f769 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -70,6 +70,36 @@ void *kmemdup(const void *src, size_t len, gfp_t gfp)
70EXPORT_SYMBOL(kmemdup); 70EXPORT_SYMBOL(kmemdup);
71 71
72/** 72/**
73 * memdup_user - duplicate memory region from user space
74 *
75 * @src: source address in user space
76 * @len: number of bytes to copy
77 *
78 * Returns an ERR_PTR() on failure.
79 */
80void *memdup_user(const void __user *src, size_t len)
81{
82 void *p;
83
84 /*
85 * Always use GFP_KERNEL, since copy_from_user() can sleep and
86 * cause pagefault, which makes it pointless to use GFP_NOFS
87 * or GFP_ATOMIC.
88 */
89 p = kmalloc_track_caller(len, GFP_KERNEL);
90 if (!p)
91 return ERR_PTR(-ENOMEM);
92
93 if (copy_from_user(p, src, len)) {
94 kfree(p);
95 return ERR_PTR(-EFAULT);
96 }
97
98 return p;
99}
100EXPORT_SYMBOL(memdup_user);
101
102/**
73 * __krealloc - like krealloc() but don't free @p. 103 * __krealloc - like krealloc() but don't free @p.
74 * @p: object to reallocate memory for. 104 * @p: object to reallocate memory for.
75 * @new_size: how many bytes of memory are required. 105 * @new_size: how many bytes of memory are required.
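
memdup_user(), added above, wraps the common kmalloc() + copy_from_user() + error-cleanup sequence and hands back either the kernel copy or an ERR_PTR() (-ENOMEM or -EFAULT). A hypothetical caller, e.g. an ioctl handler copying a small user buffer, would use it like this:

	void *buf = memdup_user(user_ptr, len);	/* user_ptr, len from the caller */
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	/* ... operate on the kernel copy ... */
	kfree(buf);
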
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index af58324c361a..fab19876b4d1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -671,10 +671,7 @@ struct vmap_block {
671 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); 671 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
672 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); 672 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
673 union { 673 union {
674 struct { 674 struct list_head free_list;
675 struct list_head free_list;
676 struct list_head dirty_list;
677 };
678 struct rcu_head rcu_head; 675 struct rcu_head rcu_head;
679 }; 676 };
680}; 677};
@@ -741,7 +738,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
741 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); 738 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
742 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); 739 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
743 INIT_LIST_HEAD(&vb->free_list); 740 INIT_LIST_HEAD(&vb->free_list);
744 INIT_LIST_HEAD(&vb->dirty_list);
745 741
746 vb_idx = addr_to_vb_idx(va->va_start); 742 vb_idx = addr_to_vb_idx(va->va_start);
747 spin_lock(&vmap_block_tree_lock); 743 spin_lock(&vmap_block_tree_lock);
@@ -772,12 +768,7 @@ static void free_vmap_block(struct vmap_block *vb)
772 struct vmap_block *tmp; 768 struct vmap_block *tmp;
773 unsigned long vb_idx; 769 unsigned long vb_idx;
774 770
775 spin_lock(&vb->vbq->lock); 771 BUG_ON(!list_empty(&vb->free_list));
776 if (!list_empty(&vb->free_list))
777 list_del(&vb->free_list);
778 if (!list_empty(&vb->dirty_list))
779 list_del(&vb->dirty_list);
780 spin_unlock(&vb->vbq->lock);
781 772
782 vb_idx = addr_to_vb_idx(vb->va->va_start); 773 vb_idx = addr_to_vb_idx(vb->va->va_start);
783 spin_lock(&vmap_block_tree_lock); 774 spin_lock(&vmap_block_tree_lock);
@@ -862,11 +853,7 @@ static void vb_free(const void *addr, unsigned long size)
862 853
863 spin_lock(&vb->lock); 854 spin_lock(&vb->lock);
864 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); 855 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
865 if (!vb->dirty) { 856
866 spin_lock(&vb->vbq->lock);
867 list_add(&vb->dirty_list, &vb->vbq->dirty);
868 spin_unlock(&vb->vbq->lock);
869 }
870 vb->dirty += 1UL << order; 857 vb->dirty += 1UL << order;
871 if (vb->dirty == VMAP_BBMAP_BITS) { 858 if (vb->dirty == VMAP_BBMAP_BITS) {
872 BUG_ON(vb->free || !list_empty(&vb->free_list)); 859 BUG_ON(vb->free || !list_empty(&vb->free_list));
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f74a61e522f4..39fdfb14eeaa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -60,8 +60,8 @@ struct scan_control {
60 60
61 int may_writepage; 61 int may_writepage;
62 62
63 /* Can pages be swapped as part of reclaim? */ 63 /* Can mapped pages be reclaimed? */
64 int may_swap; 64 int may_unmap;
65 65
66 /* This context's SWAP_CLUSTER_MAX. If freeing memory for 66 /* This context's SWAP_CLUSTER_MAX. If freeing memory for
67 * suspend, we effectively ignore SWAP_CLUSTER_MAX. 67 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -78,6 +78,12 @@ struct scan_control {
78 /* Which cgroup do we reclaim from */ 78 /* Which cgroup do we reclaim from */
79 struct mem_cgroup *mem_cgroup; 79 struct mem_cgroup *mem_cgroup;
80 80
81 /*
82 * Nodemask of nodes allowed by the caller. If NULL, all nodes
83 * are scanned.
84 */
85 nodemask_t *nodemask;
86
81 /* Pluggable isolate pages callback */ 87 /* Pluggable isolate pages callback */
82 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, 88 unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
83 unsigned long *scanned, int order, int mode, 89 unsigned long *scanned, int order, int mode,
@@ -214,8 +220,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
214 do_div(delta, lru_pages + 1); 220 do_div(delta, lru_pages + 1);
215 shrinker->nr += delta; 221 shrinker->nr += delta;
216 if (shrinker->nr < 0) { 222 if (shrinker->nr < 0) {
217 printk(KERN_ERR "%s: nr=%ld\n", 223 printk(KERN_ERR "shrink_slab: %pF negative objects to "
218 __func__, shrinker->nr); 224 "delete nr=%ld\n",
225 shrinker->shrink, shrinker->nr);
219 shrinker->nr = max_pass; 226 shrinker->nr = max_pass;
220 } 227 }
221 228
@@ -276,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page)
276 283
277static inline int is_page_cache_freeable(struct page *page) 284static inline int is_page_cache_freeable(struct page *page)
278{ 285{
279 return page_count(page) - !!PagePrivate(page) == 2; 286 return page_count(page) - !!page_has_private(page) == 2;
280} 287}
281 288
282static int may_write_to_queue(struct backing_dev_info *bdi) 289static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -360,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
360 * Some data journaling orphaned pages can have 367 * Some data journaling orphaned pages can have
361 * page->mapping == NULL while being dirty with clean buffers. 368 * page->mapping == NULL while being dirty with clean buffers.
362 */ 369 */
363 if (PagePrivate(page)) { 370 if (page_has_private(page)) {
364 if (try_to_free_buffers(page)) { 371 if (try_to_free_buffers(page)) {
365 ClearPageDirty(page); 372 ClearPageDirty(page);
366 printk("%s: orphaned page\n", __func__); 373 printk("%s: orphaned page\n", __func__);
@@ -606,7 +613,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
606 if (unlikely(!page_evictable(page, NULL))) 613 if (unlikely(!page_evictable(page, NULL)))
607 goto cull_mlocked; 614 goto cull_mlocked;
608 615
609 if (!sc->may_swap && page_mapped(page)) 616 if (!sc->may_unmap && page_mapped(page))
610 goto keep_locked; 617 goto keep_locked;
611 618
612 /* Double the slab pressure for mapped and swapcache pages */ 619 /* Double the slab pressure for mapped and swapcache pages */
@@ -720,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
720 * process address space (page_count == 1) it can be freed. 727 * process address space (page_count == 1) it can be freed.
721 * Otherwise, leave the page on the LRU so it is swappable. 728 * Otherwise, leave the page on the LRU so it is swappable.
722 */ 729 */
723 if (PagePrivate(page)) { 730 if (page_has_private(page)) {
724 if (!try_to_release_page(page, sc->gfp_mask)) 731 if (!try_to_release_page(page, sc->gfp_mask))
725 goto activate_locked; 732 goto activate_locked;
726 if (!mapping && page_count(page) == 1) { 733 if (!mapping && page_count(page) == 1) {
@@ -1298,17 +1305,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
1298 } 1305 }
1299 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); 1306 __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved);
1300 pgdeactivate += pgmoved; 1307 pgdeactivate += pgmoved;
1301 if (buffer_heads_over_limit) {
1302 spin_unlock_irq(&zone->lru_lock);
1303 pagevec_strip(&pvec);
1304 spin_lock_irq(&zone->lru_lock);
1305 }
1306 __count_zone_vm_events(PGREFILL, zone, pgscanned); 1308 __count_zone_vm_events(PGREFILL, zone, pgscanned);
1307 __count_vm_events(PGDEACTIVATE, pgdeactivate); 1309 __count_vm_events(PGDEACTIVATE, pgdeactivate);
1308 spin_unlock_irq(&zone->lru_lock); 1310 spin_unlock_irq(&zone->lru_lock);
1309 if (vm_swap_full()) 1311 if (buffer_heads_over_limit)
1310 pagevec_swap_free(&pvec); 1312 pagevec_strip(&pvec);
1311
1312 pagevec_release(&pvec); 1313 pagevec_release(&pvec);
1313} 1314}
1314 1315
@@ -1543,7 +1544,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
1543 struct zone *zone; 1544 struct zone *zone;
1544 1545
1545 sc->all_unreclaimable = 1; 1546 sc->all_unreclaimable = 1;
1546 for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { 1547 for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
1548 sc->nodemask) {
1547 if (!populated_zone(zone)) 1549 if (!populated_zone(zone))
1548 continue; 1550 continue;
1549 /* 1551 /*
@@ -1688,17 +1690,18 @@ out:
1688} 1690}
1689 1691
1690unsigned long try_to_free_pages(struct zonelist *zonelist, int order, 1692unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
1691 gfp_t gfp_mask) 1693 gfp_t gfp_mask, nodemask_t *nodemask)
1692{ 1694{
1693 struct scan_control sc = { 1695 struct scan_control sc = {
1694 .gfp_mask = gfp_mask, 1696 .gfp_mask = gfp_mask,
1695 .may_writepage = !laptop_mode, 1697 .may_writepage = !laptop_mode,
1696 .swap_cluster_max = SWAP_CLUSTER_MAX, 1698 .swap_cluster_max = SWAP_CLUSTER_MAX,
1697 .may_swap = 1, 1699 .may_unmap = 1,
1698 .swappiness = vm_swappiness, 1700 .swappiness = vm_swappiness,
1699 .order = order, 1701 .order = order,
1700 .mem_cgroup = NULL, 1702 .mem_cgroup = NULL,
1701 .isolate_pages = isolate_pages_global, 1703 .isolate_pages = isolate_pages_global,
1704 .nodemask = nodemask,
1702 }; 1705 };
1703 1706
1704 return do_try_to_free_pages(zonelist, &sc); 1707 return do_try_to_free_pages(zonelist, &sc);
@@ -1713,17 +1716,18 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
1713{ 1716{
1714 struct scan_control sc = { 1717 struct scan_control sc = {
1715 .may_writepage = !laptop_mode, 1718 .may_writepage = !laptop_mode,
1716 .may_swap = 1, 1719 .may_unmap = 1,
1717 .swap_cluster_max = SWAP_CLUSTER_MAX, 1720 .swap_cluster_max = SWAP_CLUSTER_MAX,
1718 .swappiness = swappiness, 1721 .swappiness = swappiness,
1719 .order = 0, 1722 .order = 0,
1720 .mem_cgroup = mem_cont, 1723 .mem_cgroup = mem_cont,
1721 .isolate_pages = mem_cgroup_isolate_pages, 1724 .isolate_pages = mem_cgroup_isolate_pages,
 1725 .nodemask = NULL, /* we don't care about placement */
1722 }; 1726 };
1723 struct zonelist *zonelist; 1727 struct zonelist *zonelist;
1724 1728
1725 if (noswap) 1729 if (noswap)
1726 sc.may_swap = 0; 1730 sc.may_unmap = 0;
1727 1731
1728 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | 1732 sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
1729 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); 1733 (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -1762,7 +1766,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
1762 struct reclaim_state *reclaim_state = current->reclaim_state; 1766 struct reclaim_state *reclaim_state = current->reclaim_state;
1763 struct scan_control sc = { 1767 struct scan_control sc = {
1764 .gfp_mask = GFP_KERNEL, 1768 .gfp_mask = GFP_KERNEL,
1765 .may_swap = 1, 1769 .may_unmap = 1,
1766 .swap_cluster_max = SWAP_CLUSTER_MAX, 1770 .swap_cluster_max = SWAP_CLUSTER_MAX,
1767 .swappiness = vm_swappiness, 1771 .swappiness = vm_swappiness,
1768 .order = order, 1772 .order = order,
@@ -2050,22 +2054,19 @@ unsigned long global_lru_pages(void)
2050#ifdef CONFIG_PM 2054#ifdef CONFIG_PM
2051/* 2055/*
2052 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages 2056 * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
2053 * from LRU lists system-wide, for given pass and priority, and returns the 2057 * from LRU lists system-wide, for given pass and priority.
2054 * number of reclaimed pages
2055 * 2058 *
2056 * For pass > 3 we also try to shrink the LRU lists that contain a few pages 2059 * For pass > 3 we also try to shrink the LRU lists that contain a few pages
2057 */ 2060 */
2058static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, 2061static void shrink_all_zones(unsigned long nr_pages, int prio,
2059 int pass, struct scan_control *sc) 2062 int pass, struct scan_control *sc)
2060{ 2063{
2061 struct zone *zone; 2064 struct zone *zone;
2062 unsigned long ret = 0; 2065 unsigned long nr_reclaimed = 0;
2063 2066
2064 for_each_zone(zone) { 2067 for_each_populated_zone(zone) {
2065 enum lru_list l; 2068 enum lru_list l;
2066 2069
2067 if (!populated_zone(zone))
2068 continue;
2069 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) 2070 if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
2070 continue; 2071 continue;
2071 2072
@@ -2084,14 +2085,16 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2084 2085
2085 zone->lru[l].nr_scan = 0; 2086 zone->lru[l].nr_scan = 0;
2086 nr_to_scan = min(nr_pages, lru_pages); 2087 nr_to_scan = min(nr_pages, lru_pages);
2087 ret += shrink_list(l, nr_to_scan, zone, 2088 nr_reclaimed += shrink_list(l, nr_to_scan, zone,
2088 sc, prio); 2089 sc, prio);
2089 if (ret >= nr_pages) 2090 if (nr_reclaimed >= nr_pages) {
2090 return ret; 2091 sc->nr_reclaimed = nr_reclaimed;
2092 return;
2093 }
2091 } 2094 }
2092 } 2095 }
2093 } 2096 }
2094 return ret; 2097 sc->nr_reclaimed = nr_reclaimed;
2095} 2098}
2096 2099
2097/* 2100/*
@@ -2105,13 +2108,11 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
2105unsigned long shrink_all_memory(unsigned long nr_pages) 2108unsigned long shrink_all_memory(unsigned long nr_pages)
2106{ 2109{
2107 unsigned long lru_pages, nr_slab; 2110 unsigned long lru_pages, nr_slab;
2108 unsigned long ret = 0;
2109 int pass; 2111 int pass;
2110 struct reclaim_state reclaim_state; 2112 struct reclaim_state reclaim_state;
2111 struct scan_control sc = { 2113 struct scan_control sc = {
2112 .gfp_mask = GFP_KERNEL, 2114 .gfp_mask = GFP_KERNEL,
2113 .may_swap = 0, 2115 .may_unmap = 0,
2114 .swap_cluster_max = nr_pages,
2115 .may_writepage = 1, 2116 .may_writepage = 1,
2116 .isolate_pages = isolate_pages_global, 2117 .isolate_pages = isolate_pages_global,
2117 }; 2118 };
@@ -2127,8 +2128,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2127 if (!reclaim_state.reclaimed_slab) 2128 if (!reclaim_state.reclaimed_slab)
2128 break; 2129 break;
2129 2130
2130 ret += reclaim_state.reclaimed_slab; 2131 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2131 if (ret >= nr_pages) 2132 if (sc.nr_reclaimed >= nr_pages)
2132 goto out; 2133 goto out;
2133 2134
2134 nr_slab -= reclaim_state.reclaimed_slab; 2135 nr_slab -= reclaim_state.reclaimed_slab;
@@ -2147,21 +2148,22 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2147 2148
2148 /* Force reclaiming mapped pages in the passes #3 and #4 */ 2149 /* Force reclaiming mapped pages in the passes #3 and #4 */
2149 if (pass > 2) 2150 if (pass > 2)
2150 sc.may_swap = 1; 2151 sc.may_unmap = 1;
2151 2152
2152 for (prio = DEF_PRIORITY; prio >= 0; prio--) { 2153 for (prio = DEF_PRIORITY; prio >= 0; prio--) {
2153 unsigned long nr_to_scan = nr_pages - ret; 2154 unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed;
2154 2155
2155 sc.nr_scanned = 0; 2156 sc.nr_scanned = 0;
2156 ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); 2157 sc.swap_cluster_max = nr_to_scan;
2157 if (ret >= nr_pages) 2158 shrink_all_zones(nr_to_scan, prio, pass, &sc);
2159 if (sc.nr_reclaimed >= nr_pages)
2158 goto out; 2160 goto out;
2159 2161
2160 reclaim_state.reclaimed_slab = 0; 2162 reclaim_state.reclaimed_slab = 0;
2161 shrink_slab(sc.nr_scanned, sc.gfp_mask, 2163 shrink_slab(sc.nr_scanned, sc.gfp_mask,
2162 global_lru_pages()); 2164 global_lru_pages());
2163 ret += reclaim_state.reclaimed_slab; 2165 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2164 if (ret >= nr_pages) 2166 if (sc.nr_reclaimed >= nr_pages)
2165 goto out; 2167 goto out;
2166 2168
2167 if (sc.nr_scanned && prio < DEF_PRIORITY - 2) 2169 if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
@@ -2170,21 +2172,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
2170 } 2172 }
2171 2173
2172 /* 2174 /*
2173 * If ret = 0, we could not shrink LRUs, but there may be something 2175 * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be
2174 * in slab caches 2176 * something in slab caches
2175 */ 2177 */
2176 if (!ret) { 2178 if (!sc.nr_reclaimed) {
2177 do { 2179 do {
2178 reclaim_state.reclaimed_slab = 0; 2180 reclaim_state.reclaimed_slab = 0;
2179 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); 2181 shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages());
2180 ret += reclaim_state.reclaimed_slab; 2182 sc.nr_reclaimed += reclaim_state.reclaimed_slab;
2181 } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); 2183 } while (sc.nr_reclaimed < nr_pages &&
2184 reclaim_state.reclaimed_slab > 0);
2182 } 2185 }
2183 2186
2187
2184out: 2188out:
2185 current->reclaim_state = NULL; 2189 current->reclaim_state = NULL;
2186 2190
2187 return ret; 2191 return sc.nr_reclaimed;
2188} 2192}
2189#endif 2193#endif
2190 2194
@@ -2292,11 +2296,12 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
2292 int priority; 2296 int priority;
2293 struct scan_control sc = { 2297 struct scan_control sc = {
2294 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), 2298 .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
2295 .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), 2299 .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
2296 .swap_cluster_max = max_t(unsigned long, nr_pages, 2300 .swap_cluster_max = max_t(unsigned long, nr_pages,
2297 SWAP_CLUSTER_MAX), 2301 SWAP_CLUSTER_MAX),
2298 .gfp_mask = gfp_mask, 2302 .gfp_mask = gfp_mask,
2299 .swappiness = vm_swappiness, 2303 .swappiness = vm_swappiness,
2304 .order = order,
2300 .isolate_pages = isolate_pages_global, 2305 .isolate_pages = isolate_pages_global,
2301 }; 2306 };
2302 unsigned long slab_reclaimable; 2307 unsigned long slab_reclaimable;
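
The vmscan.c hunks rename scan_control.may_swap to may_unmap (it really governs whether mapped pages may be reclaimed at all) and thread the caller's nodemask down to shrink_zones(), which now iterates with for_each_zone_zonelist_nodemask(); a NULL mask keeps the old scan-all-nodes behaviour. A hypothetical direct-reclaim call restricted to a single node nid (sketch only, not from this patch) would be:

	nodemask_t mask;

	nodes_clear(mask);
	node_set(nid, mask);
	freed = try_to_free_pages(zonelist, order, GFP_KERNEL, &mask);
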
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8cd81ea1ddc1..66f6130976cb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
135 int cpu; 135 int cpu;
136 int threshold; 136 int threshold;
137 137
138 for_each_zone(zone) { 138 for_each_populated_zone(zone) {
139
140 if (!zone->present_pages)
141 continue;
142
143 threshold = calculate_threshold(zone); 139 threshold = calculate_threshold(zone);
144 140
145 for_each_online_cpu(cpu) 141 for_each_online_cpu(cpu)
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
301 int i; 297 int i;
302 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, }; 298 int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
303 299
304 for_each_zone(zone) { 300 for_each_populated_zone(zone) {
305 struct per_cpu_pageset *p; 301 struct per_cpu_pageset *p;
306 302
307 if (!populated_zone(zone))
308 continue;
309
310 p = zone_pcp(zone, cpu); 303 p = zone_pcp(zone, cpu);
311 304
312 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) 305 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
@@ -898,7 +891,7 @@ static void vmstat_update(struct work_struct *w)
898{ 891{
899 refresh_cpu_vm_stats(smp_processor_id()); 892 refresh_cpu_vm_stats(smp_processor_id());
900 schedule_delayed_work(&__get_cpu_var(vmstat_work), 893 schedule_delayed_work(&__get_cpu_var(vmstat_work),
901 sysctl_stat_interval); 894 round_jiffies_relative(sysctl_stat_interval));
902} 895}
903 896
904static void __cpuinit start_cpu_timer(int cpu) 897static void __cpuinit start_cpu_timer(int cpu)
@@ -906,7 +899,8 @@ static void __cpuinit start_cpu_timer(int cpu)
906 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); 899 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
907 900
908 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); 901 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
909 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); 902 schedule_delayed_work_on(cpu, vmstat_work,
903 __round_jiffies_relative(HZ, cpu));
910} 904}
911 905
912/* 906/*
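
Finally, the vmstat.c hunks round the periodic per-CPU statistics work onto whole-second timer boundaries, with __round_jiffies_relative(HZ, cpu) staggering CPUs so their wakeups do not all land on the same jiffy. The same idiom suits any low-precision periodic work, e.g. (hypothetical):

	/* housekeeping that needs a roughly 10 s period, not millisecond accuracy */
	schedule_delayed_work(&my_work, round_jiffies_relative(10 * HZ));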