aboutsummaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig.debug9
-rw-r--r--mm/allocpercpu.c2
-rw-r--r--mm/backing-dev.c10
-rw-r--r--mm/failslab.c1
-rw-r--r--mm/filemap.c23
-rw-r--r--mm/filemap_xip.c4
-rw-r--r--mm/memcontrol.c687
-rw-r--r--mm/migrate.c10
-rw-r--r--mm/mmap.c3
-rw-r--r--mm/nommu.c52
-rw-r--r--mm/oom_kill.c1
-rw-r--r--mm/page_alloc.c8
-rw-r--r--mm/page_cgroup.c37
-rw-r--r--mm/pdflush.c47
-rw-r--r--mm/quicklist.c2
-rw-r--r--mm/readahead.c40
-rw-r--r--mm/slab.c74
-rw-r--r--mm/slob.c31
-rw-r--r--mm/slub.c77
-rw-r--r--mm/swap.c4
-rw-r--r--mm/truncate.c10
-rw-r--r--mm/util.c16
-rw-r--r--mm/vmscan.c12
-rw-r--r--mm/vmstat.c5
24 files changed, 797 insertions, 368 deletions
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index c8d62d49a44e..bb01e298f260 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,3 +1,12 @@
1config DEBUG_PAGEALLOC
2 bool "Debug page memory allocations"
3 depends on DEBUG_KERNEL && ARCH_SUPPORTS_DEBUG_PAGEALLOC
4 depends on !HIBERNATION || !PPC && !SPARC
5 ---help---
6 Unmap pages from the kernel linear mapping after free_pages().
7 This results in a large slowdown, but helps to find certain types
8 of memory corruptions.
9
1config WANT_PAGE_DEBUG_FLAGS 10config WANT_PAGE_DEBUG_FLAGS
2 bool 11 bool
3 12
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 139d5b7b6621..dfdee6a47359 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -31,7 +31,7 @@ static void percpu_depopulate(void *__pdata, int cpu)
31 * @__pdata: per-cpu data to depopulate 31 * @__pdata: per-cpu data to depopulate
32 * @mask: depopulate per-cpu data for cpu's selected through mask bits 32 * @mask: depopulate per-cpu data for cpu's selected through mask bits
33 */ 33 */
34static void __percpu_depopulate_mask(void *__pdata, cpumask_t *mask) 34static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask)
35{ 35{
36 int cpu; 36 int cpu;
37 for_each_cpu_mask_nr(cpu, *mask) 37 for_each_cpu_mask_nr(cpu, *mask)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index be68c956a660..493b468a5035 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -284,12 +284,12 @@ static wait_queue_head_t congestion_wqh[2] = {
284 }; 284 };
285 285
286 286
287void clear_bdi_congested(struct backing_dev_info *bdi, int rw) 287void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
288{ 288{
289 enum bdi_state bit; 289 enum bdi_state bit;
290 wait_queue_head_t *wqh = &congestion_wqh[rw]; 290 wait_queue_head_t *wqh = &congestion_wqh[sync];
291 291
292 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 292 bit = sync ? BDI_sync_congested : BDI_async_congested;
293 clear_bit(bit, &bdi->state); 293 clear_bit(bit, &bdi->state);
294 smp_mb__after_clear_bit(); 294 smp_mb__after_clear_bit();
295 if (waitqueue_active(wqh)) 295 if (waitqueue_active(wqh))
@@ -297,11 +297,11 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int rw)
297} 297}
298EXPORT_SYMBOL(clear_bdi_congested); 298EXPORT_SYMBOL(clear_bdi_congested);
299 299
300void set_bdi_congested(struct backing_dev_info *bdi, int rw) 300void set_bdi_congested(struct backing_dev_info *bdi, int sync)
301{ 301{
302 enum bdi_state bit; 302 enum bdi_state bit;
303 303
304 bit = (rw == WRITE) ? BDI_write_congested : BDI_read_congested; 304 bit = sync ? BDI_sync_congested : BDI_async_congested;
305 set_bit(bit, &bdi->state); 305 set_bit(bit, &bdi->state);
306} 306}
307EXPORT_SYMBOL(set_bdi_congested); 307EXPORT_SYMBOL(set_bdi_congested);
diff --git a/mm/failslab.c b/mm/failslab.c
index 7c6ea6493f80..9339de5f0a91 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -1,4 +1,5 @@
1#include <linux/fault-inject.h> 1#include <linux/fault-inject.h>
2#include <linux/gfp.h>
2 3
3static struct { 4static struct {
4 struct fault_attr attr; 5 struct fault_attr attr;
diff --git a/mm/filemap.c b/mm/filemap.c
index 126d3973b3d1..2e2d38ebda4b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -513,6 +513,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
513 } 513 }
514 return ret; 514 return ret;
515} 515}
516EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
516 517
517#ifdef CONFIG_NUMA 518#ifdef CONFIG_NUMA
518struct page *__page_cache_alloc(gfp_t gfp) 519struct page *__page_cache_alloc(gfp_t gfp)
@@ -565,6 +566,24 @@ void wait_on_page_bit(struct page *page, int bit_nr)
565EXPORT_SYMBOL(wait_on_page_bit); 566EXPORT_SYMBOL(wait_on_page_bit);
566 567
567/** 568/**
569 * add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
570 * @page: Page defining the wait queue of interest
571 * @waiter: Waiter to add to the queue
572 *
573 * Add an arbitrary @waiter to the wait queue for the nominated @page.
574 */
575void add_page_wait_queue(struct page *page, wait_queue_t *waiter)
576{
577 wait_queue_head_t *q = page_waitqueue(page);
578 unsigned long flags;
579
580 spin_lock_irqsave(&q->lock, flags);
581 __add_wait_queue(q, waiter);
582 spin_unlock_irqrestore(&q->lock, flags);
583}
584EXPORT_SYMBOL_GPL(add_page_wait_queue);
585
586/**
568 * unlock_page - unlock a locked page 587 * unlock_page - unlock a locked page
569 * @page: the page 588 * @page: the page
570 * 589 *
@@ -627,6 +646,7 @@ int __lock_page_killable(struct page *page)
627 return __wait_on_bit_lock(page_waitqueue(page), &wait, 646 return __wait_on_bit_lock(page_waitqueue(page), &wait,
628 sync_page_killable, TASK_KILLABLE); 647 sync_page_killable, TASK_KILLABLE);
629} 648}
649EXPORT_SYMBOL_GPL(__lock_page_killable);
630 650
631/** 651/**
632 * __lock_page_nosync - get a lock on the page, without calling sync_page() 652 * __lock_page_nosync - get a lock on the page, without calling sync_page()
@@ -2463,6 +2483,9 @@ EXPORT_SYMBOL(generic_file_aio_write);
2463 * (presumably at page->private). If the release was successful, return `1'. 2483 * (presumably at page->private). If the release was successful, return `1'.
2464 * Otherwise return zero. 2484 * Otherwise return zero.
2465 * 2485 *
2486 * This may also be called if PG_fscache is set on a page, indicating that the
2487 * page is known to the local caching routines.
2488 *
2466 * The @gfp_mask argument specifies whether I/O may be performed to release 2489 * The @gfp_mask argument specifies whether I/O may be performed to release
2467 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS). 2490 * this page (__GFP_IO), and whether the call may block (__GFP_WAIT & __GFP_FS).
2468 * 2491 *
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0c04615651b7..427dfe3ce78c 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -89,8 +89,8 @@ do_xip_mapping_read(struct address_space *mapping,
89 } 89 }
90 } 90 }
91 nr = nr - offset; 91 nr = nr - offset;
92 if (nr > len) 92 if (nr > len - copied)
93 nr = len; 93 nr = len - copied;
94 94
95 error = mapping->a_ops->get_xip_mem(mapping, index, 0, 95 error = mapping->a_ops->get_xip_mem(mapping, index, 0,
96 &xip_mem, &xip_pfn); 96 &xip_mem, &xip_pfn);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..2fc6d6c48238 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -27,6 +27,7 @@
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/bit_spinlock.h> 28#include <linux/bit_spinlock.h>
29#include <linux/rcupdate.h> 29#include <linux/rcupdate.h>
30#include <linux/limits.h>
30#include <linux/mutex.h> 31#include <linux/mutex.h>
31#include <linux/slab.h> 32#include <linux/slab.h>
32#include <linux/swap.h> 33#include <linux/swap.h>
@@ -95,6 +96,15 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
95 return ret; 96 return ret;
96} 97}
97 98
99static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
100{
101 s64 ret;
102
103 ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
104 ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
105 return ret;
106}
107
98/* 108/*
99 * per-zone information in memory controller. 109 * per-zone information in memory controller.
100 */ 110 */
@@ -154,9 +164,9 @@ struct mem_cgroup {
154 164
155 /* 165 /*
156 * While reclaiming in a hierarchy, we cache the last child we 166 * While reclaiming in a hierarchy, we cache the last child we
157 * reclaimed from. Protected by hierarchy_mutex 167 * reclaimed from.
158 */ 168 */
159 struct mem_cgroup *last_scanned_child; 169 int last_scanned_child;
160 /* 170 /*
161 * Should the accounting and control be hierarchical, per subtree? 171 * Should the accounting and control be hierarchical, per subtree?
162 */ 172 */
@@ -247,7 +257,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc)
247 return mem_cgroup_zoneinfo(mem, nid, zid); 257 return mem_cgroup_zoneinfo(mem, nid, zid);
248} 258}
249 259
250static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, 260static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
251 enum lru_list idx) 261 enum lru_list idx)
252{ 262{
253 int nid, zid; 263 int nid, zid;
@@ -286,6 +296,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
286static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 296static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
287{ 297{
288 struct mem_cgroup *mem = NULL; 298 struct mem_cgroup *mem = NULL;
299
300 if (!mm)
301 return NULL;
289 /* 302 /*
290 * Because we have no locks, mm->owner's may be being moved to other 303 * Because we have no locks, mm->owner's may be being moved to other
291 * cgroup. We use css_tryget() here even if this looks 304 * cgroup. We use css_tryget() here even if this looks
@@ -308,6 +321,42 @@ static bool mem_cgroup_is_obsolete(struct mem_cgroup *mem)
308 return css_is_removed(&mem->css); 321 return css_is_removed(&mem->css);
309} 322}
310 323
324
325/*
326 * Call callback function against all cgroup under hierarchy tree.
327 */
328static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
329 int (*func)(struct mem_cgroup *, void *))
330{
331 int found, ret, nextid;
332 struct cgroup_subsys_state *css;
333 struct mem_cgroup *mem;
334
335 if (!root->use_hierarchy)
336 return (*func)(root, data);
337
338 nextid = 1;
339 do {
340 ret = 0;
341 mem = NULL;
342
343 rcu_read_lock();
344 css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
345 &found);
346 if (css && css_tryget(css))
347 mem = container_of(css, struct mem_cgroup, css);
348 rcu_read_unlock();
349
350 if (mem) {
351 ret = (*func)(mem, data);
352 css_put(&mem->css);
353 }
354 nextid = found + 1;
355 } while (!ret && css);
356
357 return ret;
358}
359
311/* 360/*
312 * Following LRU functions are allowed to be used without PCG_LOCK. 361 * Following LRU functions are allowed to be used without PCG_LOCK.
313 * Operations are called by routine of global LRU independently from memcg. 362 * Operations are called by routine of global LRU independently from memcg.
@@ -441,31 +490,24 @@ void mem_cgroup_move_lists(struct page *page,
441int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) 490int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
442{ 491{
443 int ret; 492 int ret;
493 struct mem_cgroup *curr = NULL;
444 494
445 task_lock(task); 495 task_lock(task);
446 ret = task->mm && mm_match_cgroup(task->mm, mem); 496 rcu_read_lock();
497 curr = try_get_mem_cgroup_from_mm(task->mm);
498 rcu_read_unlock();
447 task_unlock(task); 499 task_unlock(task);
500 if (!curr)
501 return 0;
502 if (curr->use_hierarchy)
503 ret = css_is_ancestor(&curr->css, &mem->css);
504 else
505 ret = (curr == mem);
506 css_put(&curr->css);
448 return ret; 507 return ret;
449} 508}
450 509
451/* 510/*
452 * Calculate mapped_ratio under memory controller. This will be used in
453 * vmscan.c for deteremining we have to reclaim mapped pages.
454 */
455int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
456{
457 long total, rss;
458
459 /*
460 * usage is recorded in bytes. But, here, we assume the number of
461 * physical pages can be represented by "long" on any arch.
462 */
463 total = (long) (mem->res.usage >> PAGE_SHIFT) + 1L;
464 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
465 return (int)((rss * 100L) / total);
466}
467
468/*
469 * prev_priority control...this will be used in memory reclaim path. 511 * prev_priority control...this will be used in memory reclaim path.
470 */ 512 */
471int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) 513int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem)
@@ -501,8 +543,8 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_
501 unsigned long gb; 543 unsigned long gb;
502 unsigned long inactive_ratio; 544 unsigned long inactive_ratio;
503 545
504 inactive = mem_cgroup_get_all_zonestat(memcg, LRU_INACTIVE_ANON); 546 inactive = mem_cgroup_get_local_zonestat(memcg, LRU_INACTIVE_ANON);
505 active = mem_cgroup_get_all_zonestat(memcg, LRU_ACTIVE_ANON); 547 active = mem_cgroup_get_local_zonestat(memcg, LRU_ACTIVE_ANON);
506 548
507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 549 gb = (inactive + active) >> (30 - PAGE_SHIFT);
508 if (gb) 550 if (gb)
@@ -629,172 +671,202 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
629#define mem_cgroup_from_res_counter(counter, member) \ 671#define mem_cgroup_from_res_counter(counter, member) \
630 container_of(counter, struct mem_cgroup, member) 672 container_of(counter, struct mem_cgroup, member)
631 673
632/* 674static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem)
633 * This routine finds the DFS walk successor. This routine should be
634 * called with hierarchy_mutex held
635 */
636static struct mem_cgroup *
637__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem)
638{ 675{
639 struct cgroup *cgroup, *curr_cgroup, *root_cgroup; 676 if (do_swap_account) {
640 677 if (res_counter_check_under_limit(&mem->res) &&
641 curr_cgroup = curr->css.cgroup; 678 res_counter_check_under_limit(&mem->memsw))
642 root_cgroup = root_mem->css.cgroup; 679 return true;
680 } else
681 if (res_counter_check_under_limit(&mem->res))
682 return true;
683 return false;
684}
643 685
644 if (!list_empty(&curr_cgroup->children)) { 686static unsigned int get_swappiness(struct mem_cgroup *memcg)
645 /* 687{
646 * Walk down to children 688 struct cgroup *cgrp = memcg->css.cgroup;
647 */ 689 unsigned int swappiness;
648 cgroup = list_entry(curr_cgroup->children.next,
649 struct cgroup, sibling);
650 curr = mem_cgroup_from_cont(cgroup);
651 goto done;
652 }
653 690
654visit_parent: 691 /* root ? */
655 if (curr_cgroup == root_cgroup) { 692 if (cgrp->parent == NULL)
656 /* caller handles NULL case */ 693 return vm_swappiness;
657 curr = NULL;
658 goto done;
659 }
660 694
661 /* 695 spin_lock(&memcg->reclaim_param_lock);
662 * Goto next sibling 696 swappiness = memcg->swappiness;
663 */ 697 spin_unlock(&memcg->reclaim_param_lock);
664 if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) {
665 cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup,
666 sibling);
667 curr = mem_cgroup_from_cont(cgroup);
668 goto done;
669 }
670 698
671 /* 699 return swappiness;
672 * Go up to next parent and next parent's sibling if need be 700}
673 */
674 curr_cgroup = curr_cgroup->parent;
675 goto visit_parent;
676 701
677done: 702static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
678 return curr; 703{
704 int *val = data;
705 (*val)++;
706 return 0;
679} 707}
680 708
681/* 709/**
682 * Visit the first child (need not be the first child as per the ordering 710 * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
683 * of the cgroup list, since we track last_scanned_child) of @mem and use 711 * @memcg: The memory cgroup that went over limit
684 * that to reclaim free pages from. 712 * @p: Task that is going to be killed
713 *
714 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
715 * enabled
685 */ 716 */
686static struct mem_cgroup * 717void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
687mem_cgroup_get_next_node(struct mem_cgroup *root_mem)
688{ 718{
689 struct cgroup *cgroup; 719 struct cgroup *task_cgrp;
690 struct mem_cgroup *orig, *next; 720 struct cgroup *mem_cgrp;
691 bool obsolete;
692
693 /* 721 /*
694 * Scan all children under the mem_cgroup mem 722 * Need a buffer in BSS, can't rely on allocations. The code relies
723 * on the assumption that OOM is serialized for memory controller.
724 * If this assumption is broken, revisit this code.
695 */ 725 */
696 mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); 726 static char memcg_name[PATH_MAX];
727 int ret;
728
729 if (!memcg)
730 return;
697 731
698 orig = root_mem->last_scanned_child;
699 obsolete = mem_cgroup_is_obsolete(orig);
700 732
701 if (list_empty(&root_mem->css.cgroup->children)) { 733 rcu_read_lock();
734
735 mem_cgrp = memcg->css.cgroup;
736 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id);
737
738 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX);
739 if (ret < 0) {
702 /* 740 /*
703 * root_mem might have children before and last_scanned_child 741 * Unfortunately, we are unable to convert to a useful name
704 * may point to one of them. We put it later. 742 * But we'll still print out the usage information
705 */ 743 */
706 if (orig) 744 rcu_read_unlock();
707 VM_BUG_ON(!obsolete);
708 next = NULL;
709 goto done; 745 goto done;
710 } 746 }
747 rcu_read_unlock();
711 748
712 if (!orig || obsolete) { 749 printk(KERN_INFO "Task in %s killed", memcg_name);
713 cgroup = list_first_entry(&root_mem->css.cgroup->children,
714 struct cgroup, sibling);
715 next = mem_cgroup_from_cont(cgroup);
716 } else
717 next = __mem_cgroup_get_next_node(orig, root_mem);
718 750
751 rcu_read_lock();
752 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX);
753 if (ret < 0) {
754 rcu_read_unlock();
755 goto done;
756 }
757 rcu_read_unlock();
758
759 /*
760 * Continues from above, so we don't need a KERN_ level
761 */
762 printk(KERN_CONT " as a result of limit of %s\n", memcg_name);
719done: 763done:
720 if (next) 764
721 mem_cgroup_get(next); 765 printk(KERN_INFO "memory: usage %llukB, limit %llukB, failcnt %llu\n",
722 root_mem->last_scanned_child = next; 766 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10,
723 if (orig) 767 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10,
724 mem_cgroup_put(orig); 768 res_counter_read_u64(&memcg->res, RES_FAILCNT));
725 mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); 769 printk(KERN_INFO "memory+swap: usage %llukB, limit %llukB, "
726 return (next) ? next : root_mem; 770 "failcnt %llu\n",
771 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10,
772 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10,
773 res_counter_read_u64(&memcg->memsw, RES_FAILCNT));
727} 774}
728 775
729static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) 776/*
777 * This function returns the number of memcg under hierarchy tree. Returns
778 * 1(self count) if no children.
779 */
780static int mem_cgroup_count_children(struct mem_cgroup *mem)
730{ 781{
731 if (do_swap_account) { 782 int num = 0;
732 if (res_counter_check_under_limit(&mem->res) && 783 mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
733 res_counter_check_under_limit(&mem->memsw)) 784 return num;
734 return true;
735 } else
736 if (res_counter_check_under_limit(&mem->res))
737 return true;
738 return false;
739} 785}
740 786
741static unsigned int get_swappiness(struct mem_cgroup *memcg) 787/*
788 * Visit the first child (need not be the first child as per the ordering
789 * of the cgroup list, since we track last_scanned_child) of @mem and use
790 * that to reclaim free pages from.
791 */
792static struct mem_cgroup *
793mem_cgroup_select_victim(struct mem_cgroup *root_mem)
742{ 794{
743 struct cgroup *cgrp = memcg->css.cgroup; 795 struct mem_cgroup *ret = NULL;
744 unsigned int swappiness; 796 struct cgroup_subsys_state *css;
797 int nextid, found;
745 798
746 /* root ? */ 799 if (!root_mem->use_hierarchy) {
747 if (cgrp->parent == NULL) 800 css_get(&root_mem->css);
748 return vm_swappiness; 801 ret = root_mem;
802 }
749 803
750 spin_lock(&memcg->reclaim_param_lock); 804 while (!ret) {
751 swappiness = memcg->swappiness; 805 rcu_read_lock();
752 spin_unlock(&memcg->reclaim_param_lock); 806 nextid = root_mem->last_scanned_child + 1;
807 css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css,
808 &found);
809 if (css && css_tryget(css))
810 ret = container_of(css, struct mem_cgroup, css);
811
812 rcu_read_unlock();
813 /* Updates scanning parameter */
814 spin_lock(&root_mem->reclaim_param_lock);
815 if (!css) {
816 /* this means start scan from ID:1 */
817 root_mem->last_scanned_child = 0;
818 } else
819 root_mem->last_scanned_child = found;
820 spin_unlock(&root_mem->reclaim_param_lock);
821 }
753 822
754 return swappiness; 823 return ret;
755} 824}
756 825
757/* 826/*
758 * Dance down the hierarchy if needed to reclaim memory. We remember the 827 * Scan the hierarchy if needed to reclaim memory. We remember the last child
759 * last child we reclaimed from, so that we don't end up penalizing 828 * we reclaimed from, so that we don't end up penalizing one child extensively
760 * one child extensively based on its position in the children list. 829 * based on its position in the children list.
761 * 830 *
762 * root_mem is the original ancestor that we've been reclaiming from. 831 * root_mem is the original ancestor that we've been reclaiming from.
832 *
833 * We give up and return to the caller when we visit root_mem twice.
834 * (other groups can be removed while we're walking....)
835 *
836 * If shrink==true, this returns immediately to avoid freeing too much.
763 */ 837 */
764static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, 838static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
765 gfp_t gfp_mask, bool noswap) 839 gfp_t gfp_mask, bool noswap, bool shrink)
766{ 840{
767 struct mem_cgroup *next_mem; 841 struct mem_cgroup *victim;
768 int ret = 0; 842 int ret, total = 0;
769 843 int loop = 0;
770 /* 844
771 * Reclaim unconditionally and don't check for return value. 845 while (loop < 2) {
772 * We need to reclaim in the current group and down the tree. 846 victim = mem_cgroup_select_victim(root_mem);
773 * One might think about checking for children before reclaiming, 847 if (victim == root_mem)
774 * but there might be left over accounting, even after children 848 loop++;
775 * have left. 849 if (!mem_cgroup_local_usage(&victim->stat)) {
776 */ 850 /* this cgroup's local usage == 0 */
777 ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, 851 css_put(&victim->css);
778 get_swappiness(root_mem));
779 if (mem_cgroup_check_under_limit(root_mem))
780 return 1; /* indicate reclaim has succeeded */
781 if (!root_mem->use_hierarchy)
782 return ret;
783
784 next_mem = mem_cgroup_get_next_node(root_mem);
785
786 while (next_mem != root_mem) {
787 if (mem_cgroup_is_obsolete(next_mem)) {
788 next_mem = mem_cgroup_get_next_node(root_mem);
789 continue; 852 continue;
790 } 853 }
791 ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, 854 /* we use swappiness of local cgroup */
792 get_swappiness(next_mem)); 855 ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap,
856 get_swappiness(victim));
857 css_put(&victim->css);
858 /*
858 * When shrinking usage, we can't tell here whether we should stop or
859 * reclaim more; that depends on the caller. last_scanned_child
860 * is enough to keep reclaim fair across the tree.
862 */
863 if (shrink)
864 return ret;
865 total += ret;
793 if (mem_cgroup_check_under_limit(root_mem)) 866 if (mem_cgroup_check_under_limit(root_mem))
794 return 1; /* indicate reclaim has succeeded */ 867 return 1 + total;
795 next_mem = mem_cgroup_get_next_node(root_mem);
796 } 868 }
797 return ret; 869 return total;
798} 870}
799 871
800bool mem_cgroup_oom_called(struct task_struct *task) 872bool mem_cgroup_oom_called(struct task_struct *task)
@@ -813,6 +885,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
813 rcu_read_unlock(); 885 rcu_read_unlock();
814 return ret; 886 return ret;
815} 887}
888
889static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
890{
891 mem->last_oom_jiffies = jiffies;
892 return 0;
893}
894
895static void record_last_oom(struct mem_cgroup *mem)
896{
897 mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
898}
899
900
816/* 901/*
817 * Unlike exported interface, "oom" parameter is added. if oom==true, 902 * Unlike exported interface, "oom" parameter is added. if oom==true,
818 * oom-killer can be invoked. 903 * oom-killer can be invoked.
@@ -875,7 +960,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
875 goto nomem; 960 goto nomem;
876 961
877 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, 962 ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask,
878 noswap); 963 noswap, false);
879 if (ret) 964 if (ret)
880 continue; 965 continue;
881 966
@@ -895,7 +980,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
895 mutex_lock(&memcg_tasklist); 980 mutex_lock(&memcg_tasklist);
896 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); 981 mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
897 mutex_unlock(&memcg_tasklist); 982 mutex_unlock(&memcg_tasklist);
898 mem_over_limit->last_oom_jiffies = jiffies; 983 record_last_oom(mem_over_limit);
899 } 984 }
900 goto nomem; 985 goto nomem;
901 } 986 }
@@ -906,20 +991,55 @@ nomem:
906 return -ENOMEM; 991 return -ENOMEM;
907} 992}
908 993
994
995/*
996 * A helper function to get mem_cgroup from ID. Must be called under
997 * rcu_read_lock(). The caller must check css_is_removed() or similar if
998 * that is a concern. (Dropping refcnt from swap can be called against a
999 * removed memcg.)
1000 */
1001static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
1002{
1003 struct cgroup_subsys_state *css;
1004
1005 /* ID 0 is unused ID */
1006 if (!id)
1007 return NULL;
1008 css = css_lookup(&mem_cgroup_subsys, id);
1009 if (!css)
1010 return NULL;
1011 return container_of(css, struct mem_cgroup, css);
1012}
1013
909static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) 1014static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page)
910{ 1015{
911 struct mem_cgroup *mem; 1016 struct mem_cgroup *mem;
1017 struct page_cgroup *pc;
1018 unsigned short id;
912 swp_entry_t ent; 1019 swp_entry_t ent;
913 1020
1021 VM_BUG_ON(!PageLocked(page));
1022
914 if (!PageSwapCache(page)) 1023 if (!PageSwapCache(page))
915 return NULL; 1024 return NULL;
916 1025
917 ent.val = page_private(page); 1026 pc = lookup_page_cgroup(page);
918 mem = lookup_swap_cgroup(ent); 1027 /*
919 if (!mem) 1028 * Used bit of swapcache is solid under page lock.
920 return NULL; 1029 */
921 if (!css_tryget(&mem->css)) 1030 if (PageCgroupUsed(pc)) {
922 return NULL; 1031 mem = pc->mem_cgroup;
1032 if (mem && !css_tryget(&mem->css))
1033 mem = NULL;
1034 } else {
1035 ent.val = page_private(page);
1036 id = lookup_swap_cgroup(ent);
1037 rcu_read_lock();
1038 mem = mem_cgroup_lookup(id);
1039 if (mem && !css_tryget(&mem->css))
1040 mem = NULL;
1041 rcu_read_unlock();
1042 }
923 return mem; 1043 return mem;
924} 1044}
925 1045
@@ -1118,6 +1238,10 @@ int mem_cgroup_newpage_charge(struct page *page,
1118 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); 1238 MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
1119} 1239}
1120 1240
1241static void
1242__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1243 enum charge_type ctype);
1244
1121int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 1245int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1122 gfp_t gfp_mask) 1246 gfp_t gfp_mask)
1123{ 1247{
@@ -1154,16 +1278,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1154 unlock_page_cgroup(pc); 1278 unlock_page_cgroup(pc);
1155 } 1279 }
1156 1280
1157 if (do_swap_account && PageSwapCache(page)) {
1158 mem = try_get_mem_cgroup_from_swapcache(page);
1159 if (mem)
1160 mm = NULL;
1161 else
1162 mem = NULL;
1163 /* SwapCache may be still linked to LRU now. */
1164 mem_cgroup_lru_del_before_commit_swapcache(page);
1165 }
1166
1167 if (unlikely(!mm && !mem)) 1281 if (unlikely(!mm && !mem))
1168 mm = &init_mm; 1282 mm = &init_mm;
1169 1283
@@ -1171,22 +1285,16 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
1171 return mem_cgroup_charge_common(page, mm, gfp_mask, 1285 return mem_cgroup_charge_common(page, mm, gfp_mask,
1172 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); 1286 MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
1173 1287
1174 ret = mem_cgroup_charge_common(page, mm, gfp_mask, 1288 /* shmem */
1175 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); 1289 if (PageSwapCache(page)) {
1176 if (mem) 1290 ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
1177 css_put(&mem->css); 1291 if (!ret)
1178 if (PageSwapCache(page)) 1292 __mem_cgroup_commit_charge_swapin(page, mem,
1179 mem_cgroup_lru_add_after_commit_swapcache(page); 1293 MEM_CGROUP_CHARGE_TYPE_SHMEM);
1294 } else
1295 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
1296 MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
1180 1297
1181 if (do_swap_account && !ret && PageSwapCache(page)) {
1182 swp_entry_t ent = {.val = page_private(page)};
1183 /* avoid double counting */
1184 mem = swap_cgroup_record(ent, NULL);
1185 if (mem) {
1186 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1187 mem_cgroup_put(mem);
1188 }
1189 }
1190 return ret; 1298 return ret;
1191} 1299}
1192 1300
@@ -1229,7 +1337,9 @@ charge_cur_mm:
1229 return __mem_cgroup_try_charge(mm, mask, ptr, true); 1337 return __mem_cgroup_try_charge(mm, mask, ptr, true);
1230} 1338}
1231 1339
1232void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) 1340static void
1341__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
1342 enum charge_type ctype)
1233{ 1343{
1234 struct page_cgroup *pc; 1344 struct page_cgroup *pc;
1235 1345
@@ -1239,7 +1349,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1239 return; 1349 return;
1240 pc = lookup_page_cgroup(page); 1350 pc = lookup_page_cgroup(page);
1241 mem_cgroup_lru_del_before_commit_swapcache(page); 1351 mem_cgroup_lru_del_before_commit_swapcache(page);
1242 __mem_cgroup_commit_charge(ptr, pc, MEM_CGROUP_CHARGE_TYPE_MAPPED); 1352 __mem_cgroup_commit_charge(ptr, pc, ctype);
1243 mem_cgroup_lru_add_after_commit_swapcache(page); 1353 mem_cgroup_lru_add_after_commit_swapcache(page);
1244 /* 1354 /*
1245 * Now swap is on-memory. This means this page may be 1355 * Now swap is on-memory. This means this page may be
@@ -1250,18 +1360,32 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1250 */ 1360 */
1251 if (do_swap_account && PageSwapCache(page)) { 1361 if (do_swap_account && PageSwapCache(page)) {
1252 swp_entry_t ent = {.val = page_private(page)}; 1362 swp_entry_t ent = {.val = page_private(page)};
1363 unsigned short id;
1253 struct mem_cgroup *memcg; 1364 struct mem_cgroup *memcg;
1254 memcg = swap_cgroup_record(ent, NULL); 1365
1366 id = swap_cgroup_record(ent, 0);
1367 rcu_read_lock();
1368 memcg = mem_cgroup_lookup(id);
1255 if (memcg) { 1369 if (memcg) {
1370 /*
1371 * This recorded memcg may be an obsolete one, so avoid
1372 * calling css_tryget.
1373 */
1256 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1374 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1257 mem_cgroup_put(memcg); 1375 mem_cgroup_put(memcg);
1258 } 1376 }
1259 1377 rcu_read_unlock();
1260 } 1378 }
1261 /* add this page(page_cgroup) to the LRU we want. */ 1379 /* add this page(page_cgroup) to the LRU we want. */
1262 1380
1263} 1381}
1264 1382
1383void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
1384{
1385 __mem_cgroup_commit_charge_swapin(page, ptr,
1386 MEM_CGROUP_CHARGE_TYPE_MAPPED);
1387}
1388
1265void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) 1389void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
1266{ 1390{
1267 if (mem_cgroup_disabled()) 1391 if (mem_cgroup_disabled())
@@ -1324,8 +1448,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
1324 res_counter_uncharge(&mem->res, PAGE_SIZE); 1448 res_counter_uncharge(&mem->res, PAGE_SIZE);
1325 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) 1449 if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
1326 res_counter_uncharge(&mem->memsw, PAGE_SIZE); 1450 res_counter_uncharge(&mem->memsw, PAGE_SIZE);
1327
1328 mem_cgroup_charge_statistics(mem, pc, false); 1451 mem_cgroup_charge_statistics(mem, pc, false);
1452
1329 ClearPageCgroupUsed(pc); 1453 ClearPageCgroupUsed(pc);
1330 /* 1454 /*
1331 * pc->mem_cgroup is not cleared here. It will be accessed when it's 1455 * pc->mem_cgroup is not cleared here. It will be accessed when it's
@@ -1377,7 +1501,7 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1377 MEM_CGROUP_CHARGE_TYPE_SWAPOUT); 1501 MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
1378 /* record memcg information */ 1502 /* record memcg information */
1379 if (do_swap_account && memcg) { 1503 if (do_swap_account && memcg) {
1380 swap_cgroup_record(ent, memcg); 1504 swap_cgroup_record(ent, css_id(&memcg->css));
1381 mem_cgroup_get(memcg); 1505 mem_cgroup_get(memcg);
1382 } 1506 }
1383 if (memcg) 1507 if (memcg)
@@ -1392,15 +1516,23 @@ void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent)
1392void mem_cgroup_uncharge_swap(swp_entry_t ent) 1516void mem_cgroup_uncharge_swap(swp_entry_t ent)
1393{ 1517{
1394 struct mem_cgroup *memcg; 1518 struct mem_cgroup *memcg;
1519 unsigned short id;
1395 1520
1396 if (!do_swap_account) 1521 if (!do_swap_account)
1397 return; 1522 return;
1398 1523
1399 memcg = swap_cgroup_record(ent, NULL); 1524 id = swap_cgroup_record(ent, 0);
1525 rcu_read_lock();
1526 memcg = mem_cgroup_lookup(id);
1400 if (memcg) { 1527 if (memcg) {
1528 /*
1529 * We uncharge this because swap is freed.
1530 * This memcg can be an obsolete one. We avoid calling css_tryget
1531 */
1401 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 1532 res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
1402 mem_cgroup_put(memcg); 1533 mem_cgroup_put(memcg);
1403 } 1534 }
1535 rcu_read_unlock();
1404} 1536}
1405#endif 1537#endif
1406 1538
@@ -1508,7 +1640,8 @@ int mem_cgroup_shrink_usage(struct page *page,
1508 return 0; 1640 return 0;
1509 1641
1510 do { 1642 do {
1511 progress = mem_cgroup_hierarchical_reclaim(mem, gfp_mask, true); 1643 progress = mem_cgroup_hierarchical_reclaim(mem,
1644 gfp_mask, true, false);
1512 progress += mem_cgroup_check_under_limit(mem); 1645 progress += mem_cgroup_check_under_limit(mem);
1513 } while (!progress && --retry); 1646 } while (!progress && --retry);
1514 1647
@@ -1523,11 +1656,21 @@ static DEFINE_MUTEX(set_limit_mutex);
1523static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 1656static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1524 unsigned long long val) 1657 unsigned long long val)
1525{ 1658{
1526 1659 int retry_count;
1527 int retry_count = MEM_CGROUP_RECLAIM_RETRIES;
1528 int progress; 1660 int progress;
1529 u64 memswlimit; 1661 u64 memswlimit;
1530 int ret = 0; 1662 int ret = 0;
1663 int children = mem_cgroup_count_children(memcg);
1664 u64 curusage, oldusage;
1665
1666 /*
1667 * For keeping hierarchical_reclaim simple, how long we should retry
1668 * depends on callers. We set our retry-count to be a function
1669 * of # of children which we should visit in this loop.
1670 */
1671 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children;
1672
1673 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1531 1674
1532 while (retry_count) { 1675 while (retry_count) {
1533 if (signal_pending(current)) { 1676 if (signal_pending(current)) {
@@ -1553,8 +1696,13 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1553 break; 1696 break;
1554 1697
1555 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, 1698 progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL,
1556 false); 1699 false, true);
1557 if (!progress) retry_count--; 1700 curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
1701 /* Usage is reduced ? */
1702 if (curusage >= oldusage)
1703 retry_count--;
1704 else
1705 oldusage = curusage;
1558 } 1706 }
1559 1707
1560 return ret; 1708 return ret;
@@ -1563,13 +1711,16 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
1563int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 1711int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1564 unsigned long long val) 1712 unsigned long long val)
1565{ 1713{
1566 int retry_count = MEM_CGROUP_RECLAIM_RETRIES; 1714 int retry_count;
1567 u64 memlimit, oldusage, curusage; 1715 u64 memlimit, oldusage, curusage;
1568 int ret; 1716 int children = mem_cgroup_count_children(memcg);
1717 int ret = -EBUSY;
1569 1718
1570 if (!do_swap_account) 1719 if (!do_swap_account)
1571 return -EINVAL; 1720 return -EINVAL;
1572 1721 /* see mem_cgroup_resize_limit */
1722 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES;
1723 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1573 while (retry_count) { 1724 while (retry_count) {
1574 if (signal_pending(current)) { 1725 if (signal_pending(current)) {
1575 ret = -EINTR; 1726 ret = -EINTR;
@@ -1593,11 +1744,13 @@ int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
1593 if (!ret) 1744 if (!ret)
1594 break; 1745 break;
1595 1746
1596 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1747 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true);
1597 mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true);
1598 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 1748 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE);
1749 /* Usage is reduced ? */
1599 if (curusage >= oldusage) 1750 if (curusage >= oldusage)
1600 retry_count--; 1751 retry_count--;
1752 else
1753 oldusage = curusage;
1601 } 1754 }
1602 return ret; 1755 return ret;
1603} 1756}
@@ -1893,54 +2046,90 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
1893 return 0; 2046 return 0;
1894} 2047}
1895 2048
1896static const struct mem_cgroup_stat_desc { 2049
1897 const char *msg; 2050/* For read statistics */
1898 u64 unit; 2051enum {
1899} mem_cgroup_stat_desc[] = { 2052 MCS_CACHE,
1900 [MEM_CGROUP_STAT_CACHE] = { "cache", PAGE_SIZE, }, 2053 MCS_RSS,
1901 [MEM_CGROUP_STAT_RSS] = { "rss", PAGE_SIZE, }, 2054 MCS_PGPGIN,
1902 [MEM_CGROUP_STAT_PGPGIN_COUNT] = {"pgpgin", 1, }, 2055 MCS_PGPGOUT,
1903 [MEM_CGROUP_STAT_PGPGOUT_COUNT] = {"pgpgout", 1, }, 2056 MCS_INACTIVE_ANON,
2057 MCS_ACTIVE_ANON,
2058 MCS_INACTIVE_FILE,
2059 MCS_ACTIVE_FILE,
2060 MCS_UNEVICTABLE,
2061 NR_MCS_STAT,
2062};
2063
2064struct mcs_total_stat {
2065 s64 stat[NR_MCS_STAT];
2066};
2067
2068struct {
2069 char *local_name;
2070 char *total_name;
2071} memcg_stat_strings[NR_MCS_STAT] = {
2072 {"cache", "total_cache"},
2073 {"rss", "total_rss"},
2074 {"pgpgin", "total_pgpgin"},
2075 {"pgpgout", "total_pgpgout"},
2076 {"inactive_anon", "total_inactive_anon"},
2077 {"active_anon", "total_active_anon"},
2078 {"inactive_file", "total_inactive_file"},
2079 {"active_file", "total_active_file"},
2080 {"unevictable", "total_unevictable"}
1904}; 2081};
1905 2082
2083
2084static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
2085{
2086 struct mcs_total_stat *s = data;
2087 s64 val;
2088
2089 /* per cpu stat */
2090 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
2091 s->stat[MCS_CACHE] += val * PAGE_SIZE;
2092 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
2093 s->stat[MCS_RSS] += val * PAGE_SIZE;
2094 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
2095 s->stat[MCS_PGPGIN] += val;
2096 val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
2097 s->stat[MCS_PGPGOUT] += val;
2098
2099 /* per zone stat */
2100 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON);
2101 s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE;
2102 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_ANON);
2103 s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE;
2104 val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_FILE);
2105 s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE;
2106 val = mem_cgroup_get_local_zonestat(mem, LRU_ACTIVE_FILE);
2107 s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
2108 val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
2109 s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
2110 return 0;
2111}
2112
2113static void
2114mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
2115{
2116 mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
2117}
2118
1906static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, 2119static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1907 struct cgroup_map_cb *cb) 2120 struct cgroup_map_cb *cb)
1908{ 2121{
1909 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont); 2122 struct mem_cgroup *mem_cont = mem_cgroup_from_cont(cont);
1910 struct mem_cgroup_stat *stat = &mem_cont->stat; 2123 struct mcs_total_stat mystat;
1911 int i; 2124 int i;
1912 2125
1913 for (i = 0; i < ARRAY_SIZE(stat->cpustat[0].count); i++) { 2126 memset(&mystat, 0, sizeof(mystat));
1914 s64 val; 2127 mem_cgroup_get_local_stat(mem_cont, &mystat);
1915 2128
1916 val = mem_cgroup_read_stat(stat, i); 2129 for (i = 0; i < NR_MCS_STAT; i++)
1917 val *= mem_cgroup_stat_desc[i].unit; 2130 cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]);
1918 cb->fill(cb, mem_cgroup_stat_desc[i].msg, val);
1919 }
1920 /* showing # of active pages */
1921 {
1922 unsigned long active_anon, inactive_anon;
1923 unsigned long active_file, inactive_file;
1924 unsigned long unevictable;
1925
1926 inactive_anon = mem_cgroup_get_all_zonestat(mem_cont,
1927 LRU_INACTIVE_ANON);
1928 active_anon = mem_cgroup_get_all_zonestat(mem_cont,
1929 LRU_ACTIVE_ANON);
1930 inactive_file = mem_cgroup_get_all_zonestat(mem_cont,
1931 LRU_INACTIVE_FILE);
1932 active_file = mem_cgroup_get_all_zonestat(mem_cont,
1933 LRU_ACTIVE_FILE);
1934 unevictable = mem_cgroup_get_all_zonestat(mem_cont,
1935 LRU_UNEVICTABLE);
1936
1937 cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE);
1938 cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE);
1939 cb->fill(cb, "active_file", (active_file) * PAGE_SIZE);
1940 cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE);
1941 cb->fill(cb, "unevictable", unevictable * PAGE_SIZE);
1942 2131
1943 } 2132 /* Hierarchical information */
1944 { 2133 {
1945 unsigned long long limit, memsw_limit; 2134 unsigned long long limit, memsw_limit;
1946 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit); 2135 memcg_get_hierarchical_limit(mem_cont, &limit, &memsw_limit);
@@ -1949,6 +2138,12 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
1949 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit); 2138 cb->fill(cb, "hierarchical_memsw_limit", memsw_limit);
1950 } 2139 }
1951 2140
2141 memset(&mystat, 0, sizeof(mystat));
2142 mem_cgroup_get_total_stat(mem_cont, &mystat);
2143 for (i = 0; i < NR_MCS_STAT; i++)
2144 cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]);
2145
2146
1952#ifdef CONFIG_DEBUG_VM 2147#ifdef CONFIG_DEBUG_VM
1953 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); 2148 cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL));
1954 2149
@@ -2178,6 +2373,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
2178{ 2373{
2179 int node; 2374 int node;
2180 2375
2376 free_css_id(&mem_cgroup_subsys, &mem->css);
2377
2181 for_each_node_state(node, N_POSSIBLE) 2378 for_each_node_state(node, N_POSSIBLE)
2182 free_mem_cgroup_per_zone_info(mem, node); 2379 free_mem_cgroup_per_zone_info(mem, node);
2183 2380
@@ -2228,11 +2425,12 @@ static struct cgroup_subsys_state * __ref
2228mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 2425mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2229{ 2426{
2230 struct mem_cgroup *mem, *parent; 2427 struct mem_cgroup *mem, *parent;
2428 long error = -ENOMEM;
2231 int node; 2429 int node;
2232 2430
2233 mem = mem_cgroup_alloc(); 2431 mem = mem_cgroup_alloc();
2234 if (!mem) 2432 if (!mem)
2235 return ERR_PTR(-ENOMEM); 2433 return ERR_PTR(error);
2236 2434
2237 for_each_node_state(node, N_POSSIBLE) 2435 for_each_node_state(node, N_POSSIBLE)
2238 if (alloc_mem_cgroup_per_zone_info(mem, node)) 2436 if (alloc_mem_cgroup_per_zone_info(mem, node))
@@ -2260,7 +2458,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2260 res_counter_init(&mem->res, NULL); 2458 res_counter_init(&mem->res, NULL);
2261 res_counter_init(&mem->memsw, NULL); 2459 res_counter_init(&mem->memsw, NULL);
2262 } 2460 }
2263 mem->last_scanned_child = NULL; 2461 mem->last_scanned_child = 0;
2264 spin_lock_init(&mem->reclaim_param_lock); 2462 spin_lock_init(&mem->reclaim_param_lock);
2265 2463
2266 if (parent) 2464 if (parent)
@@ -2269,26 +2467,22 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
2269 return &mem->css; 2467 return &mem->css;
2270free_out: 2468free_out:
2271 __mem_cgroup_free(mem); 2469 __mem_cgroup_free(mem);
2272 return ERR_PTR(-ENOMEM); 2470 return ERR_PTR(error);
2273} 2471}
2274 2472
2275static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, 2473static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
2276 struct cgroup *cont) 2474 struct cgroup *cont)
2277{ 2475{
2278 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2476 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2279 mem_cgroup_force_empty(mem, false); 2477
2478 return mem_cgroup_force_empty(mem, false);
2280} 2479}
2281 2480
2282static void mem_cgroup_destroy(struct cgroup_subsys *ss, 2481static void mem_cgroup_destroy(struct cgroup_subsys *ss,
2283 struct cgroup *cont) 2482 struct cgroup *cont)
2284{ 2483{
2285 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 2484 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
2286 struct mem_cgroup *last_scanned_child = mem->last_scanned_child;
2287 2485
2288 if (last_scanned_child) {
2289 VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child));
2290 mem_cgroup_put(last_scanned_child);
2291 }
2292 mem_cgroup_put(mem); 2486 mem_cgroup_put(mem);
2293} 2487}
2294 2488
@@ -2327,6 +2521,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
2327 .populate = mem_cgroup_populate, 2521 .populate = mem_cgroup_populate,
2328 .attach = mem_cgroup_move_task, 2522 .attach = mem_cgroup_move_task,
2329 .early_init = 0, 2523 .early_init = 0,
2524 .use_id = 1,
2330}; 2525};
2331 2526
2332#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP 2527#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
diff --git a/mm/migrate.c b/mm/migrate.c
index a9eff3f092f6..068655d8f883 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -250,7 +250,7 @@ out:
250 * The number of remaining references must be: 250 * The number of remaining references must be:
251 * 1 for anonymous pages without a mapping 251 * 1 for anonymous pages without a mapping
252 * 2 for pages with a mapping 252 * 2 for pages with a mapping
253 * 3 for pages with a mapping and PagePrivate set. 253 * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
254 */ 254 */
255static int migrate_page_move_mapping(struct address_space *mapping, 255static int migrate_page_move_mapping(struct address_space *mapping,
256 struct page *newpage, struct page *page) 256 struct page *newpage, struct page *page)
@@ -270,7 +270,7 @@ static int migrate_page_move_mapping(struct address_space *mapping,
270 pslot = radix_tree_lookup_slot(&mapping->page_tree, 270 pslot = radix_tree_lookup_slot(&mapping->page_tree,
271 page_index(page)); 271 page_index(page));
272 272
273 expected_count = 2 + !!PagePrivate(page); 273 expected_count = 2 + !!page_has_private(page);
274 if (page_count(page) != expected_count || 274 if (page_count(page) != expected_count ||
275 (struct page *)radix_tree_deref_slot(pslot) != page) { 275 (struct page *)radix_tree_deref_slot(pslot) != page) {
276 spin_unlock_irq(&mapping->tree_lock); 276 spin_unlock_irq(&mapping->tree_lock);
@@ -386,7 +386,7 @@ EXPORT_SYMBOL(fail_migrate_page);
386 386
387/* 387/*
388 * Common logic to directly migrate a single page suitable for 388 * Common logic to directly migrate a single page suitable for
389 * pages that do not use PagePrivate. 389 * pages that do not use PagePrivate/PagePrivate2.
390 * 390 *
391 * Pages are locked upon entry and exit. 391 * Pages are locked upon entry and exit.
392 */ 392 */
@@ -522,7 +522,7 @@ static int fallback_migrate_page(struct address_space *mapping,
522 * Buffers may be managed in a filesystem specific way. 522 * Buffers may be managed in a filesystem specific way.
523 * We must have no buffers or drop them. 523 * We must have no buffers or drop them.
524 */ 524 */
525 if (PagePrivate(page) && 525 if (page_has_private(page) &&
526 !try_to_release_page(page, GFP_KERNEL)) 526 !try_to_release_page(page, GFP_KERNEL))
527 return -EAGAIN; 527 return -EAGAIN;
528 528
@@ -655,7 +655,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
655 * free the metadata, so the page can be freed. 655 * free the metadata, so the page can be freed.
656 */ 656 */
657 if (!page->mapping) { 657 if (!page->mapping) {
658 if (!PageAnon(page) && PagePrivate(page)) { 658 if (!PageAnon(page) && page_has_private(page)) {
659 /* 659 /*
660 * Go direct to try_to_free_buffers() here because 660 * Go direct to try_to_free_buffers() here because
661 * a) that's what try_to_release_page() would do anyway 661 * a) that's what try_to_release_page() would do anyway
diff --git a/mm/mmap.c b/mm/mmap.c
index 1abb9185a686..4a3841186c11 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm)
2481 */ 2481 */
2482void __init mmap_init(void) 2482void __init mmap_init(void)
2483{ 2483{
2484 vm_area_cachep = kmem_cache_create("vm_area_struct",
2485 sizeof(struct vm_area_struct), 0,
2486 SLAB_PANIC, NULL);
2487} 2484}
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fcf47d449b4..72eda4aee2cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */ 69int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
70int heap_stack_gap = 0; 70int heap_stack_gap = 0;
71 71
72atomic_t mmap_pages_allocated; 72atomic_long_t mmap_pages_allocated;
73 73
74EXPORT_SYMBOL(mem_map); 74EXPORT_SYMBOL(mem_map);
75EXPORT_SYMBOL(num_physpages); 75EXPORT_SYMBOL(num_physpages);
@@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
463 */ 463 */
464void __init mmap_init(void) 464void __init mmap_init(void)
465{ 465{
466 vm_region_jar = kmem_cache_create("vm_region_jar", 466 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
467 sizeof(struct vm_region), 0,
468 SLAB_PANIC, NULL);
469 vm_area_cachep = kmem_cache_create("vm_area_struct",
470 sizeof(struct vm_area_struct), 0,
471 SLAB_PANIC, NULL);
472} 467}
473 468
474/* 469/*
@@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void)
486 return; 481 return;
487 482
488 last = rb_entry(lastp, struct vm_region, vm_rb); 483 last = rb_entry(lastp, struct vm_region, vm_rb);
489 if (unlikely(last->vm_end <= last->vm_start)) 484 BUG_ON(unlikely(last->vm_end <= last->vm_start));
490 BUG(); 485 BUG_ON(unlikely(last->vm_top < last->vm_end));
491 if (unlikely(last->vm_top < last->vm_end))
492 BUG();
493 486
494 while ((p = rb_next(lastp))) { 487 while ((p = rb_next(lastp))) {
495 region = rb_entry(p, struct vm_region, vm_rb); 488 region = rb_entry(p, struct vm_region, vm_rb);
496 last = rb_entry(lastp, struct vm_region, vm_rb); 489 last = rb_entry(lastp, struct vm_region, vm_rb);
497 490
498 if (unlikely(region->vm_end <= region->vm_start)) 491 BUG_ON(unlikely(region->vm_end <= region->vm_start));
499 BUG(); 492 BUG_ON(unlikely(region->vm_top < region->vm_end));
500 if (unlikely(region->vm_top < region->vm_end)) 493 BUG_ON(unlikely(region->vm_start < last->vm_top));
501 BUG();
502 if (unlikely(region->vm_start < last->vm_top))
503 BUG();
504 494
505 lastp = p; 495 lastp = p;
506 } 496 }
507} 497}
508#else 498#else
509#define validate_nommu_regions() do {} while(0) 499static void validate_nommu_regions(void)
500{
501}
510#endif 502#endif
511 503
512/* 504/*
@@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to)
563 struct page *page = virt_to_page(from); 555 struct page *page = virt_to_page(from);
564 556
565 kdebug("- free %lx", from); 557 kdebug("- free %lx", from);
566 atomic_dec(&mmap_pages_allocated); 558 atomic_long_dec(&mmap_pages_allocated);
567 if (page_count(page) != 1) 559 if (page_count(page) != 1)
568 kdebug("free page %p [%d]", page, page_count(page)); 560 kdebug("free page %p: refcount not one: %d",
561 page, page_count(page));
569 put_page(page); 562 put_page(page);
570 } 563 }
571} 564}
572 565
573/* 566/*
574 * release a reference to a region 567 * release a reference to a region
575 * - the caller must hold the region semaphore, which this releases 568 * - the caller must hold the region semaphore for writing, which this releases
576 * - the region may not have been added to the tree yet, in which case vm_top 569 * - the region may not have been added to the tree yet, in which case vm_top
577 * will equal vm_start 570 * will equal vm_start
578 */ 571 */
@@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1096 goto enomem; 1089 goto enomem;
1097 1090
1098 total = 1 << order; 1091 total = 1 << order;
1099 atomic_add(total, &mmap_pages_allocated); 1092 atomic_long_add(total, &mmap_pages_allocated);
1100 1093
1101 point = rlen >> PAGE_SHIFT; 1094 point = rlen >> PAGE_SHIFT;
1102 1095
@@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
1107 order = ilog2(total - point); 1100 order = ilog2(total - point);
1108 n = 1 << order; 1101 n = 1 << order;
1109 kdebug("shave %lu/%lu @%lu", n, total - point, total); 1102 kdebug("shave %lu/%lu @%lu", n, total - point, total);
1110 atomic_sub(n, &mmap_pages_allocated); 1103 atomic_long_sub(n, &mmap_pages_allocated);
1111 total -= n; 1104 total -= n;
1112 set_page_refcounted(pages + total); 1105 set_page_refcounted(pages + total);
1113 __free_pages(pages + total, order); 1106 __free_pages(pages + total, order);
@@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
1536 /* find the first potentially overlapping VMA */ 1529 /* find the first potentially overlapping VMA */
1537 vma = find_vma(mm, start); 1530 vma = find_vma(mm, start);
1538 if (!vma) { 1531 if (!vma) {
1539 printk(KERN_WARNING 1532 static int limit = 0;
1540 "munmap of memory not mmapped by process %d (%s):" 1533 if (limit < 5) {
1541 " 0x%lx-0x%lx\n", 1534 printk(KERN_WARNING
1542 current->pid, current->comm, start, start + len - 1); 1535 "munmap of memory not mmapped by process %d"
1536 " (%s): 0x%lx-0x%lx\n",
1537 current->pid, current->comm,
1538 start, start + len - 1);
1539 limit++;
1540 }
1543 return -EINVAL; 1541 return -EINVAL;
1544 } 1542 }
1545 1543
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3b9bac085b5..2f3166e308d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -394,6 +394,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
394 cpuset_print_task_mems_allowed(current); 394 cpuset_print_task_mems_allowed(current);
395 task_unlock(current); 395 task_unlock(current);
396 dump_stack(); 396 dump_stack();
397 mem_cgroup_print_oom_info(mem, current);
397 show_mem(); 398 show_mem();
398 if (sysctl_oom_dump_tasks) 399 if (sysctl_oom_dump_tasks)
399 dump_tasks(mem); 400 dump_tasks(mem);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0284e528748d..e2f26991fff1 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -331,7 +331,7 @@ static int destroy_compound_page(struct page *page, unsigned long order)
331 for (i = 1; i < nr_pages; i++) { 331 for (i = 1; i < nr_pages; i++) {
332 struct page *p = page + i; 332 struct page *p = page + i;
333 333
334 if (unlikely(!PageTail(p) | (p->first_page != page))) { 334 if (unlikely(!PageTail(p) || (p->first_page != page))) {
335 bad_page(page); 335 bad_page(page);
336 bad++; 336 bad++;
337 } 337 }
@@ -2128,7 +2128,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2128 int n, val; 2128 int n, val;
2129 int min_val = INT_MAX; 2129 int min_val = INT_MAX;
2130 int best_node = -1; 2130 int best_node = -1;
2131 node_to_cpumask_ptr(tmp, 0); 2131 const struct cpumask *tmp = cpumask_of_node(0);
2132 2132
2133 /* Use the local node if we haven't already */ 2133 /* Use the local node if we haven't already */
2134 if (!node_isset(node, *used_node_mask)) { 2134 if (!node_isset(node, *used_node_mask)) {
@@ -2149,8 +2149,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask)
2149 val += (n < node); 2149 val += (n < node);
2150 2150
2151 /* Give preference to headless and unused nodes */ 2151 /* Give preference to headless and unused nodes */
2152 node_to_cpumask_ptr_next(tmp, n); 2152 tmp = cpumask_of_node(n);
2153 if (!cpus_empty(*tmp)) 2153 if (!cpumask_empty(tmp))
2154 val += PENALTY_FOR_NODE_WITH_CPUS; 2154 val += PENALTY_FOR_NODE_WITH_CPUS;
2155 2155
2156 /* Slight preference for less loaded node */ 2156 /* Slight preference for less loaded node */
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index ceecfbb143fa..791905c991df 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -285,12 +285,8 @@ struct swap_cgroup_ctrl {
285 285
286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; 286struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES];
287 287
288/*
289 * This 8bytes seems big..maybe we can reduce this when we can use "id" for
290 * cgroup rather than pointer.
291 */
292struct swap_cgroup { 288struct swap_cgroup {
293 struct mem_cgroup *val; 289 unsigned short id;
294}; 290};
295#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) 291#define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup))
296#define SC_POS_MASK (SC_PER_PAGE - 1) 292#define SC_POS_MASK (SC_PER_PAGE - 1)
@@ -342,10 +338,10 @@ not_enough_page:
342 * @ent: swap entry to be recorded into 338 * @ent: swap entry to be recorded into
343 * @mem: mem_cgroup to be recorded 339 * @mem: mem_cgroup to be recorded
344 * 340 *
345 * Returns old value at success, NULL at failure. 341 * Returns old value at success, 0 at failure.
346 * (Of course, old value can be NULL.) 342 * (Of course, old value can be 0.)
347 */ 343 */
348struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem) 344unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id)
349{ 345{
350 int type = swp_type(ent); 346 int type = swp_type(ent);
351 unsigned long offset = swp_offset(ent); 347 unsigned long offset = swp_offset(ent);
@@ -354,18 +350,18 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
354 struct swap_cgroup_ctrl *ctrl; 350 struct swap_cgroup_ctrl *ctrl;
355 struct page *mappage; 351 struct page *mappage;
356 struct swap_cgroup *sc; 352 struct swap_cgroup *sc;
357 struct mem_cgroup *old; 353 unsigned short old;
358 354
359 if (!do_swap_account) 355 if (!do_swap_account)
360 return NULL; 356 return 0;
361 357
362 ctrl = &swap_cgroup_ctrl[type]; 358 ctrl = &swap_cgroup_ctrl[type];
363 359
364 mappage = ctrl->map[idx]; 360 mappage = ctrl->map[idx];
365 sc = page_address(mappage); 361 sc = page_address(mappage);
366 sc += pos; 362 sc += pos;
367 old = sc->val; 363 old = sc->id;
368 sc->val = mem; 364 sc->id = id;
369 365
370 return old; 366 return old;
371} 367}
@@ -374,9 +370,9 @@ struct mem_cgroup *swap_cgroup_record(swp_entry_t ent, struct mem_cgroup *mem)
374 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry 370 * lookup_swap_cgroup - lookup mem_cgroup tied to swap entry
375 * @ent: swap entry to be looked up. 371 * @ent: swap entry to be looked up.
376 * 372 *
377 * Returns pointer to mem_cgroup at success. NULL at failure. 373 * Returns CSS ID of mem_cgroup at success. 0 at failure. (0 is invalid ID)
378 */ 374 */
379struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent) 375unsigned short lookup_swap_cgroup(swp_entry_t ent)
380{ 376{
381 int type = swp_type(ent); 377 int type = swp_type(ent);
382 unsigned long offset = swp_offset(ent); 378 unsigned long offset = swp_offset(ent);
@@ -385,16 +381,16 @@ struct mem_cgroup *lookup_swap_cgroup(swp_entry_t ent)
385 struct swap_cgroup_ctrl *ctrl; 381 struct swap_cgroup_ctrl *ctrl;
386 struct page *mappage; 382 struct page *mappage;
387 struct swap_cgroup *sc; 383 struct swap_cgroup *sc;
388 struct mem_cgroup *ret; 384 unsigned short ret;
389 385
390 if (!do_swap_account) 386 if (!do_swap_account)
391 return NULL; 387 return 0;
392 388
393 ctrl = &swap_cgroup_ctrl[type]; 389 ctrl = &swap_cgroup_ctrl[type];
394 mappage = ctrl->map[idx]; 390 mappage = ctrl->map[idx];
395 sc = page_address(mappage); 391 sc = page_address(mappage);
396 sc += pos; 392 sc += pos;
397 ret = sc->val; 393 ret = sc->id;
398 return ret; 394 return ret;
399} 395}
400 396
@@ -430,13 +426,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
430 } 426 }
431 mutex_unlock(&swap_cgroup_mutex); 427 mutex_unlock(&swap_cgroup_mutex);
432 428
433 printk(KERN_INFO
434 "swap_cgroup: uses %ld bytes of vmalloc for pointer array space"
435 " and %ld bytes to hold mem_cgroup pointers on swap\n",
436 array_size, length * PAGE_SIZE);
437 printk(KERN_INFO
438 "swap_cgroup can be disabled by noswapaccount boot option.\n");
439
440 return 0; 429 return 0;
441nomem: 430nomem:
442 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); 431 printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n");
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 118905e3d788..f2caf96993f8 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -58,6 +58,14 @@ static DEFINE_SPINLOCK(pdflush_lock);
58int nr_pdflush_threads = 0; 58int nr_pdflush_threads = 0;
59 59
60/* 60/*
61 * The max/min number of pdflush threads. R/W by sysctl at
62 * /proc/sys/vm/nr_pdflush_threads_max/min
63 */
64int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
65int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
66
67
68/*
61 * The time at which the pdflush thread pool last went empty 69 * The time at which the pdflush thread pool last went empty
62 */ 70 */
63static unsigned long last_empty_jifs; 71static unsigned long last_empty_jifs;
@@ -68,7 +76,7 @@ static unsigned long last_empty_jifs;
68 * Thread pool management algorithm: 76 * Thread pool management algorithm:
69 * 77 *
70 * - The minimum and maximum number of pdflush instances are bound 78 * - The minimum and maximum number of pdflush instances are bound
71 * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. 79 * by nr_pdflush_threads_min and nr_pdflush_threads_max.
72 * 80 *
73 * - If there have been no idle pdflush instances for 1 second, create 81 * - If there have been no idle pdflush instances for 1 second, create
74 * a new one. 82 * a new one.
@@ -98,7 +106,6 @@ static int __pdflush(struct pdflush_work *my_work)
98 INIT_LIST_HEAD(&my_work->list); 106 INIT_LIST_HEAD(&my_work->list);
99 107
100 spin_lock_irq(&pdflush_lock); 108 spin_lock_irq(&pdflush_lock);
101 nr_pdflush_threads++;
102 for ( ; ; ) { 109 for ( ; ; ) {
103 struct pdflush_work *pdf; 110 struct pdflush_work *pdf;
104 111
@@ -126,20 +133,25 @@ static int __pdflush(struct pdflush_work *my_work)
126 133
127 (*my_work->fn)(my_work->arg0); 134 (*my_work->fn)(my_work->arg0);
128 135
136 spin_lock_irq(&pdflush_lock);
137
129 /* 138 /*
130 * Thread creation: For how long have there been zero 139 * Thread creation: For how long have there been zero
131 * available threads? 140 * available threads?
141 *
142 * To throttle creation, we reset last_empty_jifs.
132 */ 143 */
133 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { 144 if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
134 /* unlocked list_empty() test is OK here */ 145 if (list_empty(&pdflush_list) &&
135 if (list_empty(&pdflush_list)) { 146 nr_pdflush_threads < nr_pdflush_threads_max) {
136 /* unlocked test is OK here */ 147 last_empty_jifs = jiffies;
137 if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) 148 nr_pdflush_threads++;
138 start_one_pdflush_thread(); 149 spin_unlock_irq(&pdflush_lock);
150 start_one_pdflush_thread();
151 spin_lock_irq(&pdflush_lock);
139 } 152 }
140 } 153 }
141 154
142 spin_lock_irq(&pdflush_lock);
143 my_work->fn = NULL; 155 my_work->fn = NULL;
144 156
145 /* 157 /*
@@ -148,7 +160,7 @@ static int __pdflush(struct pdflush_work *my_work)
148 */ 160 */
149 if (list_empty(&pdflush_list)) 161 if (list_empty(&pdflush_list))
150 continue; 162 continue;
151 if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) 163 if (nr_pdflush_threads <= nr_pdflush_threads_min)
152 continue; 164 continue;
153 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); 165 pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
154 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { 166 if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -236,14 +248,27 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
236 248
237static void start_one_pdflush_thread(void) 249static void start_one_pdflush_thread(void)
238{ 250{
239 kthread_run(pdflush, NULL, "pdflush"); 251 struct task_struct *k;
252
253 k = kthread_run(pdflush, NULL, "pdflush");
254 if (unlikely(IS_ERR(k))) {
255 spin_lock_irq(&pdflush_lock);
256 nr_pdflush_threads--;
257 spin_unlock_irq(&pdflush_lock);
258 }
240} 259}
241 260
242static int __init pdflush_init(void) 261static int __init pdflush_init(void)
243{ 262{
244 int i; 263 int i;
245 264
246 for (i = 0; i < MIN_PDFLUSH_THREADS; i++) 265 /*
266 * Pre-set nr_pdflush_threads... If we fail to create,
267 * the count will be decremented.
268 */
269 nr_pdflush_threads = nr_pdflush_threads_min;
270
271 for (i = 0; i < nr_pdflush_threads_min; i++)
247 start_one_pdflush_thread(); 272 start_one_pdflush_thread();
248 return 0; 273 return 0;
249} 274}
diff --git a/mm/quicklist.c b/mm/quicklist.c
index 8dbb6805ef35..e66d07d1b4ff 100644
--- a/mm/quicklist.c
+++ b/mm/quicklist.c
@@ -29,7 +29,7 @@ static unsigned long max_pages(unsigned long min_pages)
29 int node = numa_node_id(); 29 int node = numa_node_id();
30 struct zone *zones = NODE_DATA(node)->node_zones; 30 struct zone *zones = NODE_DATA(node)->node_zones;
31 int num_cpus_on_node; 31 int num_cpus_on_node;
32 node_to_cpumask_ptr(cpumask_on_node, node); 32 const struct cpumask *cpumask_on_node = cpumask_of_node(node);
33 33
34 node_free_pages = 34 node_free_pages =
35#ifdef CONFIG_ZONE_DMA 35#ifdef CONFIG_ZONE_DMA
diff --git a/mm/readahead.c b/mm/readahead.c
index 9ce303d4b810..133b6d525513 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -31,6 +31,42 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
31 31
32#define list_to_page(head) (list_entry((head)->prev, struct page, lru)) 32#define list_to_page(head) (list_entry((head)->prev, struct page, lru))
33 33
34/*
35 * see if a page needs releasing upon read_cache_pages() failure
36 * - the caller of read_cache_pages() may have set PG_private or PG_fscache
37 * before calling, such as the NFS fs marking pages that are cached locally
38 * on disk, thus we need to give the fs a chance to clean up in the event of
39 * an error
40 */
41static void read_cache_pages_invalidate_page(struct address_space *mapping,
42 struct page *page)
43{
44 if (page_has_private(page)) {
45 if (!trylock_page(page))
46 BUG();
47 page->mapping = mapping;
48 do_invalidatepage(page, 0);
49 page->mapping = NULL;
50 unlock_page(page);
51 }
52 page_cache_release(page);
53}
54
55/*
56 * release a list of pages, invalidating them first if need be
57 */
58static void read_cache_pages_invalidate_pages(struct address_space *mapping,
59 struct list_head *pages)
60{
61 struct page *victim;
62
63 while (!list_empty(pages)) {
64 victim = list_to_page(pages);
65 list_del(&victim->lru);
66 read_cache_pages_invalidate_page(mapping, victim);
67 }
68}
69
34/** 70/**
35 * read_cache_pages - populate an address space with some pages & start reads against them 71 * read_cache_pages - populate an address space with some pages & start reads against them
36 * @mapping: the address_space 72 * @mapping: the address_space
@@ -52,14 +88,14 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
52 list_del(&page->lru); 88 list_del(&page->lru);
53 if (add_to_page_cache_lru(page, mapping, 89 if (add_to_page_cache_lru(page, mapping,
54 page->index, GFP_KERNEL)) { 90 page->index, GFP_KERNEL)) {
55 page_cache_release(page); 91 read_cache_pages_invalidate_page(mapping, page);
56 continue; 92 continue;
57 } 93 }
58 page_cache_release(page); 94 page_cache_release(page);
59 95
60 ret = filler(data, page); 96 ret = filler(data, page);
61 if (unlikely(ret)) { 97 if (unlikely(ret)) {
62 put_pages_list(pages); 98 read_cache_pages_invalidate_pages(mapping, pages);
63 break; 99 break;
64 } 100 }
65 task_io_account_read(PAGE_CACHE_SIZE); 101 task_io_account_read(PAGE_CACHE_SIZE);
diff --git a/mm/slab.c b/mm/slab.c
index 825c606f691d..9a90b00d2f91 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,6 +102,7 @@
102#include <linux/cpu.h> 102#include <linux/cpu.h>
103#include <linux/sysctl.h> 103#include <linux/sysctl.h>
104#include <linux/module.h> 104#include <linux/module.h>
105#include <trace/kmemtrace.h>
105#include <linux/rcupdate.h> 106#include <linux/rcupdate.h>
106#include <linux/string.h> 107#include <linux/string.h>
107#include <linux/uaccess.h> 108#include <linux/uaccess.h>
@@ -568,6 +569,14 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp)
568 569
569#endif 570#endif
570 571
572#ifdef CONFIG_KMEMTRACE
573size_t slab_buffer_size(struct kmem_cache *cachep)
574{
575 return cachep->buffer_size;
576}
577EXPORT_SYMBOL(slab_buffer_size);
578#endif
579
571/* 580/*
572 * Do not go above this order unless 0 objects fit into the slab. 581 * Do not go above this order unless 0 objects fit into the slab.
573 */ 582 */
@@ -1160,7 +1169,7 @@ static void __cpuinit cpuup_canceled(long cpu)
1160 struct kmem_cache *cachep; 1169 struct kmem_cache *cachep;
1161 struct kmem_list3 *l3 = NULL; 1170 struct kmem_list3 *l3 = NULL;
1162 int node = cpu_to_node(cpu); 1171 int node = cpu_to_node(cpu);
1163 node_to_cpumask_ptr(mask, node); 1172 const struct cpumask *mask = cpumask_of_node(node);
1164 1173
1165 list_for_each_entry(cachep, &cache_chain, next) { 1174 list_for_each_entry(cachep, &cache_chain, next) {
1166 struct array_cache *nc; 1175 struct array_cache *nc;
@@ -3554,10 +3563,23 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
3554 */ 3563 */
3555void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) 3564void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
3556{ 3565{
3557 return __cache_alloc(cachep, flags, __builtin_return_address(0)); 3566 void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
3567
3568 trace_kmem_cache_alloc(_RET_IP_, ret,
3569 obj_size(cachep), cachep->buffer_size, flags);
3570
3571 return ret;
3558} 3572}
3559EXPORT_SYMBOL(kmem_cache_alloc); 3573EXPORT_SYMBOL(kmem_cache_alloc);
3560 3574
3575#ifdef CONFIG_KMEMTRACE
3576void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags)
3577{
3578 return __cache_alloc(cachep, flags, __builtin_return_address(0));
3579}
3580EXPORT_SYMBOL(kmem_cache_alloc_notrace);
3581#endif
3582
3561/** 3583/**
3562 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. 3584 * kmem_ptr_validate - check if an untrusted pointer might be a slab entry.
3563 * @cachep: the cache we're checking against 3585 * @cachep: the cache we're checking against
@@ -3602,23 +3624,46 @@ out:
3602#ifdef CONFIG_NUMA 3624#ifdef CONFIG_NUMA
3603void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) 3625void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
3604{ 3626{
3605 return __cache_alloc_node(cachep, flags, nodeid, 3627 void *ret = __cache_alloc_node(cachep, flags, nodeid,
3606 __builtin_return_address(0)); 3628 __builtin_return_address(0));
3629
3630 trace_kmem_cache_alloc_node(_RET_IP_, ret,
3631 obj_size(cachep), cachep->buffer_size,
3632 flags, nodeid);
3633
3634 return ret;
3607} 3635}
3608EXPORT_SYMBOL(kmem_cache_alloc_node); 3636EXPORT_SYMBOL(kmem_cache_alloc_node);
3609 3637
3638#ifdef CONFIG_KMEMTRACE
3639void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep,
3640 gfp_t flags,
3641 int nodeid)
3642{
3643 return __cache_alloc_node(cachep, flags, nodeid,
3644 __builtin_return_address(0));
3645}
3646EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
3647#endif
3648
3610static __always_inline void * 3649static __always_inline void *
3611__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) 3650__do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
3612{ 3651{
3613 struct kmem_cache *cachep; 3652 struct kmem_cache *cachep;
3653 void *ret;
3614 3654
3615 cachep = kmem_find_general_cachep(size, flags); 3655 cachep = kmem_find_general_cachep(size, flags);
3616 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3656 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3617 return cachep; 3657 return cachep;
3618 return kmem_cache_alloc_node(cachep, flags, node); 3658 ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
3659
3660 trace_kmalloc_node((unsigned long) caller, ret,
3661 size, cachep->buffer_size, flags, node);
3662
3663 return ret;
3619} 3664}
3620 3665
3621#ifdef CONFIG_DEBUG_SLAB 3666#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3622void *__kmalloc_node(size_t size, gfp_t flags, int node) 3667void *__kmalloc_node(size_t size, gfp_t flags, int node)
3623{ 3668{
3624 return __do_kmalloc_node(size, flags, node, 3669 return __do_kmalloc_node(size, flags, node,
@@ -3651,6 +3696,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3651 void *caller) 3696 void *caller)
3652{ 3697{
3653 struct kmem_cache *cachep; 3698 struct kmem_cache *cachep;
3699 void *ret;
3654 3700
3655 /* If you want to save a few bytes .text space: replace 3701 /* If you want to save a few bytes .text space: replace
3656 * __ with kmem_. 3702 * __ with kmem_.
@@ -3660,11 +3706,16 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
3660 cachep = __find_general_cachep(size, flags); 3706 cachep = __find_general_cachep(size, flags);
3661 if (unlikely(ZERO_OR_NULL_PTR(cachep))) 3707 if (unlikely(ZERO_OR_NULL_PTR(cachep)))
3662 return cachep; 3708 return cachep;
3663 return __cache_alloc(cachep, flags, caller); 3709 ret = __cache_alloc(cachep, flags, caller);
3710
3711 trace_kmalloc((unsigned long) caller, ret,
3712 size, cachep->buffer_size, flags);
3713
3714 return ret;
3664} 3715}
3665 3716
3666 3717
3667#ifdef CONFIG_DEBUG_SLAB 3718#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE)
3668void *__kmalloc(size_t size, gfp_t flags) 3719void *__kmalloc(size_t size, gfp_t flags)
3669{ 3720{
3670 return __do_kmalloc(size, flags, __builtin_return_address(0)); 3721 return __do_kmalloc(size, flags, __builtin_return_address(0));
@@ -3703,6 +3754,8 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
3703 debug_check_no_obj_freed(objp, obj_size(cachep)); 3754 debug_check_no_obj_freed(objp, obj_size(cachep));
3704 __cache_free(cachep, objp); 3755 __cache_free(cachep, objp);
3705 local_irq_restore(flags); 3756 local_irq_restore(flags);
3757
3758 trace_kmem_cache_free(_RET_IP_, objp);
3706} 3759}
3707EXPORT_SYMBOL(kmem_cache_free); 3760EXPORT_SYMBOL(kmem_cache_free);
3708 3761
@@ -3720,6 +3773,8 @@ void kfree(const void *objp)
3720 struct kmem_cache *c; 3773 struct kmem_cache *c;
3721 unsigned long flags; 3774 unsigned long flags;
3722 3775
3776 trace_kfree(_RET_IP_, objp);
3777
3723 if (unlikely(ZERO_OR_NULL_PTR(objp))) 3778 if (unlikely(ZERO_OR_NULL_PTR(objp)))
3724 return; 3779 return;
3725 local_irq_save(flags); 3780 local_irq_save(flags);
@@ -3992,8 +4047,7 @@ static void cache_reap(struct work_struct *w)
3992 struct kmem_cache *searchp; 4047 struct kmem_cache *searchp;
3993 struct kmem_list3 *l3; 4048 struct kmem_list3 *l3;
3994 int node = numa_node_id(); 4049 int node = numa_node_id();
3995 struct delayed_work *work = 4050 struct delayed_work *work = to_delayed_work(w);
3996 container_of(w, struct delayed_work, work);
3997 4051
3998 if (!mutex_trylock(&cache_chain_mutex)) 4052 if (!mutex_trylock(&cache_chain_mutex))
3999 /* Give up. Setup the next iteration. */ 4053 /* Give up. Setup the next iteration. */
diff --git a/mm/slob.c b/mm/slob.c
index 7a3411524dac..a2d4ab32198d 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,6 +65,7 @@
65#include <linux/module.h> 65#include <linux/module.h>
66#include <linux/rcupdate.h> 66#include <linux/rcupdate.h>
67#include <linux/list.h> 67#include <linux/list.h>
68#include <trace/kmemtrace.h>
68#include <asm/atomic.h> 69#include <asm/atomic.h>
69 70
70/* 71/*
@@ -474,6 +475,7 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
474{ 475{
475 unsigned int *m; 476 unsigned int *m;
476 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN); 477 int align = max(ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
478 void *ret;
477 479
478 lockdep_trace_alloc(gfp); 480 lockdep_trace_alloc(gfp);
479 481
@@ -482,12 +484,16 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
482 return ZERO_SIZE_PTR; 484 return ZERO_SIZE_PTR;
483 485
484 m = slob_alloc(size + align, gfp, align, node); 486 m = slob_alloc(size + align, gfp, align, node);
487
485 if (!m) 488 if (!m)
486 return NULL; 489 return NULL;
487 *m = size; 490 *m = size;
488 return (void *)m + align; 491 ret = (void *)m + align;
492
493 trace_kmalloc_node(_RET_IP_, ret,
494 size, size + align, gfp, node);
489 } else { 495 } else {
490 void *ret; 496 unsigned int order = get_order(size);
491 497
492 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); 498 ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
493 if (ret) { 499 if (ret) {
@@ -495,8 +501,12 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
495 page = virt_to_page(ret); 501 page = virt_to_page(ret);
496 page->private = size; 502 page->private = size;
497 } 503 }
498 return ret; 504
505 trace_kmalloc_node(_RET_IP_, ret,
506 size, PAGE_SIZE << order, gfp, node);
499 } 507 }
508
509 return ret;
500} 510}
501EXPORT_SYMBOL(__kmalloc_node); 511EXPORT_SYMBOL(__kmalloc_node);
502 512
@@ -504,6 +514,8 @@ void kfree(const void *block)
504{ 514{
505 struct slob_page *sp; 515 struct slob_page *sp;
506 516
517 trace_kfree(_RET_IP_, block);
518
507 if (unlikely(ZERO_OR_NULL_PTR(block))) 519 if (unlikely(ZERO_OR_NULL_PTR(block)))
508 return; 520 return;
509 521
@@ -583,10 +595,17 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
583{ 595{
584 void *b; 596 void *b;
585 597
586 if (c->size < PAGE_SIZE) 598 if (c->size < PAGE_SIZE) {
587 b = slob_alloc(c->size, flags, c->align, node); 599 b = slob_alloc(c->size, flags, c->align, node);
588 else 600 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
601 SLOB_UNITS(c->size) * SLOB_UNIT,
602 flags, node);
603 } else {
589 b = slob_new_pages(flags, get_order(c->size), node); 604 b = slob_new_pages(flags, get_order(c->size), node);
605 trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
606 PAGE_SIZE << get_order(c->size),
607 flags, node);
608 }
590 609
591 if (c->ctor) 610 if (c->ctor)
592 c->ctor(b); 611 c->ctor(b);
@@ -622,6 +641,8 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
622 } else { 641 } else {
623 __kmem_cache_free(b, c->size); 642 __kmem_cache_free(b, c->size);
624 } 643 }
644
645 trace_kmem_cache_free(_RET_IP_, b);
625} 646}
626EXPORT_SYMBOL(kmem_cache_free); 647EXPORT_SYMBOL(kmem_cache_free);
627 648
diff --git a/mm/slub.c b/mm/slub.c
index c4ea9158c9fb..7ab54ecbd3f3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,6 +16,7 @@
16#include <linux/slab.h> 16#include <linux/slab.h>
17#include <linux/proc_fs.h> 17#include <linux/proc_fs.h>
18#include <linux/seq_file.h> 18#include <linux/seq_file.h>
19#include <trace/kmemtrace.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
20#include <linux/cpuset.h> 21#include <linux/cpuset.h>
21#include <linux/mempolicy.h> 22#include <linux/mempolicy.h>
@@ -1618,18 +1619,45 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
1618 1619
1619void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) 1620void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
1620{ 1621{
1621 return slab_alloc(s, gfpflags, -1, _RET_IP_); 1622 void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
1623
1624 trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
1625
1626 return ret;
1622} 1627}
1623EXPORT_SYMBOL(kmem_cache_alloc); 1628EXPORT_SYMBOL(kmem_cache_alloc);
1624 1629
1630#ifdef CONFIG_KMEMTRACE
1631void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags)
1632{
1633 return slab_alloc(s, gfpflags, -1, _RET_IP_);
1634}
1635EXPORT_SYMBOL(kmem_cache_alloc_notrace);
1636#endif
1637
1625#ifdef CONFIG_NUMA 1638#ifdef CONFIG_NUMA
1626void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) 1639void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
1627{ 1640{
1628 return slab_alloc(s, gfpflags, node, _RET_IP_); 1641 void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
1642
1643 trace_kmem_cache_alloc_node(_RET_IP_, ret,
1644 s->objsize, s->size, gfpflags, node);
1645
1646 return ret;
1629} 1647}
1630EXPORT_SYMBOL(kmem_cache_alloc_node); 1648EXPORT_SYMBOL(kmem_cache_alloc_node);
1631#endif 1649#endif
1632 1650
1651#ifdef CONFIG_KMEMTRACE
1652void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
1653 gfp_t gfpflags,
1654 int node)
1655{
1656 return slab_alloc(s, gfpflags, node, _RET_IP_);
1657}
1658EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
1659#endif
1660
1633/* 1661/*
1634 * Slow patch handling. This may still be called frequently since objects 1662 * Slow patch handling. This may still be called frequently since objects
1635 * have a longer lifetime than the cpu slabs in most processing loads. 1663 * have a longer lifetime than the cpu slabs in most processing loads.
@@ -1737,6 +1765,8 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
1737 page = virt_to_head_page(x); 1765 page = virt_to_head_page(x);
1738 1766
1739 slab_free(s, page, x, _RET_IP_); 1767 slab_free(s, page, x, _RET_IP_);
1768
1769 trace_kmem_cache_free(_RET_IP_, x);
1740} 1770}
1741EXPORT_SYMBOL(kmem_cache_free); 1771EXPORT_SYMBOL(kmem_cache_free);
1742 1772
@@ -2659,6 +2689,7 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
2659void *__kmalloc(size_t size, gfp_t flags) 2689void *__kmalloc(size_t size, gfp_t flags)
2660{ 2690{
2661 struct kmem_cache *s; 2691 struct kmem_cache *s;
2692 void *ret;
2662 2693
2663 if (unlikely(size > SLUB_MAX_SIZE)) 2694 if (unlikely(size > SLUB_MAX_SIZE))
2664 return kmalloc_large(size, flags); 2695 return kmalloc_large(size, flags);
@@ -2668,7 +2699,11 @@ void *__kmalloc(size_t size, gfp_t flags)
2668 if (unlikely(ZERO_OR_NULL_PTR(s))) 2699 if (unlikely(ZERO_OR_NULL_PTR(s)))
2669 return s; 2700 return s;
2670 2701
2671 return slab_alloc(s, flags, -1, _RET_IP_); 2702 ret = slab_alloc(s, flags, -1, _RET_IP_);
2703
2704 trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
2705
2706 return ret;
2672} 2707}
2673EXPORT_SYMBOL(__kmalloc); 2708EXPORT_SYMBOL(__kmalloc);
2674 2709
@@ -2687,16 +2722,28 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
2687void *__kmalloc_node(size_t size, gfp_t flags, int node) 2722void *__kmalloc_node(size_t size, gfp_t flags, int node)
2688{ 2723{
2689 struct kmem_cache *s; 2724 struct kmem_cache *s;
2725 void *ret;
2690 2726
2691 if (unlikely(size > SLUB_MAX_SIZE)) 2727 if (unlikely(size > SLUB_MAX_SIZE)) {
2692 return kmalloc_large_node(size, flags, node); 2728 ret = kmalloc_large_node(size, flags, node);
2729
2730 trace_kmalloc_node(_RET_IP_, ret,
2731 size, PAGE_SIZE << get_order(size),
2732 flags, node);
2733
2734 return ret;
2735 }
2693 2736
2694 s = get_slab(size, flags); 2737 s = get_slab(size, flags);
2695 2738
2696 if (unlikely(ZERO_OR_NULL_PTR(s))) 2739 if (unlikely(ZERO_OR_NULL_PTR(s)))
2697 return s; 2740 return s;
2698 2741
2699 return slab_alloc(s, flags, node, _RET_IP_); 2742 ret = slab_alloc(s, flags, node, _RET_IP_);
2743
2744 trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
2745
2746 return ret;
2700} 2747}
2701EXPORT_SYMBOL(__kmalloc_node); 2748EXPORT_SYMBOL(__kmalloc_node);
2702#endif 2749#endif
@@ -2745,6 +2792,8 @@ void kfree(const void *x)
2745 struct page *page; 2792 struct page *page;
2746 void *object = (void *)x; 2793 void *object = (void *)x;
2747 2794
2795 trace_kfree(_RET_IP_, x);
2796
2748 if (unlikely(ZERO_OR_NULL_PTR(x))) 2797 if (unlikely(ZERO_OR_NULL_PTR(x)))
2749 return; 2798 return;
2750 2799
@@ -3224,6 +3273,7 @@ static struct notifier_block __cpuinitdata slab_notifier = {
3224void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) 3273void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3225{ 3274{
3226 struct kmem_cache *s; 3275 struct kmem_cache *s;
3276 void *ret;
3227 3277
3228 if (unlikely(size > SLUB_MAX_SIZE)) 3278 if (unlikely(size > SLUB_MAX_SIZE))
3229 return kmalloc_large(size, gfpflags); 3279 return kmalloc_large(size, gfpflags);
@@ -3233,13 +3283,19 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
3233 if (unlikely(ZERO_OR_NULL_PTR(s))) 3283 if (unlikely(ZERO_OR_NULL_PTR(s)))
3234 return s; 3284 return s;
3235 3285
3236 return slab_alloc(s, gfpflags, -1, caller); 3286 ret = slab_alloc(s, gfpflags, -1, caller);
3287
3288 /* Honor the call site pointer we recieved. */
3289 trace_kmalloc(caller, ret, size, s->size, gfpflags);
3290
3291 return ret;
3237} 3292}
3238 3293
3239void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, 3294void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3240 int node, unsigned long caller) 3295 int node, unsigned long caller)
3241{ 3296{
3242 struct kmem_cache *s; 3297 struct kmem_cache *s;
3298 void *ret;
3243 3299
3244 if (unlikely(size > SLUB_MAX_SIZE)) 3300 if (unlikely(size > SLUB_MAX_SIZE))
3245 return kmalloc_large_node(size, gfpflags, node); 3301 return kmalloc_large_node(size, gfpflags, node);
@@ -3249,7 +3305,12 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
3249 if (unlikely(ZERO_OR_NULL_PTR(s))) 3305 if (unlikely(ZERO_OR_NULL_PTR(s)))
3250 return s; 3306 return s;
3251 3307
3252 return slab_alloc(s, gfpflags, node, caller); 3308 ret = slab_alloc(s, gfpflags, node, caller);
3309
3310 /* Honor the call site pointer we recieved. */
3311 trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
3312
3313 return ret;
3253} 3314}
3254 3315
3255#ifdef CONFIG_SLUB_DEBUG 3316#ifdef CONFIG_SLUB_DEBUG
diff --git a/mm/swap.c b/mm/swap.c
index 6e83084c1f6c..bede23ce64ea 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -448,8 +448,8 @@ void pagevec_strip(struct pagevec *pvec)
448 for (i = 0; i < pagevec_count(pvec); i++) { 448 for (i = 0; i < pagevec_count(pvec); i++) {
449 struct page *page = pvec->pages[i]; 449 struct page *page = pvec->pages[i];
450 450
451 if (PagePrivate(page) && trylock_page(page)) { 451 if (page_has_private(page) && trylock_page(page)) {
452 if (PagePrivate(page)) 452 if (page_has_private(page))
453 try_to_release_page(page, 0); 453 try_to_release_page(page, 0);
454 unlock_page(page); 454 unlock_page(page);
455 } 455 }
diff --git a/mm/truncate.c b/mm/truncate.c
index 1229211104f8..55206fab7b99 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -50,7 +50,7 @@ void do_invalidatepage(struct page *page, unsigned long offset)
50static inline void truncate_partial_page(struct page *page, unsigned partial) 50static inline void truncate_partial_page(struct page *page, unsigned partial)
51{ 51{
52 zero_user_segment(page, partial, PAGE_CACHE_SIZE); 52 zero_user_segment(page, partial, PAGE_CACHE_SIZE);
53 if (PagePrivate(page)) 53 if (page_has_private(page))
54 do_invalidatepage(page, partial); 54 do_invalidatepage(page, partial);
55} 55}
56 56
@@ -99,7 +99,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page)
99 if (page->mapping != mapping) 99 if (page->mapping != mapping)
100 return; 100 return;
101 101
102 if (PagePrivate(page)) 102 if (page_has_private(page))
103 do_invalidatepage(page, 0); 103 do_invalidatepage(page, 0);
104 104
105 cancel_dirty_page(page, PAGE_CACHE_SIZE); 105 cancel_dirty_page(page, PAGE_CACHE_SIZE);
@@ -126,7 +126,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page)
126 if (page->mapping != mapping) 126 if (page->mapping != mapping)
127 return 0; 127 return 0;
128 128
129 if (PagePrivate(page) && !try_to_release_page(page, 0)) 129 if (page_has_private(page) && !try_to_release_page(page, 0))
130 return 0; 130 return 0;
131 131
132 clear_page_mlock(page); 132 clear_page_mlock(page);
@@ -348,7 +348,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
348 if (page->mapping != mapping) 348 if (page->mapping != mapping)
349 return 0; 349 return 0;
350 350
351 if (PagePrivate(page) && !try_to_release_page(page, GFP_KERNEL)) 351 if (page_has_private(page) && !try_to_release_page(page, GFP_KERNEL))
352 return 0; 352 return 0;
353 353
354 spin_lock_irq(&mapping->tree_lock); 354 spin_lock_irq(&mapping->tree_lock);
@@ -356,7 +356,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
356 goto failed; 356 goto failed;
357 357
358 clear_page_mlock(page); 358 clear_page_mlock(page);
359 BUG_ON(PagePrivate(page)); 359 BUG_ON(page_has_private(page));
360 __remove_from_page_cache(page); 360 __remove_from_page_cache(page);
361 spin_unlock_irq(&mapping->tree_lock); 361 spin_unlock_irq(&mapping->tree_lock);
362 page_cache_release(page); /* pagecache ref */ 362 page_cache_release(page); /* pagecache ref */
diff --git a/mm/util.c b/mm/util.c
index 7c122e49f769..2599e83eea17 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
4#include <linux/module.h> 4#include <linux/module.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/tracepoint.h>
7#include <asm/uaccess.h> 8#include <asm/uaccess.h>
8 9
9/** 10/**
@@ -236,3 +237,18 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
236 return ret; 237 return ret;
237} 238}
238EXPORT_SYMBOL_GPL(get_user_pages_fast); 239EXPORT_SYMBOL_GPL(get_user_pages_fast);
240
241/* Tracepoints definitions. */
242DEFINE_TRACE(kmalloc);
243DEFINE_TRACE(kmem_cache_alloc);
244DEFINE_TRACE(kmalloc_node);
245DEFINE_TRACE(kmem_cache_alloc_node);
246DEFINE_TRACE(kfree);
247DEFINE_TRACE(kmem_cache_free);
248
249EXPORT_TRACEPOINT_SYMBOL(kmalloc);
250EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
251EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
252EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
253EXPORT_TRACEPOINT_SYMBOL(kfree);
254EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 06e72693b458..39fdfb14eeaa 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -283,7 +283,7 @@ static inline int page_mapping_inuse(struct page *page)
283 283
284static inline int is_page_cache_freeable(struct page *page) 284static inline int is_page_cache_freeable(struct page *page)
285{ 285{
286 return page_count(page) - !!PagePrivate(page) == 2; 286 return page_count(page) - !!page_has_private(page) == 2;
287} 287}
288 288
289static int may_write_to_queue(struct backing_dev_info *bdi) 289static int may_write_to_queue(struct backing_dev_info *bdi)
@@ -367,7 +367,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
367 * Some data journaling orphaned pages can have 367 * Some data journaling orphaned pages can have
368 * page->mapping == NULL while being dirty with clean buffers. 368 * page->mapping == NULL while being dirty with clean buffers.
369 */ 369 */
370 if (PagePrivate(page)) { 370 if (page_has_private(page)) {
371 if (try_to_free_buffers(page)) { 371 if (try_to_free_buffers(page)) {
372 ClearPageDirty(page); 372 ClearPageDirty(page);
373 printk("%s: orphaned page\n", __func__); 373 printk("%s: orphaned page\n", __func__);
@@ -727,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
727 * process address space (page_count == 1) it can be freed. 727 * process address space (page_count == 1) it can be freed.
728 * Otherwise, leave the page on the LRU so it is swappable. 728 * Otherwise, leave the page on the LRU so it is swappable.
729 */ 729 */
730 if (PagePrivate(page)) { 730 if (page_has_private(page)) {
731 if (!try_to_release_page(page, sc->gfp_mask)) 731 if (!try_to_release_page(page, sc->gfp_mask))
732 goto activate_locked; 732 goto activate_locked;
733 if (!mapping && page_count(page) == 1) { 733 if (!mapping && page_count(page) == 1) {
@@ -1967,7 +1967,7 @@ static int kswapd(void *p)
1967 struct reclaim_state reclaim_state = { 1967 struct reclaim_state reclaim_state = {
1968 .reclaimed_slab = 0, 1968 .reclaimed_slab = 0,
1969 }; 1969 };
1970 node_to_cpumask_ptr(cpumask, pgdat->node_id); 1970 const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
1971 1971
1972 lockdep_set_current_reclaim_state(GFP_KERNEL); 1972 lockdep_set_current_reclaim_state(GFP_KERNEL);
1973 1973
@@ -2204,7 +2204,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
2204 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { 2204 if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
2205 for_each_node_state(nid, N_HIGH_MEMORY) { 2205 for_each_node_state(nid, N_HIGH_MEMORY) {
2206 pg_data_t *pgdat = NODE_DATA(nid); 2206 pg_data_t *pgdat = NODE_DATA(nid);
2207 node_to_cpumask_ptr(mask, pgdat->node_id); 2207 const struct cpumask *mask;
2208
2209 mask = cpumask_of_node(pgdat->node_id);
2208 2210
2209 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) 2211 if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids)
2210 /* One of our CPUs online: restore mask */ 2212 /* One of our CPUs online: restore mask */
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9826766f1274..66f6130976cb 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -891,7 +891,7 @@ static void vmstat_update(struct work_struct *w)
891{ 891{
892 refresh_cpu_vm_stats(smp_processor_id()); 892 refresh_cpu_vm_stats(smp_processor_id());
893 schedule_delayed_work(&__get_cpu_var(vmstat_work), 893 schedule_delayed_work(&__get_cpu_var(vmstat_work),
894 sysctl_stat_interval); 894 round_jiffies_relative(sysctl_stat_interval));
895} 895}
896 896
897static void __cpuinit start_cpu_timer(int cpu) 897static void __cpuinit start_cpu_timer(int cpu)
@@ -899,7 +899,8 @@ static void __cpuinit start_cpu_timer(int cpu)
899 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); 899 struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
900 900
901 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); 901 INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update);
902 schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu); 902 schedule_delayed_work_on(cpu, vmstat_work,
903 __round_jiffies_relative(HZ, cpu));
903} 904}
904 905
905/* 906/*