From 867578cbccb0893cc14fc29c670f7185809c90d6 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 10 Mar 2010 15:22:39 -0800 Subject: memcg: fix oom kill behavior In current page-fault code, handle_mm_fault() -> ... -> mem_cgroup_charge() -> map page or handle error. -> check return code. If page fault's return code is VM_FAULT_OOM, page_fault_out_of_memory() is called. But if it's caused by memcg, OOM should have been already invoked. Then, I added a patch: a636b327f731143ccc544b966cfd8de6cb6d72c6. That patch records last_oom_jiffies for memcg's sub-hierarchy and prevents page_fault_out_of_memory from being invoked in near future. But Nishimura-san reported that check by jiffies is not enough when the system is terribly heavy. This patch changes memcg's oom logic as. * If memcg causes OOM-kill, continue to retry. * remove jiffies check which is used now. * add memcg-oom-lock which works like perzone oom lock. * If current is killed(as a process), bypass charge. Something more sophisticated can be added but this pactch does fundamental things. TODO: - add oom notifier - add permemcg disable-oom-kill flag and freezer at oom. - more chances for wake up oom waiter (when changing memory limit etc..) Reviewed-by: Daisuke Nishimura Tested-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: David Rientjes Signed-off-by: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1f9b119f4ace..44301c6affa8 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -124,7 +124,6 @@ static inline bool mem_cgroup_disabled(void) return false; } -extern bool mem_cgroup_oom_called(struct task_struct *task); void mem_cgroup_update_file_mapped(struct page *page, int val); unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, int nid, @@ -258,11 +257,6 @@ static inline bool mem_cgroup_disabled(void) return true; } -static inline bool mem_cgroup_oom_called(struct task_struct *task) -{ - return false; -} - static inline int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) { -- cgit v1.2.2 From 8b25c6d2231b978ccce9c401e771932bde79aa9f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 24 May 2010 14:32:40 -0700 Subject: vmscan: remove isolate_pages callback scan control For now, we have global isolation vs. memory control group isolation, do not allow the reclaim entry function to set an arbitrary page isolation callback, we do not need that flexibility. And since we already pass around the group descriptor for the memory control group isolation case, just use it to decide which one of the two isolator functions to use. The decisions can be merged into nearby branches, so no extra cost there. In fact, we save the indirect calls. Signed-off-by: Johannes Weiner Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 44301c6affa8..05894795fdc1 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,13 @@ struct page_cgroup; struct page; struct mm_struct; +extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, + struct list_head *dst, + unsigned long *scanned, int order, + int mode, struct zone *z, + struct mem_cgroup *mem_cont, + int active, int file); + #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* * All "charge" functions with gfp_mask should use GFP_KERNEL or @@ -64,12 +71,6 @@ extern void mem_cgroup_uncharge_cache_page(struct page *page); extern int mem_cgroup_shmem_charge_fallback(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); -extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, - struct list_head *dst, - unsigned long *scanned, int order, - int mode, struct zone *z, - struct mem_cgroup *mem_cont, - int active, int file); extern void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem); -- cgit v1.2.2 From ac39cf8cb86c45eeac6a592ce0d58f9021a97235 Mon Sep 17 00:00:00 2001 From: "akpm@linux-foundation.org" Date: Wed, 26 May 2010 14:42:46 -0700 Subject: memcg: fix mis-accounting of file mapped racy with migration FILE_MAPPED per memcg of migrated file cache is not properly updated, because our hook in page_add_file_rmap() can't know to which memcg FILE_MAPPED should be counted. Basically, this patch is for fixing the bug but includes some big changes to fix up other messes. Now, at migrating mapped file, events happen in following sequence. 1. allocate a new page. 2. get memcg of an old page. 3. charge ageinst a new page before migration. But at this point, no changes to new page's page_cgroup, no commit for the charge. (IOW, PCG_USED bit is not set.) 4. page migration replaces radix-tree, old-page and new-page. 5. page migration remaps the new page if the old page was mapped. 6. Here, the new page is unlocked. 7. memcg commits the charge for newpage, Mark the new page's page_cgroup as PCG_USED. Because "commit" happens after page-remap, we can count FILE_MAPPED at "5", because we should avoid to trust page_cgroup->mem_cgroup. if PCG_USED bit is unset. (Note: memcg's LRU removal code does that but LRU-isolation logic is used for helping it. When we overwrite page_cgroup->mem_cgroup, page_cgroup is not on LRU or page_cgroup->mem_cgroup is NULL.) We can lose file_mapped accounting information at 5 because FILE_MAPPED is updated only when mapcount changes 0->1. So we should catch it. BTW, historically, above implemntation comes from migration-failure of anonymous page. Because we charge both of old page and new page with mapcount=0, we can't catch - the page is really freed before remap. - migration fails but it's freed before remap or .....corner cases. New migration sequence with memcg is: 1. allocate a new page. 2. mark PageCgroupMigration to the old page. 3. charge against a new page onto the old page's memcg. (here, new page's pc is marked as PageCgroupUsed.) 4. page migration replaces radix-tree, page table, etc... 5. At remapping, new page's page_cgroup is now makrked as "USED" We can catch 0->1 event and FILE_MAPPED will be properly updated. And we can catch SWAPOUT event after unlock this and freeing this page by unmap() can be caught. 7. Clear PageCgroupMigration of the old page. So, FILE_MAPPED will be correctly updated. Then, for what MIGRATION flag is ? Without it, at migration failure, we may have to charge old page again because it may be fully unmapped. "charge" means that we have to dive into memory reclaim or something complated. So, it's better to avoid charge it again. Before this patch, __commit_charge() was working for both of the old/new page and fixed up all. But this technique has some racy condtion around FILE_MAPPED and SWAPOUT etc... Now, the kernel use MIGRATION flag and don't uncharge old page until the end of migration. I hope this change will make memcg's page migration much simpler. This page migration has caused several troubles. Worth to add a flag for simplification. Reviewed-by: Daisuke Nishimura Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Signed-off-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Christoph Lameter Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 05894795fdc1..9411d32840b0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -90,7 +90,8 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem); extern int -mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr); +mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, struct page *oldpage, struct page *newpage); @@ -227,7 +228,8 @@ static inline struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) } static inline int -mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +mem_cgroup_prepare_migration(struct page *page, struct page *newpage, + struct mem_cgroup **ptr) { return 0; } -- cgit v1.2.2 From 25edde0332916ae706ccf83de688be57bcc844b7 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Mon, 9 Aug 2010 17:19:27 -0700 Subject: vmscan: kill prev_priority completely Since 2.6.28 zone->prev_priority is unused. Then it can be removed safely. It reduce stack usage slightly. Now I have to say that I'm sorry. 2 years ago, I thought prev_priority can be integrate again, it's useful. but four (or more) times trying haven't got good performance number. Thus I give up such approach. The rest of this changelog is notes on prev_priority and why it existed in the first place and why it might be not necessary any more. This information is based heavily on discussions between Andrew Morton, Rik van Riel and Kosaki Motohiro who is heavily quotes from. Historically prev_priority was important because it determined when the VM would start unmapping PTE pages. i.e. there are no balances of note within the VM, Anon vs File and Mapped vs Unmapped. Without prev_priority, there is a potential risk of unnecessarily increasing minor faults as a large amount of read activity of use-once pages could push mapped pages to the end of the LRU and get unmapped. There is no proof this is still a problem but currently it is not considered to be. Active files are not deactivated if the active file list is smaller than the inactive list reducing the liklihood that file-mapped pages are being pushed off the LRU and referenced executable pages are kept on the active list to avoid them getting pushed out by read activity. Even if it is a problem, prev_priority prev_priority wouldn't works nowadays. First of all, current vmscan still a lot of UP centric code. it expose some weakness on some dozens CPUs machine. I think we need more and more improvement. The problem is, current vmscan mix up per-system-pressure, per-zone-pressure and per-task-pressure a bit. example, prev_priority try to boost priority to other concurrent priority. but if the another task have mempolicy restriction, it is unnecessary, but also makes wrong big latency and exceeding reclaim. per-task based priority + prev_priority adjustment make the emulation of per-system pressure. but it have two issue 1) too rough and brutal emulation 2) we need per-zone pressure, not per-system. Another example, currently DEF_PRIORITY is 12. it mean the lru rotate about 2 cycle (1/4096 + 1/2048 + 1/1024 + .. + 1) before invoking OOM-Killer. but if 10,0000 thrreads enter DEF_PRIORITY reclaim at the same time, the system have higher memory pressure than priority==0 (1/4096*10,000 > 2). prev_priority can't solve such multithreads workload issue. In other word, prev_priority concept assume the sysmtem don't have lots threads." Signed-off-by: KOSAKI Motohiro Signed-off-by: Mel Gorman Reviewed-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Dave Chinner Cc: Chris Mason Cc: Nick Piggin Cc: Rik van Riel Cc: Johannes Weiner Cc: Christoph Hellwig Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Andrea Arcangeli Cc: Michael Rubin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9411d32840b0..9f1afd361583 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,11 +98,6 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem, /* * For memory reclaim. */ -extern int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem); -extern void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, - int priority); -extern void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, - int priority); int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg); int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg); unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, -- cgit v1.2.2 From a63d83f427fbce97a6cea0db2e64b0eb8435cd10 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 9 Aug 2010 17:19:46 -0700 Subject: oom: badness heuristic rewrite This a complete rewrite of the oom killer's badness() heuristic which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 that the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatability. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes Cc: Nick Piggin Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Oleg Nesterov Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9f1afd361583..73564cac38c7 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -125,6 +125,8 @@ void mem_cgroup_update_file_mapped(struct page *page, int val); unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, int nid, int zid); +u64 mem_cgroup_get_limit(struct mem_cgroup *mem); + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -304,6 +306,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return 0; } +static inline +u64 mem_cgroup_get_limit(struct mem_cgroup *mem) +{ + return 0; +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ -- cgit v1.2.2 From 00918b6ab89df8984ca06397cb77994dabd73f9b Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Aug 2010 18:03:05 -0700 Subject: memcg: remove nid and zid argument from mem_cgroup_soft_limit_reclaim() mem_cgroup_soft_limit_reclaim() has zone, nid and zid argument. but nid and zid can be calculated from zone. So remove it. Signed-off-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Cc: Balbir Singh Cc: Nishimura Daisuke Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include/linux/memcontrol.h') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 73564cac38c7..159a0762aeaf 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -123,8 +123,7 @@ static inline bool mem_cgroup_disabled(void) void mem_cgroup_update_file_mapped(struct page *page, int val); unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, - int zid); + gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); #else /* CONFIG_CGROUP_MEM_RES_CTLR */ @@ -301,7 +300,7 @@ static inline void mem_cgroup_update_file_mapped(struct page *page, static inline unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, - gfp_t gfp_mask, int nid, int zid) + gfp_t gfp_mask) { return 0; } -- cgit v1.2.2