author	KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>	2011-07-25 20:12:27 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2011-07-25 23:57:10 -0400
commit	1d65f86db14806cf7b1218c7b4ecb8b4db5af27d (patch)
tree	01a2c4e3feb48327220b1fd8d09cf805c20eee7f
parent	d515afe88a32e567c550e3db914f3e378f86453a (diff)
mm: preallocate page before lock_page() at filemap COW
Currently we keep the faulted page locked throughout the whole __do_fault() call (except for the page_mkwrite code path) after calling the file system's fault code.  If we do early COW, we allocate a new page which has to be charged to a memcg (mem_cgroup_newpage_charge()).  This function, however, might block for an unbounded amount of time if the memcg OOM killer is disabled or a fork-bomb is running, because the only way out of the OOM situation is either an external event or a fix of the OOM situation itself.

In the end we keep the faulted page locked and block other processes from faulting it in, which is not good at all, because we are basically punishing a potentially unrelated process for an OOM condition in a different group (I have seen a stuck system because of ld-2.11.1.so being locked).

We can reproduce this easily:

  % cgcreate -g memory:A
  % cgset -r memory.limit_in_bytes=64M A
  % cgset -r memory.memsw.limit_in_bytes=64M A
  % cd kernel_dir; cgexec -g memory:A make -j

Then the whole system will be live-locked until you kill 'make -j' by hand (or push reboot...).  This is because some important pages in a shared library are locked.

Considering it again, the new page does not need to be allocated with lock_page() held.  Moreover, the usual page allocation may dive into a long memory reclaim loop while holding lock_page() and can cause very long latency.

There are 3 ways:

  1. do allocation/charge before lock_page()
     Pros. - simple and can handle page allocation in the same manner.
             This will reduce the holding time of lock_page() in general.
     Cons. - we do page allocation even if ->fault() returns an error.

  2. do charge after unlock_page().  Even if the charge fails, it's just OOM.
     Pros. - no impact to the non-memcg path.
     Cons. - the implementation requires special care of the LRU and we need
             to modify page_add_new_anon_rmap()...

  3. do unlock -> charge -> lock again.
     Pros. - no impact to the non-memcg path.
     Cons. - this may kill the LOCK_PAGE_RETRY optimization.  We need to
             release the lock and get it again...

This patch moves the "charge" and the memory allocation for the COW page before lock_page().  We can then avoid scanning the LRU while holding a lock on a page, and the latency under lock_page() is reduced.  With this, the livelock above disappears.

[akpm@linux-foundation.org: fix code layout]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Reported-by: Lutz Vieweg <lvml@5t9.de>
Original-idea-by: Michal Hocko <mhocko@suse.cz>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ying Han <yinghan@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
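For readers skimming the patch, here is a condensed sketch of the reordering this commit performs, extracted from the diff below with the rest of __do_fault() omitted: the COW page is allocated and charged to the memcg before ->fault() runs and before the file cache page can be locked, so a blocking memcg charge no longer pins the page lock.

	struct page *cow_page = NULL;

	/* Allocate and charge the COW page while no page lock is held. */
	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
		if (unlikely(anon_vma_prepare(vma)))
			return VM_FAULT_OOM;
		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
		if (!cow_page)
			return VM_FAULT_OOM;
		/* May block on memcg OOM, but no other faulter waits on us. */
		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
			page_cache_release(cow_page);
			return VM_FAULT_OOM;
		}
	}

	/* Only now call ->fault(); it may return with the file page locked. */
	ret = vma->vm_ops->fault(vma, &vmf);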
-rw-r--r--	mm/memory.c	56
1 files changed, 34 insertions, 22 deletions
diff --git a/mm/memory.c b/mm/memory.c
index a58bbebb3070..3c9f3aa8332e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3093,14 +3093,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t *page_table;
 	spinlock_t *ptl;
 	struct page *page;
+	struct page *cow_page;
 	pte_t entry;
 	int anon = 0;
-	int charged = 0;
 	struct page *dirty_page = NULL;
 	struct vm_fault vmf;
 	int ret;
 	int page_mkwrite = 0;
 
+	/*
+	 * If we do COW later, allocate page befor taking lock_page()
+	 * on the file cache page. This will reduce lock holding time.
+	 */
+	if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
+
+		if (unlikely(anon_vma_prepare(vma)))
+			return VM_FAULT_OOM;
+
+		cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+		if (!cow_page)
+			return VM_FAULT_OOM;
+
+		if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) {
+			page_cache_release(cow_page);
+			return VM_FAULT_OOM;
+		}
+	} else
+		cow_page = NULL;
+
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
 	vmf.flags = flags;
@@ -3109,12 +3129,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
 			    VM_FAULT_RETRY)))
-		return ret;
+		goto uncharge_out;
 
 	if (unlikely(PageHWPoison(vmf.page))) {
 		if (ret & VM_FAULT_LOCKED)
 			unlock_page(vmf.page);
-		return VM_FAULT_HWPOISON;
+		ret = VM_FAULT_HWPOISON;
+		goto uncharge_out;
 	}
 
 	/*
@@ -3132,23 +3153,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	page = vmf.page;
 	if (flags & FAULT_FLAG_WRITE) {
 		if (!(vma->vm_flags & VM_SHARED)) {
+			page = cow_page;
 			anon = 1;
-			if (unlikely(anon_vma_prepare(vma))) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE,
-						vma, address);
-			if (!page) {
-				ret = VM_FAULT_OOM;
-				goto out;
-			}
-			if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
-				ret = VM_FAULT_OOM;
-				page_cache_release(page);
-				goto out;
-			}
-			charged = 1;
 			copy_user_highpage(page, vmf.page, address, vma);
 			__SetPageUptodate(page);
 		} else {
@@ -3217,8 +3223,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, page_table);
 	} else {
-		if (charged)
-			mem_cgroup_uncharge_page(page);
+		if (cow_page)
+			mem_cgroup_uncharge_page(cow_page);
 		if (anon)
 			page_cache_release(page);
 		else
@@ -3227,7 +3233,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	pte_unmap_unlock(page_table, ptl);
 
-out:
 	if (dirty_page) {
 		struct address_space *mapping = page->mapping;
 
@@ -3257,6 +3262,13 @@ out:
 unwritable_page:
 	page_cache_release(page);
 	return ret;
+uncharge_out:
+	/* fs's fault handler get error */
+	if (cow_page) {
+		mem_cgroup_uncharge_page(cow_page);
+		page_cache_release(cow_page);
+	}
+	return ret;
 }
 
 static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,