author     David Rientjes <rientjes@google.com>  2010-08-09 20:19:46 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>  2010-08-09 23:45:02 -0400
commit     a63d83f427fbce97a6cea0db2e64b0eb8435cd10 (patch)
tree       8ac229cdf6e2289d97e82e35774057106fe7f4a2 /mm
parent     74bcbf40546bb7500f2a7ba4ff3cc056a6bd004a (diff)
oom: badness heuristic rewrite
This is a complete rewrite of the oom killer's badness() heuristic, which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace.

Instead of basing the heuristic on mm->total_vm for each task, it is now based on the task's rss and swap space. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task.

The baseline for the heuristic is the proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that a task with a badness() score of 500 consumes approximately 50% of allowable memory resident in RAM or in swap space.

The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM system-wide, so that mempolicies and cpusets may operate in isolation; they need not know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively.

Root tasks are given 3% extra memory, just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task.

Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It is not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatibility. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score, so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller.

/proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken for existing userspace applications and allows oom_adj to be deprecated for future removal.

Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
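To make the arithmetic described above concrete, the following is a minimal userspace-style sketch of the new scoring model, not the kernel implementation itself; the helper name compute_badness(), its parameters, and the sample numbers are illustrative assumptions only.

/*
 * Illustrative sketch only; all names and values here are hypothetical
 * and are not taken from the kernel sources.
 */
#include <stdio.h>

#define OOM_SCORE_MAX 1000   /* scores run from 0 (never kill) to 1000 (always kill) */

/*
 * rss_pages + swap_pages: memory that killing the task would free.
 * allowable_pages: RAM plus swap for the system, or the limit imposed by a
 *                  mempolicy, cpuset, or memory controller.
 * oom_score_adj:   per-task tunable in [-1000, 1000].
 * is_root:         tasks with CAP_SYS_ADMIN receive a 3% discount.
 */
static int compute_badness(long rss_pages, long swap_pages,
                           long allowable_pages, int oom_score_adj, int is_root)
{
        long points;

        if (allowable_pages <= 0)
                allowable_pages = 1;    /* avoid a divide by zero */

        /* proportion of allowable memory in use, scaled to 0..1000 */
        points = (rss_pages + swap_pages) * OOM_SCORE_MAX / allowable_pages;

        if (is_root)
                points -= 30;           /* 3% of the scale, as __vm_enough_memory() does */

        points += oom_score_adj;        /* polarize the heuristic */

        if (points < 0)
                return 0;
        return points > OOM_SCORE_MAX ? OOM_SCORE_MAX : points;
}

int main(void)
{
        /* a task using half of 1024000 allowable pages */
        printf("%d\n", compute_badness(500000, 12000, 1024000, -500, 0)); /* prints 0 */
        printf("%d\n", compute_badness(500000, 12000, 1024000,  500, 0)); /* prints 1000 */
        return 0;
}

With these numbers, a task holding half of the allowable memory scores 500 at baseline; an oom_score_adj of -500 drives the result to 0, while +500 saturates it at 1000. The oom_adj/oom_score_adj compatibility rescaling mentioned above is implemented under fs/proc, so it does not appear in the mm-only diffstat below.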
Diffstat (limited to 'mm')
-rw-r--r--  mm/memcontrol.c   18
-rw-r--r--  mm/oom_kill.c    259
2 files changed, 129 insertions, 148 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 31abd1c2c0c5..de54ea0094a1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1127,6 +1127,24 @@ static int mem_cgroup_count_children(struct mem_cgroup *mem)
 }
 
 /*
+ * Return the memory (and swap, if configured) limit for a memcg.
+ */
+u64 mem_cgroup_get_limit(struct mem_cgroup *memcg)
+{
+        u64 limit;
+        u64 memsw;
+
+        limit = res_counter_read_u64(&memcg->res, RES_LIMIT) +
+                        total_swap_pages;
+        memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT);
+        /*
+         * If memsw is finite and limits the amount of swap space available
+         * to this memcg, return that limit.
+         */
+        return min(limit, memsw);
+}
+
+/*
  * Visit the first child (need not be the first child as per the ordering
  * of the cgroup list, since we track last_scanned_child) of @mem and use
  * that to reclaim free pages from.
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 0a4ca8a0234b..d3def05a33d9 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -4,6 +4,8 @@
  * Copyright (C) 1998,2000 Rik van Riel
  * Thanks go out to Claus Fischer for some serious inspiration and
  * for goading me into coding this file...
+ * Copyright (C) 2010 Google, Inc.
+ * Rewritten by David Rientjes
  *
  * The routines in this file are used to kill a process when
  * we're seriously out of memory. This gets called from __alloc_pages()
@@ -34,7 +36,6 @@ int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
 int sysctl_oom_dump_tasks = 1;
 static DEFINE_SPINLOCK(zone_scan_lock);
-/* #define DEBUG */
 
 #ifdef CONFIG_NUMA
 /**
@@ -140,137 +141,76 @@ static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
 }
 
 /**
- * badness - calculate a numeric value for how bad this task has been
+ * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
- * @uptime: current uptime in seconds
+ * @totalpages: total present RAM allowed for page allocation
  *
- * The formula used is relatively simple and documented inline in the
- * function. The main rationale is that we want to select a good task
- * to kill when we run out of memory.
- *
- * Good in this context means that:
- * 1) we lose the minimum amount of work done
- * 2) we recover a large amount of memory
- * 3) we don't kill anything innocent of eating tons of memory
- * 4) we want to kill the minimum amount of processes (one)
- * 5) we try to kill the process the user expects us to kill, this
- *    algorithm has been meticulously tuned to meet the principle
- *    of least surprise ... (be careful when you change it)
+ * The heuristic for determining which task to kill is made to be as simple and
+ * predictable as possible. The goal is to return the highest value for the
+ * task consuming the most memory to avoid subsequent oom failures.
  */
-unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
-                      const nodemask_t *nodemask, unsigned long uptime)
+unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
+                      const nodemask_t *nodemask, unsigned long totalpages)
 {
-        unsigned long points, cpu_time, run_time;
-        struct task_struct *child;
-        struct task_struct *c, *t;
-        int oom_adj = p->signal->oom_adj;
-        struct task_cputime task_time;
-        unsigned long utime;
-        unsigned long stime;
+        int points;
 
         if (oom_unkillable_task(p, mem, nodemask))
                 return 0;
-        if (oom_adj == OOM_DISABLE)
-                return 0;
 
         p = find_lock_task_mm(p);
         if (!p)
                 return 0;
 
         /*
-         * The memory size of the process is the basis for the badness.
-         */
-        points = p->mm->total_vm;
-        task_unlock(p);
-
-        /*
-         * swapoff can easily use up all memory, so kill those first.
-         */
-        if (p->flags & PF_OOM_ORIGIN)
-                return ULONG_MAX;
-
-        /*
-         * Processes which fork a lot of child processes are likely
-         * a good choice. We add half the vmsize of the children if they
-         * have an own mm. This prevents forking servers to flood the
-         * machine with an endless amount of children. In case a single
-         * child is eating the vast majority of memory, adding only half
-         * to the parents will make the child our kill candidate of choice.
+         * Shortcut check for OOM_SCORE_ADJ_MIN so the entire heuristic doesn't
+         * need to be executed for something that cannot be killed.
          */
-        t = p;
-        do {
-                list_for_each_entry(c, &t->children, sibling) {
-                        child = find_lock_task_mm(c);
-                        if (child) {
-                                if (child->mm != p->mm)
-                                        points += child->mm->total_vm/2 + 1;
-                                task_unlock(child);
-                        }
-                }
-        } while_each_thread(p, t);
+        if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+                task_unlock(p);
+                return 0;
+        }
 
         /*
-         * CPU time is in tens of seconds and run time is in thousands
-         * of seconds. There is no particular reason for this other than
-         * that it turned out to work very well in practice.
+         * When the PF_OOM_ORIGIN bit is set, it indicates the task should have
+         * priority for oom killing.
          */
-        thread_group_cputime(p, &task_time);
-        utime = cputime_to_jiffies(task_time.utime);
-        stime = cputime_to_jiffies(task_time.stime);
-        cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
-
-
-        if (uptime >= p->start_time.tv_sec)
-                run_time = (uptime - p->start_time.tv_sec) >> 10;
-        else
-                run_time = 0;
-
-        if (cpu_time)
-                points /= int_sqrt(cpu_time);
-        if (run_time)
-                points /= int_sqrt(int_sqrt(run_time));
+        if (p->flags & PF_OOM_ORIGIN) {
+                task_unlock(p);
+                return 1000;
+        }
 
         /*
-         * Niced processes are most likely less important, so double
-         * their badness points.
+         * The memory controller may have a limit of 0 bytes, so avoid a divide
+         * by zero, if necessary.
          */
-        if (task_nice(p) > 0)
-                points *= 2;
+        if (!totalpages)
+                totalpages = 1;
 
         /*
-         * Superuser processes are usually more important, so we make it
-         * less likely that we kill those.
+         * The baseline for the badness score is the proportion of RAM that each
+         * task's rss and swap space use.
          */
-        if (has_capability_noaudit(p, CAP_SYS_ADMIN) ||
-            has_capability_noaudit(p, CAP_SYS_RESOURCE))
-                points /= 4;
+        points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 /
+                        totalpages;
+        task_unlock(p);
 
         /*
-         * We don't want to kill a process with direct hardware access.
-         * Not only could that mess up the hardware, but usually users
-         * tend to only have this flag set on applications they think
-         * of as important.
+         * Root processes get 3% bonus, just like the __vm_enough_memory()
+         * implementation used by LSMs.
          */
-        if (has_capability_noaudit(p, CAP_SYS_RAWIO))
-                points /= 4;
+        if (has_capability_noaudit(p, CAP_SYS_ADMIN))
+                points -= 30;
 
         /*
-         * Adjust the score by oom_adj.
+         * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
+         * either completely disable oom killing or always prefer a certain
+         * task.
          */
-        if (oom_adj) {
-                if (oom_adj > 0) {
-                        if (!points)
-                                points = 1;
-                        points <<= oom_adj;
-                } else
-                        points >>= -(oom_adj);
-        }
+        points += p->signal->oom_score_adj;
 
-#ifdef DEBUG
-        printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
-        p->pid, p->comm, points);
-#endif
-        return points;
+        if (points < 0)
+                return 0;
+        return (points < 1000) ? points : 1000;
 }
 
 /*
@@ -278,12 +218,20 @@ unsigned long badness(struct task_struct *p, struct mem_cgroup *mem,
  */
 #ifdef CONFIG_NUMA
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-                                gfp_t gfp_mask, nodemask_t *nodemask)
+                                gfp_t gfp_mask, nodemask_t *nodemask,
+                                unsigned long *totalpages)
 {
         struct zone *zone;
         struct zoneref *z;
         enum zone_type high_zoneidx = gfp_zone(gfp_mask);
+        bool cpuset_limited = false;
+        int nid;
 
+        /* Default to all available memory */
+        *totalpages = totalram_pages + total_swap_pages;
+
+        if (!zonelist)
+                return CONSTRAINT_NONE;
         /*
          * Reach here only when __GFP_NOFAIL is used. So, we should avoid
          * to kill current.We have to random task kill in this case.
@@ -293,26 +241,37 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
                 return CONSTRAINT_NONE;
 
         /*
-         * The nodemask here is a nodemask passed to alloc_pages(). Now,
-         * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
-         * feature. mempolicy is an only user of nodemask here.
-         * check mempolicy's nodemask contains all N_HIGH_MEMORY
+         * This is not a __GFP_THISNODE allocation, so a truncated nodemask in
+         * the page allocator means a mempolicy is in effect. Cpuset policy
+         * is enforced in get_page_from_freelist().
          */
-        if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
+        if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) {
+                *totalpages = total_swap_pages;
+                for_each_node_mask(nid, *nodemask)
+                        *totalpages += node_spanned_pages(nid);
                 return CONSTRAINT_MEMORY_POLICY;
+        }
 
         /* Check this allocation failure is caused by cpuset's wall function */
         for_each_zone_zonelist_nodemask(zone, z, zonelist,
                         high_zoneidx, nodemask)
                 if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
-                        return CONSTRAINT_CPUSET;
+                        cpuset_limited = true;
 
+        if (cpuset_limited) {
+                *totalpages = total_swap_pages;
+                for_each_node_mask(nid, cpuset_current_mems_allowed)
+                        *totalpages += node_spanned_pages(nid);
+                return CONSTRAINT_CPUSET;
+        }
         return CONSTRAINT_NONE;
 }
 #else
 static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-                                gfp_t gfp_mask, nodemask_t *nodemask)
+                                gfp_t gfp_mask, nodemask_t *nodemask,
+                                unsigned long *totalpages)
 {
+        *totalpages = totalram_pages + total_swap_pages;
         return CONSTRAINT_NONE;
 }
 #endif
@@ -323,17 +282,16 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned long *ppoints,
-                struct mem_cgroup *mem, const nodemask_t *nodemask)
+static struct task_struct *select_bad_process(unsigned int *ppoints,
+                unsigned long totalpages, struct mem_cgroup *mem,
+                const nodemask_t *nodemask)
 {
         struct task_struct *p;
         struct task_struct *chosen = NULL;
-        struct timespec uptime;
         *ppoints = 0;
 
-        do_posix_clock_monotonic_gettime(&uptime);
         for_each_process(p) {
-                unsigned long points;
+                unsigned int points;
 
                 if (oom_unkillable_task(p, mem, nodemask))
                         continue;
@@ -365,11 +323,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
                                 return ERR_PTR(-1UL);
 
                         chosen = p;
-                        *ppoints = ULONG_MAX;
+                        *ppoints = 1000;
                 }
 
-                points = badness(p, mem, nodemask, uptime.tv_sec);
-                if (points > *ppoints || !chosen) {
+                points = oom_badness(p, mem, nodemask, totalpages);
+                if (points > *ppoints) {
                         chosen = p;
                         *ppoints = points;
                 }
@@ -384,7 +342,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
  *
  * Dumps the current memory state of all system tasks, excluding kernel threads.
  * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
- * score, and name.
+ * value, oom_score_adj value, and name.
  *
  * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
  * shown.
@@ -396,8 +354,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
         struct task_struct *p;
         struct task_struct *task;
 
-        printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj "
-                "name\n");
+        pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
         for_each_process(p) {
                 if (p->flags & PF_KTHREAD)
                         continue;
@@ -414,10 +371,11 @@ static void dump_tasks(const struct mem_cgroup *mem)
                         continue;
                 }
 
-                printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3u %3d %s\n",
+                pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
                         task->pid, __task_cred(task)->uid, task->tgid,
                         task->mm->total_vm, get_mm_rss(task->mm),
-                        task_cpu(task), task->signal->oom_adj, task->comm);
+                        task_cpu(task), task->signal->oom_adj,
+                        task->signal->oom_score_adj, task->comm);
                 task_unlock(task);
         }
 }
@@ -427,8 +385,9 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 {
         task_lock(current);
         pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
-                "oom_adj=%d\n",
-                current->comm, gfp_mask, order, current->signal->oom_adj);
+                "oom_adj=%d, oom_score_adj=%d\n",
+                current->comm, gfp_mask, order, current->signal->oom_adj,
+                current->signal->oom_score_adj);
         cpuset_print_task_mems_allowed(current);
         task_unlock(current);
         dump_stack();
@@ -468,14 +427,14 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
 #undef K
 
 static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-                            unsigned long points, struct mem_cgroup *mem,
-                            nodemask_t *nodemask, const char *message)
+                            unsigned int points, unsigned long totalpages,
+                            struct mem_cgroup *mem, nodemask_t *nodemask,
+                            const char *message)
 {
         struct task_struct *victim = p;
         struct task_struct *child;
         struct task_struct *t = p;
-        unsigned long victim_points = 0;
-        struct timespec uptime;
+        unsigned int victim_points = 0;
 
         if (printk_ratelimit())
                 dump_header(p, gfp_mask, order, mem);
@@ -491,7 +450,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
         }
 
         task_lock(p);
-        pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
+        pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
                 message, task_pid_nr(p), p->comm, points);
         task_unlock(p);
 
@@ -501,14 +460,15 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
          * parent. This attempts to lose the minimal amount of work done while
          * still freeing memory.
          */
-        do_posix_clock_monotonic_gettime(&uptime);
         do {
                 list_for_each_entry(child, &t->children, sibling) {
-                        unsigned long child_points;
+                        unsigned int child_points;
 
-                        /* badness() returns 0 if the thread is unkillable */
-                        child_points = badness(child, mem, nodemask,
-                                                uptime.tv_sec);
+                        /*
+                         * oom_badness() returns 0 if the thread is unkillable
+                         */
+                        child_points = oom_badness(child, mem, nodemask,
+                                                        totalpages);
                         if (child_points > victim_points) {
                                 victim = child;
                                 victim_points = child_points;
@@ -546,17 +506,19 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
 void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
 {
-        unsigned long points = 0;
+        unsigned long limit;
+        unsigned int points = 0;
         struct task_struct *p;
 
         check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+        limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
         read_lock(&tasklist_lock);
 retry:
-        p = select_bad_process(&points, mem, NULL);
+        p = select_bad_process(&points, limit, mem, NULL);
         if (!p || PTR_ERR(p) == -1UL)
                 goto out;
 
-        if (oom_kill_process(p, gfp_mask, 0, points, mem, NULL,
+        if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL,
                                 "Memory cgroup out of memory"))
                 goto retry;
 out:
@@ -681,8 +643,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                 int order, nodemask_t *nodemask)
 {
         struct task_struct *p;
+        unsigned long totalpages;
         unsigned long freed = 0;
-        unsigned long points;
+        unsigned int points;
         enum oom_constraint constraint = CONSTRAINT_NONE;
 
         blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
@@ -705,8 +668,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
          * Check if there were limitations on the allocation (only relevant for
          * NUMA) that may require different handling.
          */
-        if (zonelist)
-                constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
+        constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
+                                                &totalpages);
         check_panic_on_oom(constraint, gfp_mask, order);
 
         read_lock(&tasklist_lock);
@@ -718,14 +681,14 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
                  * non-zero, current could not be killed so we must fallback to
                  * the tasklist scan.
                  */
-                if (!oom_kill_process(current, gfp_mask, order, 0, NULL,
-                                nodemask,
+                if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
+                                NULL, nodemask,
                                 "Out of memory (oom_kill_allocating_task)"))
                         return;
         }
 
 retry:
-        p = select_bad_process(&points, NULL,
+        p = select_bad_process(&points, totalpages, NULL,
                         constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
                         NULL);
         if (PTR_ERR(p) == -1UL)
@@ -738,8 +701,8 @@ retry:
                 panic("Out of memory and no killable processes...\n");
         }
 
-        if (oom_kill_process(p, gfp_mask, order, points, NULL, nodemask,
-                                "Out of memory"))
+        if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+                                nodemask, "Out of memory"))
                 goto retry;
         read_unlock(&tasklist_lock);
 