about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author David Rientjes <rientjes@google.com> 2012-05-29 18:06:47 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2012-05-29 19:22:24 -0400
commit a7f638f999ff42310e9582273b1fe25ea6e469ba (patch)
tree 174dec4d849b78023a23a15d5947cf1d3be9f564
parent fe35004fbf9eaf67482b074a2e032abb9c89b1dd (diff)
mm, oom: normalize oom scores to oom_score_adj scale only for userspace
The oom_score_adj scale ranges from -1000 to 1000 and represents the proportion of memory available to the process at allocation time. This means an oom_score_adj value of 300, for example, will bias a process as though it was using an extra 30.0% of available memory and a value of -350 will discount 35.0% of available memory from its usage.

The oom killer badness heuristic also uses this scale to report the oom score for each eligible process in determining the "best" process to kill. Thus, it can only differentiate each process's memory usage by 0.1% of system RAM.

On large systems, this can end up being a large amount of memory: 256MB on 256GB systems, for example.

This can be fixed by having the badness heuristic use the actual memory usage in scoring threads and then normalizing it to the oom_score_adj scale for userspace. This results in better comparison between eligible threads for kill and no change from the userspace perspective.

Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- fs/proc/base.c 5
-rw-r--r-- include/linux/oom.h 5
-rw-r--r-- mm/oom_kill.c 44
3 files changed, 22 insertions, 32 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index d2d3108a611c..d7d711876b6a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -411,12 +411,13 @@ static const struct file_operations proc_lstats_operations = {
411 411
412static int proc_oom_score(struct task_struct *task, char *buffer) 412static int proc_oom_score(struct task_struct *task, char *buffer)
413{ 413{
414 unsigned long totalpages = totalram_pages + total_swap_pages;
414 unsigned long points = 0; 415 unsigned long points = 0;
415 416
416 read_lock(&tasklist_lock); 417 read_lock(&tasklist_lock);
417 if (pid_alive(task)) 418 if (pid_alive(task))
418 points = oom_badness(task, NULL, NULL, 419 points = oom_badness(task, NULL, NULL, totalpages) *
419 totalram_pages + total_swap_pages); 420 1000 / totalpages;
420 read_unlock(&tasklist_lock); 421 read_unlock(&tasklist_lock);
421 return sprintf(buffer, "%lu\n", points); 422 return sprintf(buffer, "%lu\n", points);
422} 423}
diff --git a/include/linux/oom.h b/include/linux/oom.h
index 3d7647536b03..e4c29bc72e70 100644
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -43,8 +43,9 @@ enum oom_constraint {
43extern void compare_swap_oom_score_adj(int old_val, int new_val); 43extern void compare_swap_oom_score_adj(int old_val, int new_val);
44extern int test_set_oom_score_adj(int new_val); 44extern int test_set_oom_score_adj(int new_val);
45 45
46extern unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 46extern unsigned long oom_badness(struct task_struct *p,
47 const nodemask_t *nodemask, unsigned long totalpages); 47 struct mem_cgroup *memcg, const nodemask_t *nodemask,
48 unsigned long totalpages);
48extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 49extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
49extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); 50extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
50 51
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 9f09a1fde9f9..ed0e19677360 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p,
180 * predictable as possible. The goal is to return the highest value for the 180 * predictable as possible. The goal is to return the highest value for the
181 * task consuming the most memory to avoid subsequent oom failures. 181 * task consuming the most memory to avoid subsequent oom failures.
182 */ 182 */
183unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, 183unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
184 const nodemask_t *nodemask, unsigned long totalpages) 184 const nodemask_t *nodemask, unsigned long totalpages)
185{ 185{
186 long points; 186 unsigned long points;
187 187
188 if (oom_unkillable_task(p, memcg, nodemask)) 188 if (oom_unkillable_task(p, memcg, nodemask))
189 return 0; 189 return 0;
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
198 } 198 }
199 199
200 /* 200 /*
201 * The memory controller may have a limit of 0 bytes, so avoid a divide
202 * by zero, if necessary.
203 */
204 if (!totalpages)
205 totalpages = 1;
206
207 /*
208 * The baseline for the badness score is the proportion of RAM that each 201 * The baseline for the badness score is the proportion of RAM that each
209 * task's rss, pagetable and swap space use. 202 * task's rss, pagetable and swap space use.
210 */ 203 */
211 points = get_mm_rss(p->mm) + p->mm->nr_ptes; 204 points = get_mm_rss(p->mm) + p->mm->nr_ptes +
212 points += get_mm_counter(p->mm, MM_SWAPENTS); 205 get_mm_counter(p->mm, MM_SWAPENTS);
213
214 points *= 1000;
215 points /= totalpages;
216 task_unlock(p); 206 task_unlock(p);
217 207
218 /* 208 /*
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
220 * implementation used by LSMs. 210 * implementation used by LSMs.
221 */ 211 */
222 if (has_capability_noaudit(p, CAP_SYS_ADMIN)) 212 if (has_capability_noaudit(p, CAP_SYS_ADMIN))
223 points -= 30; 213 points -= 30 * totalpages / 1000;
224 214
225 /* 215 /*
226 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may 216 * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may
227 * either completely disable oom killing or always prefer a certain 217 * either completely disable oom killing or always prefer a certain
228 * task. 218 * task.
229 */ 219 */
230 points += p->signal->oom_score_adj; 220 points += p->signal->oom_score_adj * totalpages / 1000;
231 221
232 /* 222 /*
233 * Never return 0 for an eligible task that may be killed since it's 223 * Never return 0 for an eligible task regardless of the root bonus and
234 * possible that no single user task uses more than 0.1% of memory and 224 * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
235 * no single admin tasks uses more than 3.0%.
236 */ 225 */
237 if (points <= 0) 226 return points ? points : 1;
238 return 1;
239 return (points < 1000) ? points : 1000;
240} 227}
241 228
242/* 229/*
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
314{ 301{
315 struct task_struct *g, *p; 302 struct task_struct *g, *p;
316 struct task_struct *chosen = NULL; 303 struct task_struct *chosen = NULL;
317 *ppoints = 0; 304 unsigned long chosen_points = 0;
318 305
319 do_each_thread(g, p) { 306 do_each_thread(g, p) {
320 unsigned int points; 307 unsigned int points;
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
354 */ 341 */
355 if (p == current) { 342 if (p == current) {
356 chosen = p; 343 chosen = p;
357 *ppoints = 1000; 344 chosen_points = ULONG_MAX;
358 } else if (!force_kill) { 345 } else if (!force_kill) {
359 /* 346 /*
360 * If this task is not being ptraced on exit, 347 * If this task is not being ptraced on exit,
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
367 } 354 }
368 355
369 points = oom_badness(p, memcg, nodemask, totalpages); 356 points = oom_badness(p, memcg, nodemask, totalpages);
370 if (points > *ppoints) { 357 if (points > chosen_points) {
371 chosen = p; 358 chosen = p;
372 *ppoints = points; 359 chosen_points = points;
373 } 360 }
374 } while_each_thread(g, p); 361 } while_each_thread(g, p);
375 362
363 *ppoints = chosen_points * 1000 / totalpages;
376 return chosen; 364 return chosen;
377} 365}
378 366
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
572 } 560 }
573 561
574 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 562 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
575 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; 563 limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1;
576 read_lock(&tasklist_lock); 564 read_lock(&tasklist_lock);
577 p = select_bad_process(&points, limit, memcg, NULL, false); 565 p = select_bad_process(&points, limit, memcg, NULL, false);
578 if (p && PTR_ERR(p) != -1UL) 566 if (p && PTR_ERR(p) != -1UL)