diff options
author | David Rientjes <rientjes@google.com> | 2012-05-29 18:06:47 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-05-29 19:22:24 -0400 |
commit | a7f638f999ff42310e9582273b1fe25ea6e469ba (patch) | |
tree | 174dec4d849b78023a23a15d5947cf1d3be9f564 /mm | |
parent | fe35004fbf9eaf67482b074a2e032abb9c89b1dd (diff) |
mm, oom: normalize oom scores to oom_score_adj scale only for userspace
The oom_score_adj scale ranges from -1000 to 1000 and represents the
proportion of memory available to the process at allocation time. This
means an oom_score_adj value of 300, for example, will bias a process as
though it was using an extra 30.0% of available memory and a value of
-350 will discount 35.0% of available memory from its usage.
The oom killer badness heuristic also uses this scale to report the oom
score for each eligible process in determining the "best" process to
kill. Thus, it can only differentiate each process's memory usage at a
granularity of 0.1% of system RAM.
On large systems, this can end up being a large amount of memory: 256MB
on 256GB systems, for example.
This can be fixed by having the badness heuristic use the actual
memory usage in scoring threads and then normalizing it to the
oom_score_adj scale for userspace. This results in better comparison
between eligible threads for kill and no change from the userspace
perspective.
Suggested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tested-by: Dave Jones <davej@redhat.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/oom_kill.c | 44 |
1 files changed, 16 insertions, 28 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 9f09a1fde9f9..ed0e19677360 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -180,10 +180,10 @@ static bool oom_unkillable_task(struct task_struct *p, | |||
180 | * predictable as possible. The goal is to return the highest value for the | 180 | * predictable as possible. The goal is to return the highest value for the |
181 | * task consuming the most memory to avoid subsequent oom failures. | 181 | * task consuming the most memory to avoid subsequent oom failures. |
182 | */ | 182 | */ |
183 | unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | 183 | unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, |
184 | const nodemask_t *nodemask, unsigned long totalpages) | 184 | const nodemask_t *nodemask, unsigned long totalpages) |
185 | { | 185 | { |
186 | long points; | 186 | unsigned long points; |
187 | 187 | ||
188 | if (oom_unkillable_task(p, memcg, nodemask)) | 188 | if (oom_unkillable_task(p, memcg, nodemask)) |
189 | return 0; | 189 | return 0; |
@@ -198,21 +198,11 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
198 | } | 198 | } |
199 | 199 | ||
200 | /* | 200 | /* |
201 | * The memory controller may have a limit of 0 bytes, so avoid a divide | ||
202 | * by zero, if necessary. | ||
203 | */ | ||
204 | if (!totalpages) | ||
205 | totalpages = 1; | ||
206 | |||
207 | /* | ||
208 | * The baseline for the badness score is the proportion of RAM that each | 201 | * The baseline for the badness score is the proportion of RAM that each |
209 | * task's rss, pagetable and swap space use. | 202 | * task's rss, pagetable and swap space use. |
210 | */ | 203 | */ |
211 | points = get_mm_rss(p->mm) + p->mm->nr_ptes; | 204 | points = get_mm_rss(p->mm) + p->mm->nr_ptes + |
212 | points += get_mm_counter(p->mm, MM_SWAPENTS); | 205 | get_mm_counter(p->mm, MM_SWAPENTS); |
213 | |||
214 | points *= 1000; | ||
215 | points /= totalpages; | ||
216 | task_unlock(p); | 206 | task_unlock(p); |
217 | 207 | ||
218 | /* | 208 | /* |
@@ -220,23 +210,20 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg, | |||
220 | * implementation used by LSMs. | 210 | * implementation used by LSMs. |
221 | */ | 211 | */ |
222 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) | 212 | if (has_capability_noaudit(p, CAP_SYS_ADMIN)) |
223 | points -= 30; | 213 | points -= 30 * totalpages / 1000; |
224 | 214 | ||
225 | /* | 215 | /* |
226 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may | 216 | * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may |
227 | * either completely disable oom killing or always prefer a certain | 217 | * either completely disable oom killing or always prefer a certain |
228 | * task. | 218 | * task. |
229 | */ | 219 | */ |
230 | points += p->signal->oom_score_adj; | 220 | points += p->signal->oom_score_adj * totalpages / 1000; |
231 | 221 | ||
232 | /* | 222 | /* |
233 | * Never return 0 for an eligible task that may be killed since it's | 223 | * Never return 0 for an eligible task regardless of the root bonus and |
234 | * possible that no single user task uses more than 0.1% of memory and | 224 | * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here). |
235 | * no single admin tasks uses more than 3.0%. | ||
236 | */ | 225 | */ |
237 | if (points <= 0) | 226 | return points ? points : 1; |
238 | return 1; | ||
239 | return (points < 1000) ? points : 1000; | ||
240 | } | 227 | } |
241 | 228 | ||
242 | /* | 229 | /* |
@@ -314,7 +301,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
314 | { | 301 | { |
315 | struct task_struct *g, *p; | 302 | struct task_struct *g, *p; |
316 | struct task_struct *chosen = NULL; | 303 | struct task_struct *chosen = NULL; |
317 | *ppoints = 0; | 304 | unsigned long chosen_points = 0; |
318 | 305 | ||
319 | do_each_thread(g, p) { | 306 | do_each_thread(g, p) { |
320 | unsigned int points; | 307 | unsigned int points; |
@@ -354,7 +341,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
354 | */ | 341 | */ |
355 | if (p == current) { | 342 | if (p == current) { |
356 | chosen = p; | 343 | chosen = p; |
357 | *ppoints = 1000; | 344 | chosen_points = ULONG_MAX; |
358 | } else if (!force_kill) { | 345 | } else if (!force_kill) { |
359 | /* | 346 | /* |
360 | * If this task is not being ptraced on exit, | 347 | * If this task is not being ptraced on exit, |
@@ -367,12 +354,13 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, | |||
367 | } | 354 | } |
368 | 355 | ||
369 | points = oom_badness(p, memcg, nodemask, totalpages); | 356 | points = oom_badness(p, memcg, nodemask, totalpages); |
370 | if (points > *ppoints) { | 357 | if (points > chosen_points) { |
371 | chosen = p; | 358 | chosen = p; |
372 | *ppoints = points; | 359 | chosen_points = points; |
373 | } | 360 | } |
374 | } while_each_thread(g, p); | 361 | } while_each_thread(g, p); |
375 | 362 | ||
363 | *ppoints = chosen_points * 1000 / totalpages; | ||
376 | return chosen; | 364 | return chosen; |
377 | } | 365 | } |
378 | 366 | ||
@@ -572,7 +560,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, | |||
572 | } | 560 | } |
573 | 561 | ||
574 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); | 562 | check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); |
575 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT; | 563 | limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; |
576 | read_lock(&tasklist_lock); | 564 | read_lock(&tasklist_lock); |
577 | p = select_bad_process(&points, limit, memcg, NULL, false); | 565 | p = select_bad_process(&points, limit, memcg, NULL, false); |
578 | if (p && PTR_ERR(p) != -1UL) | 566 | if (p && PTR_ERR(p) != -1UL) |