Linux-2.6.12-rc2 (tag: v2.6.12-rc2)

Initial git repository build. I'm not bothering with the full history, even though we have it. We can create a separate "historical" git archive of that later if we want to, and in the meantime it's about 3.2GB when imported into git - space that would just make the early git days unnecessarily complicated, when we don't have a lot of good infrastructure for it. Let it rip!
author: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
committer: Linus Torvalds <torvalds@ppc970.osdl.org> 2005-04-16 18:20:36 -0400
commit: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree: 0bba044c4ce775e45a88a51686b5d9f90697ea9d /mm/oom_kill.c
1 files changed, 292 insertions, 0 deletions
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000000..9595a0f6c4b8
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,292 @@
+/*
+ *  linux/mm/oom_kill.c
+ * 
+ *  Copyright (C)  1998,2000  Rik van Riel
+ *      Thanks go out to Claus Fischer for some serious inspiration and
+ *      for goading me into coding this file...
+ *
+ *  The routines in this file are used to kill a process when
+ *  we're seriously out of memory. This gets called from kswapd()
+ *  in linux/mm/vmscan.c when we really run out of memory.
+ *
+ *  Since we won't call these routines often (on a well-configured
+ *  machine) this file will double as a 'coding guide' and a signpost
+ *  for newbie kernel hackers. It features several pointers to major
+ *  kernel subsystems and hints as to where to find out what things do.
+ */
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/swap.h>
+#include <linux/timex.h>
+#include <linux/jiffies.h>
+/* #define DEBUG */
+/**
+ * badness - calculate a numeric value for how bad this task has been
+ * @p: task struct of which task we should calculate
+ * @uptime: current uptime in seconds
+ *
+ * The formula used is relatively simple and documented inline in the
+ * function. The main rationale is that we want to select a good task
+ * to kill when we run out of memory.
+ *
+ * Good in this context means that:
+ * 1) we lose the minimum amount of work done
+ * 2) we recover a large amount of memory
+ * 3) we don't kill anything innocent of eating tons of memory
+ * 4) we want to kill the minimum amount of processes (one)
+ * 5) we try to kill the process the user expects us to kill, this
+ *    algorithm has been meticulously tuned to meet the principle
+ *    of least surprise ... (be careful when you change it)
+ *
+ * Returns 0 for a task without an mm, otherwise the computed score.
+ * select_bad_process() picks the task with the highest score.
+ */
+unsigned long badness(struct task_struct *p, unsigned long uptime)
+{
+        unsigned long points, cpu_time, run_time, s;
+        struct list_head *tsk;
+        if (!p->mm)
+                return 0;
+        /*
+         * The memory size of the process is the basis for the badness.
+         */
+        points = p->mm->total_vm;
+        /*
+         * Processes which fork a lot of child processes are likely
+         * a good choice. We add the vmsize of the children if they
+         * have their own mm. This prevents forking servers from flooding
+         * the machine with an endless number of children.
+         */
+        list_for_each(tsk, &p->children) {
+                struct task_struct *chld;
+                chld = list_entry(tsk, struct task_struct, sibling);
+                if (chld->mm != p->mm && chld->mm)
+                        points += chld->mm->total_vm;
+        }
+        /*
+         * CPU time is in tens of seconds and run time is in thousands
+         * of seconds. There is no particular reason for this other than
+         * that it turned out to work very well in practice.
+         */
+        cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
+                >> (SHIFT_HZ + 3);
+        /* A start time later than @uptime is treated as zero run time. */
+        if (uptime >= p->start_time.tv_sec)
+                run_time = (uptime - p->start_time.tv_sec) >> 10;
+        else
+                run_time = 0;
+        /*
+         * Divide by sqrt(cpu_time) and by the fourth root of run_time.
+         * A zero divisor is skipped, leaving the score untouched.
+         */
+        s = int_sqrt(cpu_time);
+        if (s)
+                points /= s;
+        s = int_sqrt(int_sqrt(run_time));
+        if (s)
+                points /= s;
+        /*
+         * Niced processes are most likely less important, so double
+         * their badness points.
+         */
+        if (task_nice(p) > 0)
+                points *= 2;
+        /*
+         * Superuser processes are usually more important, so we make it
+         * less likely that we kill those.
+         */
+        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
+                                p->uid == 0 || p->euid == 0)
+                points /= 4;
+        /*
+         * We don't want to kill a process with direct hardware access.
+         * Not only could that mess up the hardware, but usually users
+         * tend to only have this flag set on applications they think
+         * of as important.
+         */
+        if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
+                points /= 4;
+        /*
+         * Adjust the score by oomkilladj: a positive value shifts the
+         * score left by that many bits, a negative value shifts it right.
+         */
+        if (p->oomkilladj) {
+                if (p->oomkilladj > 0)
+                        points <<= p->oomkilladj;
+                else
+                        points >>= -(p->oomkilladj);
+        }
+#ifdef DEBUG
+        /* points is unsigned long, so it must be printed with %lu */
+        printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n",
+                p->pid, p->comm, points);
+#endif
+        return points;
+}
+/*
+ * Simple selection loop: walk every thread and choose the one with the
+ * highest number of badness() 'points'. We expect the caller to have
+ * locked the tasklist.
+ *
+ * Returns:
+ *   ERR_PTR(-1UL) if a task with pid > 1 has TIF_MEMDIE or PF_EXITING
+ *                 set and is not yet PF_DEAD,
+ *   a PF_SWAPOFF task as soon as one is met,
+ *   otherwise the highest-scoring task, or NULL if no task with
+ *   pid > 1 was seen.
+ *
+ * (not docbooked, we don't want this one cluttering up the manual)
+ */
+static struct task_struct * select_bad_process(void)
+{
+        struct timespec now;
+        struct task_struct *leader, *t;
+        struct task_struct *victim = NULL;
+        unsigned long best = 0;
+        do_posix_clock_monotonic_gettime(&now);
+        do_each_thread(leader, t) {
+                unsigned long score;
+                /* skip the init task with pid == 1 (and anything below) */
+                if (t->pid <= 1)
+                        continue;
+                /*
+                 * This task is already releasing memory, so wait for it
+                 * to finish rather than killing some other task by mistake.
+                 */
+                if (unlikely(test_tsk_thread_flag(t, TIF_MEMDIE)) ||
+                    (t->flags & PF_EXITING)) {
+                        if (!(t->flags & PF_DEAD))
+                                return ERR_PTR(-1UL);
+                }
+                /* a PF_SWAPOFF task is returned at once, unscored */
+                if (t->flags & PF_SWAPOFF)
+                        return t;
+                score = badness(t, now.tv_sec);
+                /* the first candidate is always taken, even with score 0 */
+                if (!victim || score > best) {
+                        victim = t;
+                        best = score;
+                }
+        } while_each_thread(leader, t);
+        return victim;
+}
+/*
+ * Send SIGKILL to @p after giving it a fresh time slice and setting
+ * TIF_MEMDIE on it.
+ *
+ * Nothing is done, apart from a warning, when @p is init (pid 1) or
+ * when @p has no mm or uses init_mm.
+ *
+ * NOTE: an earlier comment here said a task with CAP_SYS_RAWIO must get
+ * SIGTERM instead of SIGKILL. This function makes no such distinction:
+ * every task that passes the checks gets force_sig(SIGKILL).
+ */
+static void __oom_kill_task(task_t *p)
+{
+        int no_user_mm;
+        if (p->pid == 1) {
+                WARN_ON(1);
+                printk(KERN_WARNING "tried to kill init!\n");
+                return;
+        }
+        /* p->mm is inspected with the task locked */
+        task_lock(p);
+        no_user_mm = !p->mm || p->mm == &init_mm;
+        if (no_user_mm) {
+                WARN_ON(1);
+                printk(KERN_WARNING "tried to kill an mm-less task!\n");
+        }
+        task_unlock(p);
+        if (no_user_mm)
+                return;
+        printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
+        /*
+         * We give our sacrificial lamb high priority and access to
+         * all the memory it needs. That way it should be able to
+         * exit() and clear out its resources quickly...
+         */
+        p->time_slice = HZ;
+        set_tsk_thread_flag(p, TIF_MEMDIE);
+        force_sig(SIGKILL, p);
+}
+/*
+ * Kill @p together with every task in a different thread group that
+ * uses the same mm.
+ *
+ * Returns NULL, without killing anything, when get_task_mm() yields no
+ * mm or yields init_mm. Otherwise returns the mm from get_task_mm();
+ * it is not mmput() here, the caller (out_of_memory) does that.
+ */
+static struct mm_struct *oom_kill_task(task_t *p)
+{
+        task_t *leader, *t;
+        struct mm_struct *mm;
+        mm = get_task_mm(p);
+        if (!mm)
+                return NULL;
+        if (mm == &init_mm) {
+                mmput(mm);
+                return NULL;
+        }
+        __oom_kill_task(p);
+        /*
+         * Also kill all tasks that share the ->mm but are in a
+         * different thread group.
+         */
+        do_each_thread(leader, t) {
+                if (t->mm != mm || t->tgid == p->tgid)
+                        continue;
+                __oom_kill_task(t);
+        } while_each_thread(leader, t);
+        return mm;
+}
+/*
+ * Try to kill a child of @p that does not share @p's mm; if no child
+ * could be killed, kill @p itself.
+ *
+ * Returns the first non-NULL result of oom_kill_task() on a child, or
+ * else the result of oom_kill_task(@p), which may be NULL.
+ */
+static struct mm_struct *oom_kill_process(struct task_struct *p)
+{
+        struct list_head *pos;
+        /* Try to kill a child first */
+        list_for_each(pos, &p->children) {
+                struct task_struct *child;
+                struct mm_struct *mm;
+                child = list_entry(pos, struct task_struct, sibling);
+                /* children sharing the parent's mm are skipped */
+                if (child->mm != p->mm) {
+                        mm = oom_kill_task(child);
+                        if (mm)
+                                return mm;
+                }
+        }
+        return oom_kill_task(p);
+}
+/**
+ * out_of_memory - kill the "best" process when we run out of memory
+ * @gfp_mask: gfp mask supplied by the caller; only printed in the
+ *            "oom-killer" message here
+ *
+ * If we run out of memory, we have the choice between either
+ * killing a random task (bad), letting the system crash (worse)
+ * OR try to be smart about which process to kill. Note that we
+ * don't have to be perfect here, we just have to be good.
+ *
+ * Panics when select_bad_process() finds no task at all.
+ */
+void out_of_memory(unsigned int __nocast gfp_mask)
+{
+        task_t *victim;
+        struct mm_struct *mm = NULL;
+        read_lock(&tasklist_lock);
+        do {
+                victim = select_bad_process();
+                /* some task is already exiting: kill nothing, just wait */
+                if (PTR_ERR(victim) == -1UL)
+                        break;
+                /* Found nothing?!?! Either we hang forever, or we panic. */
+                if (!victim) {
+                        read_unlock(&tasklist_lock);
+                        show_free_areas();
+                        panic("Out of memory and no killable processes...\n");
+                }
+                printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
+                show_free_areas();
+                mm = oom_kill_process(victim);
+                /* nothing could be killed: select again */
+        } while (!mm);
+        read_unlock(&tasklist_lock);
+        if (mm)
+                mmput(mm);
+        /*
+         * Give the victim a good chance of killing itself before we
+         * retry to allocate memory.
+         */
+        __set_current_state(TASK_INTERRUPTIBLE);
+        schedule_timeout(1);
+}
author	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	2005-04-16 18:20:36 -0400
commit	1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 (patch)
tree	0bba044c4ce775e45a88a51686b5d9f90697ea9d /mm/oom_kill.c

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
new file mode 100644
index 000000000000..9595a0f6c4b8
--- /dev/null
+++ b/mm/oom_kill.c
@@ -0,0 +1,292 @@
	1	/*
	2	* linux/mm/oom_kill.c
	3	*
	4	* Copyright (C) 1998,2000 Rik van Riel
	5	* Thanks go out to Claus Fischer for some serious inspiration and
	6	* for goading me into coding this file...
	7	*
	8	* The routines in this file are used to kill a process when
	9	* we're seriously out of memory. This gets called from kswapd()
	10	* in linux/mm/vmscan.c when we really run out of memory.
	11	*
	12	* Since we won't call these routines often (on a well-configured
	13	* machine) this file will double as a 'coding guide' and a signpost
	14	* for newbie kernel hackers. It features several pointers to major
	15	* kernel subsystems and hints as to where to find out what things do.
	16	*/
	17
	18	#include <linux/mm.h>
	19	#include <linux/sched.h>
	20	#include <linux/swap.h>
	21	#include <linux/timex.h>
	22	#include <linux/jiffies.h>
	23
	24	/* #define DEBUG */
	25
	26	/**
	27	* badness - calculate a numeric value for how bad this task has been
	28	* @p: task struct of which task we should calculate
	29	* @uptime: current uptime in seconds
	30	*
	31	* The formula used is relatively simple and documented inline in the
	32	* function. The main rationale is that we want to select a good task
	33	* to kill when we run out of memory.
	34	*
	35	* Good in this context means that:
	36	* 1) we lose the minimum amount of work done
	37	* 2) we recover a large amount of memory
	38	* 3) we don't kill anything innocent of eating tons of memory
	39	* 4) we want to kill the minimum amount of processes (one)
	40	* 5) we try to kill the process the user expects us to kill, this
	41	* algorithm has been meticulously tuned to meet the principle
	42	* of least surprise ... (be careful when you change it)
	43	*/
	44
	45	unsigned long badness(struct task_struct *p, unsigned long uptime)
	46	{
	47	unsigned long points, cpu_time, run_time, s;
	48	struct list_head *tsk;
	49
	50	if (!p->mm)
	51	return 0;
	52
	53	/*
	54	* The memory size of the process is the basis for the badness.
	55	*/
	56	points = p->mm->total_vm;
	57
	58	/*
	59	* Processes which fork a lot of child processes are likely
	60	* a good choice. We add the vmsize of the childs if they
	61	* have an own mm. This prevents forking servers to flood the
	62	* machine with an endless amount of childs
	63	*/
	64	list_for_each(tsk, &p->children) {
	65	struct task_struct *chld;
	66	chld = list_entry(tsk, struct task_struct, sibling);
	67	if (chld->mm != p->mm && chld->mm)
	68	points += chld->mm->total_vm;
	69	}
	70
	71	/*
	72	* CPU time is in tens of seconds and run time is in thousands
	73	* of seconds. There is no particular reason for this other than
	74	* that it turned out to work very well in practice.
	75	*/
	76	cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
	77	>> (SHIFT_HZ + 3);
	78
	79	if (uptime >= p->start_time.tv_sec)
	80	run_time = (uptime - p->start_time.tv_sec) >> 10;
	81	else
	82	run_time = 0;
	83
	84	s = int_sqrt(cpu_time);
	85	if (s)
	86	points /= s;
	87	s = int_sqrt(int_sqrt(run_time));
	88	if (s)
	89	points /= s;
	90
	91	/*
	92	* Niced processes are most likely less important, so double
	93	* their badness points.
	94	*/
	95	if (task_nice(p) > 0)
	96	points *= 2;
	97
	98	/*
	99	* Superuser processes are usually more important, so we make it
	100	* less likely that we kill those.
	101	*/
	102	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) ||
	103	p->uid == 0 || p->euid == 0)
	104	points /= 4;
	105
	106	/*
	107	* We don't want to kill a process with direct hardware access.
	108	* Not only could that mess up the hardware, but usually users
	109	* tend to only have this flag set on applications they think
	110	* of as important.
	111	*/
	112	if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO))
	113	points /= 4;
	114
	115	/*
	116	* Adjust the score by oomkilladj.
	117	*/
	118	if (p->oomkilladj) {
	119	if (p->oomkilladj > 0)
	120	points <<= p->oomkilladj;
	121	else
	122	points >>= -(p->oomkilladj);
	123	}
	124
	125	#ifdef DEBUG
	126	printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n",
	127	p->pid, p->comm, points);
	128	#endif
	129	return points;
	130	}
	131
	132	/*
	133	* Simple selection loop. We chose the process with the highest
	134	* number of 'points'. We expect the caller will lock the tasklist.
	135	*
	136	* (not docbooked, we don't want this one cluttering up the manual)
	137	*/
	138	static struct task_struct * select_bad_process(void)
	139	{
	140	unsigned long maxpoints = 0;
	141	struct task_struct *g, *p;
	142	struct task_struct *chosen = NULL;
	143	struct timespec uptime;
	144
	145	do_posix_clock_monotonic_gettime(&uptime);
	146	do_each_thread(g, p)
	147	/* skip the init task with pid == 1 */
	148	if (p->pid > 1) {
	149	unsigned long points;
	150
	151	/*
	152	* This is in the process of releasing memory so wait it
	153	* to finish before killing some other task by mistake.
	154	*/
	155	if ((unlikely(test_tsk_thread_flag(p, TIF_MEMDIE)) || (p->flags & PF_EXITING)) &&
	156	!(p->flags & PF_DEAD))
	157	return ERR_PTR(-1UL);
	158	if (p->flags & PF_SWAPOFF)
	159	return p;
	160
	161	points = badness(p, uptime.tv_sec);
	162	if (points > maxpoints || !chosen) {
	163	chosen = p;
	164	maxpoints = points;
	165	}
	166	}
	167	while_each_thread(g, p);
	168	return chosen;
	169	}
	170
	171	/**
	172	* We must be careful though to never send SIGKILL a process with
	173	* CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that
	174	* we select a process with CAP_SYS_RAW_IO set).
	175	*/
	176	static void __oom_kill_task(task_t *p)
	177	{
	178	if (p->pid == 1) {
	179	WARN_ON(1);
	180	printk(KERN_WARNING "tried to kill init!\n");
	181	return;
	182	}
	183
	184	task_lock(p);
	185	if (!p->mm || p->mm == &init_mm) {
	186	WARN_ON(1);
	187	printk(KERN_WARNING "tried to kill an mm-less task!\n");
	188	task_unlock(p);
	189	return;
	190	}
	191	task_unlock(p);
	192	printk(KERN_ERR "Out of Memory: Killed process %d (%s).\n", p->pid, p->comm);
	193
	194	/*
	195	* We give our sacrificial lamb high priority and access to
	196	* all the memory it needs. That way it should be able to
	197	* exit() and clear out its resources quickly...
	198	*/
	199	p->time_slice = HZ;
	200	set_tsk_thread_flag(p, TIF_MEMDIE);
	201
	202	force_sig(SIGKILL, p);
	203	}
	204
	205	static struct mm_struct *oom_kill_task(task_t *p)
	206	{
	207	struct mm_struct *mm = get_task_mm(p);
	208	task_t * g, * q;
	209
	210	if (!mm)
	211	return NULL;
	212	if (mm == &init_mm) {
	213	mmput(mm);
	214	return NULL;
	215	}
	216
	217	__oom_kill_task(p);
	218	/*
	219	* kill all processes that share the ->mm (i.e. all threads),
	220	* but are in a different thread group
	221	*/
	222	do_each_thread(g, q)
	223	if (q->mm == mm && q->tgid != p->tgid)
	224	__oom_kill_task(q);
	225	while_each_thread(g, q);
	226
	227	return mm;
	228	}
	229
	230	static struct mm_struct *oom_kill_process(struct task_struct *p)
	231	{
	232	struct mm_struct *mm;
	233	struct task_struct *c;
	234	struct list_head *tsk;
	235
	236	/* Try to kill a child first */
	237	list_for_each(tsk, &p->children) {
	238	c = list_entry(tsk, struct task_struct, sibling);
	239	if (c->mm == p->mm)
	240	continue;
	241	mm = oom_kill_task(c);
	242	if (mm)
	243	return mm;
	244	}
	245	return oom_kill_task(p);
	246	}
	247
	248	/**
	249	* oom_kill - kill the "best" process when we run out of memory
	250	*
	251	* If we run out of memory, we have the choice between either
	252	* killing a random task (bad), letting the system crash (worse)
	253	* OR try to be smart about which process to kill. Note that we
	254	* don't have to be perfect here, we just have to be good.
	255	*/
	256	void out_of_memory(unsigned int __nocast gfp_mask)
	257	{
	258	struct mm_struct *mm = NULL;
	259	task_t * p;
	260
	261	read_lock(&tasklist_lock);
	262	retry:
	263	p = select_bad_process();
	264
	265	if (PTR_ERR(p) == -1UL)
	266	goto out;
	267
	268	/* Found nothing?!?! Either we hang forever, or we panic. */
	269	if (!p) {
	270	read_unlock(&tasklist_lock);
	271	show_free_areas();
	272	panic("Out of memory and no killable processes...\n");
	273	}
	274
	275	printk("oom-killer: gfp_mask=0x%x\n", gfp_mask);
	276	show_free_areas();
	277	mm = oom_kill_process(p);
	278	if (!mm)
	279	goto retry;
	280
	281	out:
	282	read_unlock(&tasklist_lock);
	283	if (mm)
	284	mmput(mm);
	285
	286	/*
	287	* Give "p" a good chance of killing itself before we
	288	* retry to allocate memory.
	289	*/
	290	__set_current_state(TASK_INTERRUPTIBLE);
	291	schedule_timeout(1);
	292	}