Fix cpusets update_cpumask

Cause writes to cpuset "cpus" file to update cpus_allowed for member tasks:

- collect batches of tasks under tasklist_lock and then call set_cpus_allowed() on them outside the lock (since this can sleep).

- add a simple generic priority heap type to allow efficient collection of batches of tasks to be processed without duplicating or missing any tasks in subsequent batches.

- make "cpus" file update a no-op if the mask hasn't changed

- fix race between update_cpumask() and sched_setaffinity() by making sched_setaffinity() post-check that it's not running on any cpus outside cpuset_cpus_allowed().

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Paul Menage <menage@google.com> 2007-10-19 02:40:22 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-19 14:53:41 -0400
commit: 8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree: 1e9ac6b15027bd55263378e551c1595a937d66d6
parent: 020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)
5 files changed, 243 insertions, 5 deletions
diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h
new file mode 100644
index 000000000000..08094350f26a
--- /dev/null
+++ b/include/linux/prio_heap.h
@@ -0,0 +1,58 @@
+#ifndef _LINUX_PRIO_HEAP_H
+#define _LINUX_PRIO_HEAP_H
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+#include <linux/gfp.h>
+/**
+ * struct ptr_heap - simple static-sized priority heap
+ * @ptrs: pointer to data area
+ * @max: max number of elements that can be stored in @ptrs
+ * @size: current number of valid elements in @ptrs (they occupy
+ *        indices 0..@size-1)
+ * @gt: comparison operator, which should implement "greater than"
+ */
+struct ptr_heap {
+        void **ptrs;
+        int max;
+        int size;
+        int (*gt)(void *, void *);
+};
+/**
+ * heap_init - initialize an empty heap with a given memory size
+ * @heap: the heap structure to be initialized
+ * @size: amount of memory to use in bytes
+ * @gfp_mask: mask to pass to kmalloc()
+ * @gt: comparison operator, which should implement "greater than"
+ *
+ * The heap can hold @size / sizeof(void *) pointers.
+ *
+ * Returns 0 on success, or a negative errno value (-ENOMEM when the
+ * storage cannot be allocated).
+ */
+extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+                     int (*gt)(void *, void *));
+/**
+ * heap_free - release a heap's storage
+ * @heap: the heap structure whose data should be released
+ *
+ * Frees only the pointer array allocated by heap_init(); the heap
+ * structure itself and the objects referenced by the stored pointers
+ * are not freed.
+ */
+void heap_free(struct ptr_heap *heap);
+/**
+ * heap_insert - insert a value into the heap and return any overflowed value
+ * @heap: the heap to be operated on
+ * @p: the pointer to be inserted
+ *
+ * Attempts to insert the given value into the priority heap. If the
+ * heap is full prior to the insertion, then the resulting heap will
+ * consist of the smallest @max elements of the original heap and the
+ * new element; the greatest element will be removed from the heap and
+ * returned. Note that the returned element will be the new element
+ * (i.e. no change to the heap) if the new element is greater than all
+ * elements currently in the heap.
+ *
+ * Returns NULL if the heap still had room and @p was simply added;
+ * otherwise returns the element that did not fit, which is either the
+ * previous greatest element or @p itself.
+ */
+extern void *heap_insert(struct ptr_heap *heap, void *p);
+#endif /* _LINUX_PRIO_HEAP_H */
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 64ad59cfad9b..fa31cb9f9898 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/pagemap.h>
+#include <linux/prio_heap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
 #include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
        /* Don't kfree(doms) -- partition_sched_domains() does that. */
 }
+/*
+ * started_after_time - order a task against a reference start time
+ * @t1: task being tested; its start_time is read
+ * @time: reference start time to compare against
+ * @t2: task the reference time was taken from; only its address is
+ *      used, it is never dereferenced here
+ *
+ * Returns 1 if @t1 counts as having started after the reference and
+ * 0 otherwise.
+ */
+static inline int started_after_time(struct task_struct *t1,
+                                     struct timespec *time,
+                                     struct task_struct *t2)
+{
+        int cmp = timespec_compare(&t1->start_time, time);
+        /* Distinct start times settle the question on their own */
+        if (cmp != 0)
+                return cmp > 0;
+        /*
+         * Identical start times: break the tie by address, treating
+         * the task at the lower address as the earlier one. @t2 may
+         * already have exited, so its pointer may be stale, but as
+         * it is only compared and never followed it still works to
+         * tell apart two tasks that started (effectively) together.
+         */
+        return t1 > t2;
+}
+/*
+ * started_after - "greater than" comparison on two tasks for the heap
+ * @p1: first task, passed as an untyped pointer
+ * @p2: second task, passed as an untyped pointer
+ *
+ * Returns non-zero when @p1 is ordered after @p2 by start time, with
+ * ties resolved by started_after_time().
+ */
+static inline int started_after(void *p1, void *p2)
+{
+        struct task_struct *task = p1;
+        struct task_struct *ref = p2;
+        return started_after_time(task, &ref->start_time, ref);
+}
 /*
 * Call with manage_mutex held.  May take callback_mutex during call.
 */
@@ -708,8 +739,15 @@ done:
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
        struct cpuset trialcs;
-        int retval;
+        int retval, i;
-        int cpus_changed, is_load_balanced;
+        int is_load_balanced;
+        struct cgroup_iter it;
+        struct cgroup *cgrp = cs->css.cgroup;
+        struct task_struct *p, *dropped;
+        /* Never dereference latest_task, since it's not refcounted */
+        struct task_struct *latest_task = NULL;
+        struct ptr_heap heap;
+        struct timespec latest_time = { 0, 0 };
        /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
        if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
        if (retval < 0)
                return retval;
-        cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        /* Nothing to do if the cpus didn't change */
+        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
+                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+        if (retval)
+                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
        cs->cpus_allowed = trialcs.cpus_allowed;
        mutex_unlock(&callback_mutex);
-        if (cpus_changed && is_load_balanced)
+ again:
+        /*
+         * Scan tasks in the cpuset, and update the cpumasks of any
+         * that need an update. Since we can't call set_cpus_allowed()
+         * while holding tasklist_lock, gather tasks to be processed
+         * in a heap structure. If the statically-sized heap fills up,
+         * overflow tasks that started later, and in future iterations
+         * only consider tasks that started after the latest task in
+         * the previous pass. This guarantees forward progress and
+         * that we don't miss any tasks
+         */
+        heap.size = 0;
+        cgroup_iter_start(cgrp, &it);
+        while ((p = cgroup_iter_next(cgrp, &it))) {
+                /* Only affect tasks that don't have the right cpus_allowed */
+                if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
+                        continue;
+                /*
+                 * Only process tasks that started after the last task
+                 * we processed
+                 */
+                if (!started_after_time(p, &latest_time, latest_task))
+                        continue;
+                dropped = heap_insert(&heap, p);
+                if (dropped == NULL) {
+                        get_task_struct(p);
+                } else if (dropped != p) {
+                        get_task_struct(p);
+                        put_task_struct(dropped);
+                }
+        }
+        cgroup_iter_end(cgrp, &it);
+        if (heap.size) {
+                for (i = 0; i < heap.size; i++) {
+                        struct task_struct *p = heap.ptrs[i];
+                        if (i == 0) {
+                                latest_time = p->start_time;
+                                latest_task = p;
+                        }
+                        set_cpus_allowed(p, cs->cpus_allowed);
+                        put_task_struct(p);
+                }
+                /*
+                 * If we had to process any tasks at all, scan again
+                 * in case some of them were in the middle of forking
+                 * children that didn't notice the new cpumask
+                 * restriction.  Not the most efficient way to do it,
+                 * but it avoids having to take callback_mutex in the
+                 * fork path
+                 */
+                goto again;
+        }
+        heap_free(&heap);
+        if (is_load_balanced)
                rebuild_sched_domains();
        return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 39d6354af489..72a809a54d5b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
        cpus_allowed = cpuset_cpus_allowed(p);
        cpus_and(new_mask, new_mask, cpus_allowed);
+ again:
        retval = set_cpus_allowed(p, new_mask);
+        if (!retval) {
+                cpus_allowed = cpuset_cpus_allowed(p);
+                if (!cpus_subset(new_mask, cpus_allowed)) {
+                        /*
+                         * We must have raced with a concurrent cpuset
+                         * update. Just reset the cpus_allowed to the
+                         * cpuset's cpus_allowed
+                         */
+                        new_mask = cpus_allowed;
+                        goto again;
+                }
+        }
 out_unlock:
        put_task_struct(p);
        mutex_unlock(&sched_hotcpu_mutex);
diff --git a/lib/Makefile b/lib/Makefile
index c5f215d509d3..3a0983b77412 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
         rbtree.o radix-tree.o dump_stack.o \
         idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
         sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-         proportions.o
+         proportions.o prio_heap.o
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
diff --git a/lib/prio_heap.c b/lib/prio_heap.c
new file mode 100644
index 000000000000..471944a54e23
--- /dev/null
+++ b/lib/prio_heap.c
@@ -0,0 +1,70 @@
+/*
+ * Simple insertion-only static-sized priority heap containing
+ * pointers, based on CLR, chapter 7
+ */
+#include <linux/slab.h>
+#include <linux/prio_heap.h>
+/*
+ * heap_init - allocate the pointer array for an empty heap
+ * @heap: heap structure to initialise
+ * @size: number of bytes to allocate for the pointer array
+ * @gfp_mask: allocation flags handed to kmalloc()
+ * @gt: "greater than" comparison used to order the heap
+ *
+ * Returns 0 on success, -EINVAL if @size cannot hold even a single
+ * pointer, or -ENOMEM if the allocation fails. On failure heap->ptrs
+ * is left NULL.
+ */
+int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
+              int (*gt)(void *, void *))
+{
+        /*
+         * With no room for even one pointer the capacity would be 0,
+         * and heap_insert() would treat the heap as already full and
+         * hand the never-written ptrs[0] to the comparison callback.
+         */
+        if (size < sizeof(void *)) {
+                heap->ptrs = NULL;
+                return -EINVAL;
+        }
+        heap->ptrs = kmalloc(size, gfp_mask);
+        if (!heap->ptrs)
+                return -ENOMEM;
+        heap->size = 0;
+        heap->max = size / sizeof(void *);
+        heap->gt = gt;
+        return 0;
+}
+/*
+ * heap_free - release the pointer array owned by @heap
+ * @heap: heap whose storage should be released
+ *
+ * Only the array is freed; @heap itself and whatever the stored
+ * pointers refer to are left untouched.
+ */
+void heap_free(struct ptr_heap *heap)
+{
+        void **storage = heap->ptrs;
+        kfree(storage);
+}
+/*
+ * heap_insert - add @p to the heap, evicting the greatest entry if full
+ * @heap: heap to operate on
+ * @p: pointer to insert
+ *
+ * The greatest element (according to heap->gt) is kept in slot 0.
+ *
+ * Returns NULL when there was room for @p, @p itself when the heap is
+ * full and @p is greater than the current greatest element, and the
+ * evicted former greatest element otherwise.
+ */
+void *heap_insert(struct ptr_heap *heap, void *p)
+{
+        void **slots = heap->ptrs;
+        void *evicted;
+        int idx, child;
+        if (heap->size < heap->max) {
+                /* Room left: sift the newcomer up from the first free slot */
+                for (idx = heap->size++; idx > 0; idx = (idx - 1) / 2) {
+                        int parent = (idx - 1) / 2;
+                        if (!heap->gt(p, slots[parent]))
+                                break;
+                        slots[idx] = slots[parent];
+                }
+                slots[idx] = p;
+                return NULL;
+        }
+        /* Full heap: one element has to go */
+        /* A newcomer greater than the current maximum is handed straight back */
+        if (heap->gt(p, slots[0]))
+                return p;
+        /* Otherwise the old maximum leaves and the newcomer sinks from the root */
+        evicted = slots[0];
+        slots[0] = p;
+        idx = 0;
+        for (;;) {
+                int top = idx;
+                child = 2 * idx + 1;
+                if (child < heap->size && heap->gt(slots[child], p))
+                        top = child;
+                child = 2 * idx + 2;
+                if (child < heap->size && heap->gt(slots[child], slots[top]))
+                        top = child;
+                if (top == idx)
+                        break;
+                /* Lift the greater child and move the newcomer down one level */
+                slots[idx] = slots[top];
+                slots[top] = p;
+                idx = top;
+        }
+        return evicted;
+}
author	Paul Menage <menage@google.com>	2007-10-19 02:40:22 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-10-19 14:53:41 -0400
commit	8707d8b8c0cbdf4441507f8dded194167da896c7 (patch)
tree	1e9ac6b15027bd55263378e551c1595a937d66d6
parent	020958b6272882c1a8bfbe5f3e0927f3845c2698 (diff)

diff --git a/include/linux/prio_heap.h b/include/linux/prio_heap.h new file mode 100644 index 000000000000..08094350f26a --- /dev/null +++ b/include/linux/prio_heap.h
@@ -0,0 +1,58 @@
		1	#ifndef _LINUX_PRIO_HEAP_H
		2	#define _LINUX_PRIO_HEAP_H
		3
		4	/*
		5	* Simple insertion-only static-sized priority heap containing
		6	* pointers, based on CLR, chapter 7
		7	*/
		8
		9	#include <linux/gfp.h>
		10
		11	/**
		12	* struct ptr_heap - simple static-sized priority heap
		13	* @ptrs - pointer to data area
		14	* @max - max number of elements that can be stored in @ptrs
		15	* @size - current number of valid elements in @ptrs (in the range 0..@size-1)
		16	* @gt: comparison operator, which should implement "greater than"
		17	*/
		18	struct ptr_heap {
		19	void **ptrs;
		20	int max;
		21	int size;
		22	int (*gt)(void *, void *);
		23	};
		24
		25	/**
		26	* heap_init - initialize an empty heap with a given memory size
		27	* @heap: the heap structure to be initialized
		28	* @size: amount of memory to use in bytes
		29	* @gfp_mask: mask to pass to kmalloc()
		30	* @gt: comparison operator, which should implement "greater than"
		31	*/
		32	extern int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
		33	int (*gt)(void *, void *));
		34
		35	/**
		36	* heap_free - release a heap's storage
		37	* @heap: the heap structure whose data should be released
		38	*/
		39	void heap_free(struct ptr_heap *heap);
		40
		41	/**
		42	* heap_insert - insert a value into the heap and return any overflowed value
		43	* @heap: the heap to be operated on
		44	* @p: the pointer to be inserted
		45	*
		46	* Attempts to insert the given value into the priority heap. If the
		47	* heap is full prior to the insertion, then the resulting heap will
		48	* consist of the smallest @max elements of the original heap and the
		49	* new element; the greatest element will be removed from the heap and
		50	* returned. Note that the returned element will be the new element
		51	* (i.e. no change to the heap) if the new element is greater than all
		52	* elements currently in the heap.
		53	*/
		54	extern void *heap_insert(struct ptr_heap *heap, void *p);
		55
		56
		57
		58	#endif /* _LINUX_PRIO_HEAP_H */


diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 64ad59cfad9b..fa31cb9f9898 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -38,6 +38,7 @@
38	#include <linux/mount.h>	38	#include <linux/mount.h>
39	#include <linux/namei.h>	39	#include <linux/namei.h>
40	#include <linux/pagemap.h>	40	#include <linux/pagemap.h>
		41	#include <linux/prio_heap.h>
41	#include <linux/proc_fs.h>	42	#include <linux/proc_fs.h>
42	#include <linux/rcupdate.h>	43	#include <linux/rcupdate.h>
43	#include <linux/sched.h>	44	#include <linux/sched.h>
@@ -701,6 +702,36 @@ done:
701	/* Don't kfree(doms) -- partition_sched_domains() does that. */	702	/* Don't kfree(doms) -- partition_sched_domains() does that. */
702	}	703	}
703		704
		705	static inline int started_after_time(struct task_struct *t1,
		706	struct timespec *time,
		707	struct task_struct *t2)
		708	{
		709	int start_diff = timespec_compare(&t1->start_time, time);
		710	if (start_diff > 0) {
		711	return 1;
		712	} else if (start_diff < 0) {
		713	return 0;
		714	} else {
		715	/*
		716	* Arbitrarily, if two processes started at the same
		717	* time, we'll say that the lower pointer value
		718	* started first. Note that t2 may have exited by now
		719	* so this may not be a valid pointer any longer, but
		720	* that's fine - it still serves to distinguish
		721	* between two tasks started (effectively)
		722	* simultaneously.
		723	*/
		724	return t1 > t2;
		725	}
		726	}
		727
		728	static inline int started_after(void *p1, void *p2)
		729	{
		730	struct task_struct *t1 = p1;
		731	struct task_struct *t2 = p2;
		732	return started_after_time(t1, &t2->start_time, t2);
		733	}
		734
704	/*	735	/*
705	* Call with manage_mutex held. May take callback_mutex during call.	736	* Call with manage_mutex held. May take callback_mutex during call.
706	*/	737	*/
@@ -708,8 +739,15 @@ done:
708	static int update_cpumask(struct cpuset *cs, char *buf)	739	static int update_cpumask(struct cpuset *cs, char *buf)
709	{	740	{
710	struct cpuset trialcs;	741	struct cpuset trialcs;
711	int retval;	742	int retval, i;
712	int cpus_changed, is_load_balanced;	743	int is_load_balanced;
		744	struct cgroup_iter it;
		745	struct cgroup *cgrp = cs->css.cgroup;
		746	struct task_struct *p, *dropped;
		747	/* Never dereference latest_task, since it's not refcounted */
		748	struct task_struct *latest_task = NULL;
		749	struct ptr_heap heap;
		750	struct timespec latest_time = { 0, 0 };
713		751
714	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */	752	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
715	if (cs == &top_cpuset)	753	if (cs == &top_cpuset)
@@ -736,14 +774,73 @@ static int update_cpumask(struct cpuset *cs, char *buf)
736	if (retval < 0)	774	if (retval < 0)
737	return retval;	775	return retval;
738		776
739	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);	777	/* Nothing to do if the cpus didn't change */
		778	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
		779	return 0;
		780	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
		781	if (retval)
		782	return retval;
		783
740	is_load_balanced = is_sched_load_balance(&trialcs);	784	is_load_balanced = is_sched_load_balance(&trialcs);
741		785
742	mutex_lock(&callback_mutex);	786	mutex_lock(&callback_mutex);
743	cs->cpus_allowed = trialcs.cpus_allowed;	787	cs->cpus_allowed = trialcs.cpus_allowed;
744	mutex_unlock(&callback_mutex);	788	mutex_unlock(&callback_mutex);
745		789
746	if (cpus_changed && is_load_balanced)	790	again:
		791	/*
		792	* Scan tasks in the cpuset, and update the cpumasks of any
		793	* that need an update. Since we can't call set_cpus_allowed()
		794	* while holding tasklist_lock, gather tasks to be processed
		795	* in a heap structure. If the statically-sized heap fills up,
		796	* overflow tasks that started later, and in future iterations
		797	* only consider tasks that started after the latest task in
		798	* the previous pass. This guarantees forward progress and
		799	* that we don't miss any tasks
		800	*/
		801	heap.size = 0;
		802	cgroup_iter_start(cgrp, &it);
		803	while ((p = cgroup_iter_next(cgrp, &it))) {
		804	/* Only affect tasks that don't have the right cpus_allowed */
		805	if (cpus_equal(p->cpus_allowed, cs->cpus_allowed))
		806	continue;
		807	/*
		808	* Only process tasks that started after the last task
		809	* we processed
		810	*/
		811	if (!started_after_time(p, &latest_time, latest_task))
		812	continue;
		813	dropped = heap_insert(&heap, p);
		814	if (dropped == NULL) {
		815	get_task_struct(p);
		816	} else if (dropped != p) {
		817	get_task_struct(p);
		818	put_task_struct(dropped);
		819	}
		820	}
		821	cgroup_iter_end(cgrp, &it);
		822	if (heap.size) {
		823	for (i = 0; i < heap.size; i++) {
		824	struct task_struct *p = heap.ptrs[i];
		825	if (i == 0) {
		826	latest_time = p->start_time;
		827	latest_task = p;
		828	}
		829	set_cpus_allowed(p, cs->cpus_allowed);
		830	put_task_struct(p);
		831	}
		832	/*
		833	* If we had to process any tasks at all, scan again
		834	* in case some of them were in the middle of forking
		835	* children that didn't notice the new cpumask
		836	* restriction. Not the most efficient way to do it,
		837	* but it avoids having to take callback_mutex in the
		838	* fork path
		839	*/
		840	goto again;
		841	}
		842	heap_free(&heap);
		843	if (is_load_balanced)
747	rebuild_sched_domains();	844	rebuild_sched_domains();
748		845
749	return 0;	846	return 0;


diff --git a/kernel/sched.c b/kernel/sched.c index 39d6354af489..72a809a54d5b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c
@@ -4471,8 +4471,21 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4471		4471
4472	cpus_allowed = cpuset_cpus_allowed(p);	4472	cpus_allowed = cpuset_cpus_allowed(p);
4473	cpus_and(new_mask, new_mask, cpus_allowed);	4473	cpus_and(new_mask, new_mask, cpus_allowed);
		4474	again:
4474	retval = set_cpus_allowed(p, new_mask);	4475	retval = set_cpus_allowed(p, new_mask);
4475		4476
		4477	if (!retval) {
		4478	cpus_allowed = cpuset_cpus_allowed(p);
		4479	if (!cpus_subset(new_mask, cpus_allowed)) {
		4480	/*
		4481	* We must have raced with a concurrent cpuset
		4482	* update. Just reset the cpus_allowed to the
		4483	* cpuset's cpus_allowed
		4484	*/
		4485	new_mask = cpus_allowed;
		4486	goto again;
		4487	}
		4488	}
4476	out_unlock:	4489	out_unlock:
4477	put_task_struct(p);	4490	put_task_struct(p);
4478	mutex_unlock(&sched_hotcpu_mutex);	4491	mutex_unlock(&sched_hotcpu_mutex);


diff --git a/lib/Makefile b/lib/Makefile index c5f215d509d3..3a0983b77412 100644 --- a/lib/Makefile +++ b/lib/Makefile
@@ -6,7 +6,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
6	rbtree.o radix-tree.o dump_stack.o \	6	rbtree.o radix-tree.o dump_stack.o \
7	idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \	7	idr.o int_sqrt.o bitmap.o extable.o prio_tree.o \
8	sha1.o irq_regs.o reciprocal_div.o argv_split.o \	8	sha1.o irq_regs.o reciprocal_div.o argv_split.o \
9	proportions.o	9	proportions.o prio_heap.o
10		10
11	lib-$(CONFIG_MMU) += ioremap.o	11	lib-$(CONFIG_MMU) += ioremap.o
12	lib-$(CONFIG_SMP) += cpumask.o	12	lib-$(CONFIG_SMP) += cpumask.o


diff --git a/lib/prio_heap.c b/lib/prio_heap.c new file mode 100644 index 000000000000..471944a54e23 --- /dev/null +++ b/lib/prio_heap.c
@@ -0,0 +1,70 @@
		1	/*
		2	* Simple insertion-only static-sized priority heap containing
		3	* pointers, based on CLR, chapter 7
		4	*/
		5
		6	#include <linux/slab.h>
		7	#include <linux/prio_heap.h>
		8
		9	int heap_init(struct ptr_heap *heap, size_t size, gfp_t gfp_mask,
		10	int (*gt)(void *, void *))
		11	{
		12	heap->ptrs = kmalloc(size, gfp_mask);
		13	if (!heap->ptrs)
		14	return -ENOMEM;
		15	heap->size = 0;
		16	heap->max = size / sizeof(void *);
		17	heap->gt = gt;
		18	return 0;
		19	}
		20
		21	void heap_free(struct ptr_heap *heap)
		22	{
		23	kfree(heap->ptrs);
		24	}
		25
		26	void *heap_insert(struct ptr_heap *heap, void *p)
		27	{
		28	void *res;
		29	void **ptrs = heap->ptrs;
		30	int pos;
		31
		32	if (heap->size < heap->max) {
		33	/* Heap insertion */
		34	int pos = heap->size++;
		35	while (pos > 0 && heap->gt(p, ptrs[(pos-1)/2])) {
		36	ptrs[pos] = ptrs[(pos-1)/2];
		37	pos = (pos-1)/2;
		38	}
		39	ptrs[pos] = p;
		40	return NULL;
		41	}
		42
		43	/* The heap is full, so something will have to be dropped */
		44
		45	/* If the new pointer is greater than the current max, drop it */
		46	if (heap->gt(p, ptrs[0]))
		47	return p;
		48
		49	/* Replace the current max and heapify */
		50	res = ptrs[0];
		51	ptrs[0] = p;
		52	pos = 0;
		53
		54	while (1) {
		55	int left = 2 * pos + 1;
		56	int right = 2 * pos + 2;
		57	int largest = pos;
		58	if (left < heap->size && heap->gt(ptrs[left], p))
		59	largest = left;
		60	if (right < heap->size && heap->gt(ptrs[right], ptrs[largest]))
		61	largest = right;
		62	if (largest == pos)
		63	break;
		64	/* Push p down the heap one level and bump one up */
		65	ptrs[pos] = ptrs[largest];
		66	ptrs[largest] = p;
		67	pos = largest;
		68	}
		69	return res;
		70	}