1 files changed, 119 insertions, 135 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
        return container_of(task_subsys_state(task, cpuset_subsys_id),
                            struct cpuset, css);
 }
-struct cpuset_hotplug_scanner {
-        struct cgroup_scanner scan;
-        struct cgroup *to;
-};
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
        return 0;
 }
+#ifdef CONFIG_SMP
 /*
 * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
        put_online_cpus();
 }
+#else /* !CONFIG_SMP */
+/* !CONFIG_SMP stub: the rebuild work item has nothing to do. */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+}
+/*
+ * !CONFIG_SMP stub: report one domain and hand back no domain masks.
+ * Both out-parameters are cleared so that a caller which forwards
+ * *attributes never passes on an uninitialized pointer.
+ */
+static int generate_sched_domains(struct cpumask **domains,
+                        struct sched_domain_attr **attributes)
+{
+        *domains = NULL;
+        *attributes = NULL;
+        return 1;
+}
+#endif /* CONFIG_SMP */
 static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
        mutex_unlock(&callback_mutex);
 }
+/*
+ * cgroup_scan_tasks() callback installed by update_tasks_nodemask(): rebind
+ * task @p's vmas to the cpuset's new mems_allowed, and migrate its pages to
+ * the new nodes if the memory_migrate flag is set.  Called with cgroup_mutex
+ * held.
+ *
+ * @scan->cg is the cgroup whose cpuset is being updated and @scan->data is
+ * that cpuset's previous mems_allowed, used as the migration source.  A task
+ * for which get_task_mm() returns NULL is skipped.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+                                   struct cgroup_scanner *scan)
+{
+        struct mm_struct *mm;
+        struct cpuset *cs;
+        int migrate;
+        const nodemask_t *oldmem = scan->data;
+        mm = get_task_mm(p);
+        if (!mm)
+                return;
+        cs = cgroup_cs(scan->cg);
+        migrate = is_memory_migrate(cs);
+        mpol_rebind_mm(mm, &cs->mems_allowed);
+        if (migrate)
+                cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+        mmput(mm);      /* pairs with get_task_mm() above */
+}
 static void *cpuset_being_rebound;
 /**
 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
 * @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value: cgroup_scan_tasks()'s result is ignored.  It is guaranteed
+ * to be 0 if @heap != NULL; a failure with @heap == NULL goes unreported.
 */
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+                                 struct ptr_heap *heap)
 {
-        struct task_struct *p;
+        struct cgroup_scanner scan;
-        struct mm_struct **mmarray;
-        int i, n, ntasks;
-        int migrate;
-        int fudge;
-        struct cgroup_iter it;
-        int retval;
        cpuset_being_rebound = cs;              /* causes mpol_dup() rebind */
-        fudge = 10;                             /* spare mmarray[] slots */
+        scan.cg = cs->css.cgroup;
-        fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
+        scan.test_task = NULL;
-        retval = -ENOMEM;
+        scan.process_task = cpuset_change_nodemask;
+        scan.heap = heap;
-        /*
+        scan.data = (nodemask_t *)oldmem;
-         * Allocate mmarray[] to hold mm reference for each task
-         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-         * tasklist_lock.  We could use GFP_ATOMIC, but with a
-         * few more lines of code, we can retry until we get a big
-         * enough mmarray[] w/o using GFP_ATOMIC.
-         */
-        while (1) {
-                ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-                ntasks += fudge;
-                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-                if (!mmarray)
-                        goto done;
-                read_lock(&tasklist_lock);              /* block fork */
-                if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-                        break;                          /* got enough */
-                read_unlock(&tasklist_lock);            /* try again */
-                kfree(mmarray);
-        }
-        n = 0;
-        /* Load up mmarray[] with mm reference for each task in cpuset. */
-        cgroup_iter_start(cs->css.cgroup, &it);
-        while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-                struct mm_struct *mm;
-                if (n >= ntasks) {
-                        printk(KERN_WARNING
-                                "Cpuset mempolicy rebind incomplete.\n");
-                        break;
-                }
-                mm = get_task_mm(p);
-                if (!mm)
-                        continue;
-                mmarray[n++] = mm;
-        }
-        cgroup_iter_end(cs->css.cgroup, &it);
-        read_unlock(&tasklist_lock);
        /*
-         * Now that we've dropped the tasklist spinlock, we can
+         * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
-         * rebind the vma mempolicies of each mm in mmarray[] to their
+         * take while holding tasklist_lock.  Forks can happen - the
-         * new cpuset, and release that mm.  The mpol_rebind_mm()
+         * mpol_dup() cpuset_being_rebound check will catch such forks,
-         * call takes mmap_sem, which we couldn't take while holding
+         * and rebind their vma mempolicies too.  Because we still hold
-         * tasklist_lock.  Forks can happen again now - the mpol_dup()
+         * the global cgroup_mutex, we know that no other rebind effort
-         * cpuset_being_rebound check will catch such forks, and rebind
+         * will be contending for the global variable cpuset_being_rebound.
-         * their vma mempolicies too.  Because we still hold the global
-         * cgroup_mutex, we know that no other rebind effort will
-         * be contending for the global variable cpuset_being_rebound.
         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
         * is idempotent.  Also migrate pages in each mm to new nodes.
         */
-        migrate = is_memory_migrate(cs);
+        cgroup_scan_tasks(&scan);
-        for (i = 0; i < n; i++) {
-                struct mm_struct *mm = mmarray[i];
-                mpol_rebind_mm(mm, &cs->mems_allowed);
-                if (migrate)
-                        cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
-                mmput(mm);
-        }
        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
-        kfree(mmarray);
        cpuset_being_rebound = NULL;
-        retval = 0;
-done:
-        return retval;
 }
 /*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 {
        nodemask_t oldmem;
        int retval;
+        struct ptr_heap heap;
        /*
-         * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+         * top_cpuset.mems_allowed tracks node_states[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
        if (retval < 0)
                goto done;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (retval < 0)
+                goto done;
        mutex_lock(&callback_mutex);
        cs->mems_allowed = trialcs->mems_allowed;
        cs->mems_generation = cpuset_mems_generation++;
        mutex_unlock(&callback_mutex);
-        retval = update_tasks_nodemask(cs, &oldmem);
+        update_tasks_nodemask(cs, &oldmem, &heap);
+        heap_free(&heap);
 done:
        return retval;
 }
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
+#ifdef CONFIG_SMP
        if (val < -1 || val >= SD_LV_MAX)
                return -EINVAL;
+#endif
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
                             struct cgroup *cont, struct task_struct *tsk)
 {
        struct cpuset *cs = cgroup_cs(cont);
-        int ret = 0;
        if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                return -ENOSPC;
-        if (tsk->flags & PF_THREAD_BOUND) {
+        /*
-                mutex_lock(&callback_mutex);
+         * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
-                if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
+         * cannot change their cpu affinity and isolating such threads by their
-                        ret = -EINVAL;
+         * set of allowed nodes is unnecessary.  Thus, cpusets are not
-                mutex_unlock(&callback_mutex);
+         * applicable for such threads.  This prevents checking for success of
-        }
+         * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+         * be changed.
+         */
+        if (tsk->flags & PF_THREAD_BOUND)
+                return -EINVAL;
-        return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+        return security_task_setscheduler(tsk, 0, NULL);
 }
 static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
                .read_u64 = cpuset_read_u64,
                .write_u64 = cpuset_write_u64,
                .private = FILE_MEMORY_PRESSURE,
+                .mode = S_IRUGO,
        },
        {
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
 static void cpuset_do_move_task(struct task_struct *tsk,
                                struct cgroup_scanner *scan)
 {
-        struct cpuset_hotplug_scanner *chsp;
+        struct cgroup *new_cgroup = scan->data;
-        chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
+        cgroup_attach_task(new_cgroup, tsk);
-        cgroup_attach_task(chsp->to, tsk);
 }
 /**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
 */
 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 {
-        struct cpuset_hotplug_scanner scan;
+        struct cgroup_scanner scan;
-        scan.scan.cg = from->css.cgroup;
+        scan.cg = from->css.cgroup;
-        scan.scan.test_task = NULL; /* select all tasks in cgroup */
+        scan.test_task = NULL; /* select all tasks in cgroup */
-        scan.scan.process_task = cpuset_do_move_task;
+        scan.process_task = cpuset_do_move_task;
-        scan.scan.heap = NULL;
+        scan.heap = NULL;
-        scan.to = to->css.cgroup;
+        scan.data = to->css.cgroup;
-        if (cgroup_scan_tasks(&scan.scan))
+        if (cgroup_scan_tasks(&scan))
                printk(KERN_ERR "move_member_tasks_to_cpuset: "
                                "cgroup_scan_tasks failed\n");
 }
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
                        remove_tasks_in_empty_cpuset(cp);
                else {
                        update_tasks_cpumask(cp, NULL);
-                        update_tasks_nodemask(cp, &oldmems);
+                        update_tasks_nodemask(cp, &oldmems, NULL);
                }
        }
 }
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
        }
        cgroup_lock();
+        mutex_lock(&callback_mutex);
        cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+        mutex_unlock(&callback_mutex);
        scan_for_empty_cpusets(&top_cpuset);
        ndoms = generate_sched_domains(&doms, &attr);
        cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
        cgroup_lock();
        switch (action) {
        case MEM_ONLINE:
-                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-                break;
        case MEM_OFFLINE:
+                mutex_lock(&callback_mutex);
                top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-                scan_for_empty_cpusets(&top_cpuset);
+                mutex_unlock(&callback_mutex);
+                if (action == MEM_OFFLINE)
+                        scan_for_empty_cpusets(&top_cpuset);
                break;
        default:
                break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 }
 /**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
- * @z: is this zone on an allowed node?
+ * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
- * If we're in interrupt, yes, we can always allocate.  If
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
- * __GFP_THISNODE is set, yes, we can always allocate.  If zone
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
- * z's node is in our tasks mems_allowed, yes.  If it's not a
+ * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
- * If the task has been OOM killed and has access to memory reserves
+ * flag, yes.
- * as specified by the TIF_MEMDIE flag, yes.
 * Otherwise, no.
 *
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
+ * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
+ * might sleep, and might allow a node from an enclosing cpuset.
- * from an enclosing cpuset.
 *
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * hardwall cpusets, and never sleeps.
+ * cpusets, and never sleeps.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 *      GFP_USER     - only nodes in current tasks mems allowed ok.
 *
 * Rule:
- *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
 *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
 *    the code that might scan up ancestor cpusets and sleep.
 */
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 {
-        int node;                       /* node that zone z is on */
        const struct cpuset *cs;        /* current cpuset ancestors */
-        int allowed;                    /* is allocation in zone z allowed? */
+        int allowed;                    /* is allocation on node allowed? */
        if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
                return 1;
-        node = zone_to_nid(z);
        might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
        if (node_isset(node, current->mems_allowed))
                return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 }
 /*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @z: is this zone on an allowed node?
+ * @node: is this an allowed node?
 * @gfp_mask: memory allocation flags
 *
- * If we're in interrupt, yes, we can always allocate.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
- * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
- * z's node is in our tasks mems_allowed, yes.   If the task has been
+ * yes.  If the task has been OOM killed and has access to memory reserves as
- * OOM killed and has access to memory reserves as specified by the
+ * specified by the TIF_MEMDIE flag, yes.
- * TIF_MEMDIE flag, yes.  Otherwise, no.
+ * Otherwise, no.
 *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 * any node on the zonelist except the first.  By the time any such
 * calls get to this routine, we should just shut up and say 'yes'.
 *
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * this variant requires that the node be in the current task's
 * mems_allowed or that we're in interrupt.  It does not scan up the
 * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
 * It never sleeps.
 */
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
 {
-        int node;                       /* node that zone z is on */
        if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
                return 1;
-        node = zone_to_nid(z);
        if (node_isset(node, current->mems_allowed))
                return 1;
        /*

diff --git a/kernel/cpuset.c b/kernel/cpuset.c index f76db9dcaa05..026faccca869 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
128	return container_of(task_subsys_state(task, cpuset_subsys_id),	128	return container_of(task_subsys_state(task, cpuset_subsys_id),
129	struct cpuset, css);	129	struct cpuset, css);
130	}	130	}
131	struct cpuset_hotplug_scanner {
132	struct cgroup_scanner scan;
133	struct cgroup *to;
134	};
135		131
136	/* bits in struct cpuset flags field */	132	/* bits in struct cpuset flags field */
137	typedef enum {	133	typedef enum {
@@ -521,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
521	return 0;	517	return 0;
522	}	518	}
523		519
		520	#ifdef CONFIG_SMP
524	/*	521	/*
525	* Helper routine for generate_sched_domains().	522	* Helper routine for generate_sched_domains().
526	* Do cpusets a, b have overlapping cpus_allowed masks?	523	* Do cpusets a, b have overlapping cpus_allowed masks?
@@ -815,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
815		812
816	put_online_cpus();	813	put_online_cpus();
817	}	814	}
		815	#else /* !CONFIG_SMP */
		816	static void do_rebuild_sched_domains(struct work_struct *unused)
		817	{
		818	}
		819
		820	static int generate_sched_domains(struct cpumask **domains,
		821	struct sched_domain_attr **attributes)
		822	{
		823	*domains = NULL;
		824	return 1;
		825	}
		826	#endif /* CONFIG_SMP */
818		827
819	static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);	828	static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
820		829
@@ -1026,101 +1035,70 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
1026	mutex_unlock(&callback_mutex);	1035	mutex_unlock(&callback_mutex);
1027	}	1036	}
1028		1037
		1038	/*
		1039	* Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
		1040	* nodes if memory_migrate flag is set. Called with cgroup_mutex held.
		1041	*/
		1042	static void cpuset_change_nodemask(struct task_struct *p,
		1043	struct cgroup_scanner *scan)
		1044	{
		1045	struct mm_struct *mm;
		1046	struct cpuset *cs;
		1047	int migrate;
		1048	const nodemask_t *oldmem = scan->data;
		1049
		1050	mm = get_task_mm(p);
		1051	if (!mm)
		1052	return;
		1053
		1054	cs = cgroup_cs(scan->cg);
		1055	migrate = is_memory_migrate(cs);
		1056
		1057	mpol_rebind_mm(mm, &cs->mems_allowed);
		1058	if (migrate)
		1059	cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
		1060	mmput(mm);
		1061	}
		1062
1029	static void *cpuset_being_rebound;	1063	static void *cpuset_being_rebound;
1030		1064
1031	/**	1065	/**
1032	* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.	1066	* update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
1033	* @cs: the cpuset in which each task's mems_allowed mask needs to be changed	1067	* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
1034	* @oldmem: old mems_allowed of cpuset cs	1068	* @oldmem: old mems_allowed of cpuset cs
		1069	* @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
1035	*	1070	*
1036	* Called with cgroup_mutex held	1071	* Called with cgroup_mutex held
1037	* Return 0 if successful, -errno if not.	1072	* No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
		1073	* if @heap != NULL.
1038	*/	1074	*/
1039	static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)	1075	static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
		1076	struct ptr_heap *heap)
1040	{	1077	{
1041	struct task_struct *p;	1078	struct cgroup_scanner scan;
1042	struct mm_struct **mmarray;
1043	int i, n, ntasks;
1044	int migrate;
1045	int fudge;
1046	struct cgroup_iter it;
1047	int retval;
1048		1079
1049	cpuset_being_rebound = cs; /* causes mpol_dup() rebind */	1080	cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
1050		1081
1051	fudge = 10; /* spare mmarray[] slots */	1082	scan.cg = cs->css.cgroup;
1052	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */	1083	scan.test_task = NULL;
1053	retval = -ENOMEM;	1084	scan.process_task = cpuset_change_nodemask;
1054		1085	scan.heap = heap;
1055	/*	1086	scan.data = (nodemask_t *)oldmem;
1056	* Allocate mmarray[] to hold mm reference for each task
1057	* in cpuset cs. Can't kmalloc GFP_KERNEL while holding
1058	* tasklist_lock. We could use GFP_ATOMIC, but with a
1059	* few more lines of code, we can retry until we get a big
1060	* enough mmarray[] w/o using GFP_ATOMIC.
1061	*/
1062	while (1) {
1063	ntasks = cgroup_task_count(cs->css.cgroup); /* guess */
1064	ntasks += fudge;
1065	mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
1066	if (!mmarray)
1067	goto done;
1068	read_lock(&tasklist_lock); /* block fork */
1069	if (cgroup_task_count(cs->css.cgroup) <= ntasks)
1070	break; /* got enough */
1071	read_unlock(&tasklist_lock); /* try again */
1072	kfree(mmarray);
1073	}
1074
1075	n = 0;
1076
1077	/* Load up mmarray[] with mm reference for each task in cpuset. */
1078	cgroup_iter_start(cs->css.cgroup, &it);
1079	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
1080	struct mm_struct *mm;
1081
1082	if (n >= ntasks) {
1083	printk(KERN_WARNING
1084	"Cpuset mempolicy rebind incomplete.\n");
1085	break;
1086	}
1087	mm = get_task_mm(p);
1088	if (!mm)
1089	continue;
1090	mmarray[n++] = mm;
1091	}
1092	cgroup_iter_end(cs->css.cgroup, &it);
1093	read_unlock(&tasklist_lock);
1094		1087
1095	/*	1088	/*
1096	* Now that we've dropped the tasklist spinlock, we can	1089	* The mpol_rebind_mm() call takes mmap_sem, which we couldn't
1097	* rebind the vma mempolicies of each mm in mmarray[] to their	1090	* take while holding tasklist_lock. Forks can happen - the
1098	* new cpuset, and release that mm. The mpol_rebind_mm()	1091	* mpol_dup() cpuset_being_rebound check will catch such forks,
1099	* call takes mmap_sem, which we couldn't take while holding	1092	* and rebind their vma mempolicies too. Because we still hold
1100	* tasklist_lock. Forks can happen again now - the mpol_dup()	1093	* the global cgroup_mutex, we know that no other rebind effort
1101	* cpuset_being_rebound check will catch such forks, and rebind	1094	* will be contending for the global variable cpuset_being_rebound.
1102	* their vma mempolicies too. Because we still hold the global
1103	* cgroup_mutex, we know that no other rebind effort will
1104	* be contending for the global variable cpuset_being_rebound.
1105	* It's ok if we rebind the same mm twice; mpol_rebind_mm()	1095	* It's ok if we rebind the same mm twice; mpol_rebind_mm()
1106	* is idempotent. Also migrate pages in each mm to new nodes.	1096	* is idempotent. Also migrate pages in each mm to new nodes.
1107	*/	1097	*/
1108	migrate = is_memory_migrate(cs);	1098	cgroup_scan_tasks(&scan);
1109	for (i = 0; i < n; i++) {
1110	struct mm_struct *mm = mmarray[i];
1111
1112	mpol_rebind_mm(mm, &cs->mems_allowed);
1113	if (migrate)
1114	cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1115	mmput(mm);
1116	}
1117		1099
1118	/* We're done rebinding vmas to this cpuset's new mems_allowed. */	1100	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
1119	kfree(mmarray);
1120	cpuset_being_rebound = NULL;	1101	cpuset_being_rebound = NULL;
1121	retval = 0;
1122	done:
1123	return retval;
1124	}	1102	}
1125		1103
1126	/*	1104	/*
@@ -1141,6 +1119,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1141	{	1119	{
1142	nodemask_t oldmem;	1120	nodemask_t oldmem;
1143	int retval;	1121	int retval;
		1122	struct ptr_heap heap;
1144		1123
1145	/*	1124	/*
1146	* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];	1125	* top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1175,12 +1154,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
1175	if (retval < 0)	1154	if (retval < 0)
1176	goto done;	1155	goto done;
1177		1156
		1157	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
		1158	if (retval < 0)
		1159	goto done;
		1160
1178	mutex_lock(&callback_mutex);	1161	mutex_lock(&callback_mutex);
1179	cs->mems_allowed = trialcs->mems_allowed;	1162	cs->mems_allowed = trialcs->mems_allowed;
1180	cs->mems_generation = cpuset_mems_generation++;	1163	cs->mems_generation = cpuset_mems_generation++;
1181	mutex_unlock(&callback_mutex);	1164	mutex_unlock(&callback_mutex);
1182		1165
1183	retval = update_tasks_nodemask(cs, &oldmem);	1166	update_tasks_nodemask(cs, &oldmem, &heap);
		1167
		1168	heap_free(&heap);
1184	done:	1169	done:
1185	return retval;	1170	return retval;
1186	}	1171	}
@@ -1192,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
1192		1177
1193	static int update_relax_domain_level(struct cpuset *cs, s64 val)	1178	static int update_relax_domain_level(struct cpuset *cs, s64 val)
1194	{	1179	{
		1180	#ifdef CONFIG_SMP
1195	if (val < -1 || val >= SD_LV_MAX)	1181	if (val < -1 || val >= SD_LV_MAX)
1196	return -EINVAL;	1182	return -EINVAL;
		1183	#endif
1197		1184
1198	if (val != cs->relax_domain_level) {	1185	if (val != cs->relax_domain_level) {
1199	cs->relax_domain_level = val;	1186	cs->relax_domain_level = val;
@@ -1355,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1355	struct cgroup *cont, struct task_struct *tsk)	1342	struct cgroup *cont, struct task_struct *tsk)
1356	{	1343	{
1357	struct cpuset *cs = cgroup_cs(cont);	1344	struct cpuset *cs = cgroup_cs(cont);
1358	int ret = 0;
1359		1345
1360	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))	1346	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1361	return -ENOSPC;	1347	return -ENOSPC;
1362		1348
1363	if (tsk->flags & PF_THREAD_BOUND) {	1349	/*
1364	mutex_lock(&callback_mutex);	1350	* Kthreads bound to specific cpus cannot be moved to a new cpuset; we
1365	if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))	1351	* cannot change their cpu affinity and isolating such threads by their
1366	ret = -EINVAL;	1352	* set of allowed nodes is unnecessary. Thus, cpusets are not
1367	mutex_unlock(&callback_mutex);	1353	* applicable for such threads. This prevents checking for success of
1368	}	1354	* set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
		1355	* be changed.
		1356	*/
		1357	if (tsk->flags & PF_THREAD_BOUND)
		1358	return -EINVAL;
1369		1359
1370	return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);	1360	return security_task_setscheduler(tsk, 0, NULL);
1371	}	1361	}
1372		1362
1373	static void cpuset_attach(struct cgroup_subsys *ss,	1363	static void cpuset_attach(struct cgroup_subsys *ss,
@@ -1706,6 +1696,7 @@ static struct cftype files[] = {
1706	.read_u64 = cpuset_read_u64,	1696	.read_u64 = cpuset_read_u64,
1707	.write_u64 = cpuset_write_u64,	1697	.write_u64 = cpuset_write_u64,
1708	.private = FILE_MEMORY_PRESSURE,	1698	.private = FILE_MEMORY_PRESSURE,
		1699	.mode = S_IRUGO,
1709	},	1700	},
1710		1701
1711	{	1702	{
@@ -1913,10 +1904,9 @@ int __init cpuset_init(void)
1913	static void cpuset_do_move_task(struct task_struct *tsk,	1904	static void cpuset_do_move_task(struct task_struct *tsk,
1914	struct cgroup_scanner *scan)	1905	struct cgroup_scanner *scan)
1915	{	1906	{
1916	struct cpuset_hotplug_scanner *chsp;	1907	struct cgroup *new_cgroup = scan->data;
1917		1908
1918	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);	1909	cgroup_attach_task(new_cgroup, tsk);
1919	cgroup_attach_task(chsp->to, tsk);
1920	}	1910	}
1921		1911
1922	/**	1912	/**
@@ -1932,15 +1922,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
1932	*/	1922	*/
1933	static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)	1923	static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1934	{	1924	{
1935	struct cpuset_hotplug_scanner scan;	1925	struct cgroup_scanner scan;
1936		1926
1937	scan.scan.cg = from->css.cgroup;	1927	scan.cg = from->css.cgroup;
1938	scan.scan.test_task = NULL; /* select all tasks in cgroup */	1928	scan.test_task = NULL; /* select all tasks in cgroup */
1939	scan.scan.process_task = cpuset_do_move_task;	1929	scan.process_task = cpuset_do_move_task;
1940	scan.scan.heap = NULL;	1930	scan.heap = NULL;
1941	scan.to = to->css.cgroup;	1931	scan.data = to->css.cgroup;
1942		1932
1943	if (cgroup_scan_tasks(&scan.scan))	1933	if (cgroup_scan_tasks(&scan))
1944	printk(KERN_ERR "move_member_tasks_to_cpuset: "	1934	printk(KERN_ERR "move_member_tasks_to_cpuset: "
1945	"cgroup_scan_tasks failed\n");	1935	"cgroup_scan_tasks failed\n");
1946	}	1936	}
@@ -2033,7 +2023,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
2033	remove_tasks_in_empty_cpuset(cp);	2023	remove_tasks_in_empty_cpuset(cp);
2034	else {	2024	else {
2035	update_tasks_cpumask(cp, NULL);	2025	update_tasks_cpumask(cp, NULL);
2036	update_tasks_nodemask(cp, &oldmems);	2026	update_tasks_nodemask(cp, &oldmems, NULL);
2037	}	2027	}
2038	}	2028	}
2039	}	2029	}
@@ -2069,7 +2059,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
2069	}	2059	}
2070		2060
2071	cgroup_lock();	2061	cgroup_lock();
		2062	mutex_lock(&callback_mutex);
2072	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);	2063	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
		2064	mutex_unlock(&callback_mutex);
2073	scan_for_empty_cpusets(&top_cpuset);	2065	scan_for_empty_cpusets(&top_cpuset);
2074	ndoms = generate_sched_domains(&doms, &attr);	2066	ndoms = generate_sched_domains(&doms, &attr);
2075	cgroup_unlock();	2067	cgroup_unlock();
@@ -2092,11 +2084,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
2092	cgroup_lock();	2084	cgroup_lock();
2093	switch (action) {	2085	switch (action) {
2094	case MEM_ONLINE:	2086	case MEM_ONLINE:
2095	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2096	break;
2097	case MEM_OFFLINE:	2087	case MEM_OFFLINE:
		2088	mutex_lock(&callback_mutex);
2098	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];	2089	top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2099	scan_for_empty_cpusets(&top_cpuset);	2090	mutex_unlock(&callback_mutex);
		2091	if (action == MEM_OFFLINE)
		2092	scan_for_empty_cpusets(&top_cpuset);
2100	break;	2093	break;
2101	default:	2094	default:
2102	break;	2095	break;
@@ -2206,26 +2199,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2206	}	2199	}
2207		2200
2208	/**	2201	/**
2209	* cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?	2202	* cpuset_node_allowed_softwall - Can we allocate on a memory node?
2210	* @z: is this zone on an allowed node?	2203	* @node: is this an allowed node?
2211	* @gfp_mask: memory allocation flags	2204	* @gfp_mask: memory allocation flags
2212	*	2205	*
2213	* If we're in interrupt, yes, we can always allocate. If	2206	* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2214	* __GFP_THISNODE is set, yes, we can always allocate. If zone	2207	* set, yes, we can always allocate. If node is in our task's mems_allowed,
2215	* z's node is in our tasks mems_allowed, yes. If it's not a	2208	* yes. If it's not a __GFP_HARDWALL request and this node is in the nearest
2216	* __GFP_HARDWALL request and this zone's nodes is in the nearest	2209	* hardwalled cpuset ancestor to this task's cpuset, yes. If the task has been
2217	* hardwalled cpuset ancestor to this tasks cpuset, yes.	2210	* OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
2218	* If the task has been OOM killed and has access to memory reserves	2211	* flag, yes.
2219	* as specified by the TIF_MEMDIE flag, yes.
2220	* Otherwise, no.	2212	* Otherwise, no.
2221	*	2213	*
2222	* If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()	2214	* If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
2223	* reduces to cpuset_zone_allowed_hardwall(). Otherwise,	2215	* cpuset_node_allowed_hardwall(). Otherwise, cpuset_node_allowed_softwall()
2224	* cpuset_zone_allowed_softwall() might sleep, and might allow a zone	2216	* might sleep, and might allow a node from an enclosing cpuset.
2225	* from an enclosing cpuset.
2226	*	2217	*
2227	* cpuset_zone_allowed_hardwall() only handles the simpler case of	2218	* cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
2228	* hardwall cpusets, and never sleeps.	2219	* cpusets, and never sleeps.
2229	*	2220	*
2230	* The __GFP_THISNODE placement logic is really handled elsewhere,	2221	* The __GFP_THISNODE placement logic is really handled elsewhere,
2231	* by forcibly using a zonelist starting at a specified node, and by	2222	* by forcibly using a zonelist starting at a specified node, and by
@@ -2264,20 +2255,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
2264	* GFP_USER - only nodes in current tasks mems allowed ok.	2255	* GFP_USER - only nodes in current tasks mems allowed ok.
2265	*	2256	*
2266	* Rule:	2257	* Rule:
2267	* Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you	2258	* Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
2268	* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables	2259	* pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
2269	* the code that might scan up ancestor cpusets and sleep.	2260	* the code that might scan up ancestor cpusets and sleep.
2270	*/	2261	*/
2271		2262	int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
2272	int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2273	{	2263	{
2274	int node; /* node that zone z is on */
2275	const struct cpuset *cs; /* current cpuset ancestors */	2264	const struct cpuset *cs; /* current cpuset ancestors */
2276	int allowed; /* is allocation in zone z allowed? */	2265	int allowed; /* is allocation in zone z allowed? */
2277		2266
2278	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))	2267	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2279	return 1;	2268	return 1;
2280	node = zone_to_nid(z);
2281	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));	2269	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2282	if (node_isset(node, current->mems_allowed))	2270	if (node_isset(node, current->mems_allowed))
2283	return 1;	2271	return 1;
@@ -2306,15 +2294,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2306	}	2294	}
2307		2295
2308	/*	2296	/*
2309	* cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?	2297	* cpuset_node_allowed_hardwall - Can we allocate on a memory node?
2310	* @z: is this zone on an allowed node?	2298	* @node: is this an allowed node?
2311	* @gfp_mask: memory allocation flags	2299	* @gfp_mask: memory allocation flags
2312	*	2300	*
2313	* If we're in interrupt, yes, we can always allocate.	2301	* If we're in interrupt, yes, we can always allocate. If __GFP_THISNODE is
2314	* If __GFP_THISNODE is set, yes, we can always allocate. If zone	2302	* set, yes, we can always allocate. If node is in our task's mems_allowed,
2315	* z's node is in our tasks mems_allowed, yes. If the task has been	2303	* yes. If the task has been OOM killed and has access to memory reserves as
2316	* OOM killed and has access to memory reserves as specified by the	2304	* specified by the TIF_MEMDIE flag, yes.
2317	* TIF_MEMDIE flag, yes. Otherwise, no.	2305	* Otherwise, no.
2318	*	2306	*
2319	* The __GFP_THISNODE placement logic is really handled elsewhere,	2307	* The __GFP_THISNODE placement logic is really handled elsewhere,
2320	* by forcibly using a zonelist starting at a specified node, and by	2308	* by forcibly using a zonelist starting at a specified node, and by
@@ -2322,20 +2310,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
2322	* any node on the zonelist except the first. By the time any such	2310	* any node on the zonelist except the first. By the time any such
2323	* calls get to this routine, we should just shut up and say 'yes'.	2311	* calls get to this routine, we should just shut up and say 'yes'.
2324	*	2312	*
2325	* Unlike the cpuset_zone_allowed_softwall() variant, above,	2313	* Unlike the cpuset_node_allowed_softwall() variant, above,
2326	* this variant requires that the zone be in the current tasks	2314	* this variant requires that the node be in the current task's
2327	* mems_allowed or that we're in interrupt. It does not scan up the	2315	* mems_allowed or that we're in interrupt. It does not scan up the
2328	* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.	2316	* cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
2329	* It never sleeps.	2317	* It never sleeps.
2330	*/	2318	*/
2331		2319	int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
2332	int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
2333	{	2320	{
2334	int node; /* node that zone z is on */
2335
2336	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))	2321	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2337	return 1;	2322	return 1;
2338	node = zone_to_nid(z);
2339	if (node_isset(node, current->mems_allowed))	2323	if (node_isset(node, current->mems_allowed))
2340	return 1;	2324	return 1;
2341	/*	2325	/*