1 files changed, 198 insertions, 153 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index d5ab79cf516d..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
 *  2003-10-22 Updates by Stephen Hemminger.
 *  2004 May-July Rework by Paul Jackson.
 *  2006 Rework by Paul Menage to use generic cgroups
+ *  2008 Rework of the scheduler domains and CPU hotplug handling
+ *       by Max Krasnyansky
 *
 *  This file is subject to the terms and conditions of the GNU General Public
 *  License.  See the file COPYING in the main directory of the Linux
@@ -236,9 +238,11 @@ static struct cpuset top_cpuset = {
 static DEFINE_MUTEX(callback_mutex);
-/* This is ugly, but preserves the userspace API for existing cpuset
+/*
+ * This is ugly, but preserves the userspace API for existing cpuset
 * users. If someone tries to mount the "cpuset" filesystem, we
- * silently switch it to mount "cgroup" instead */
+ * silently switch it to mount "cgroup" instead
+ */
 static int cpuset_get_sb(struct file_system_type *fs_type,
                         int flags, const char *unused_dev_name,
                         void *data, struct vfsmount *mnt)
@@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 /*
- * Helper routine for rebuild_sched_domains().
+ * Helper routine for generate_sched_domains().
 * Do cpusets a, b have overlapping cpus_allowed masks?
 */
 static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 {
        return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 }
 /*
- * rebuild_sched_domains()
+ * generate_sched_domains()
- *
+ *
- * This routine will be called to rebuild the scheduler's dynamic
+ * This function builds a partial partition of the system's CPUs.
- * sched domains:
+ * A 'partial partition' is a set of non-overlapping subsets whose
- * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ * union is a subset of that set (here, the set of the system's CPUs).
- *   'cpus' changes,
+ * The output of this function needs to be passed to the kernel/sched.c
- * - or if the 'cpus' allowed changes in any cpuset which has that
+ * partition_sched_domains() routine, which will rebuild the scheduler's
- *   flag enabled,
+ * load balancing domains (sched domains) as specified by that partial
- * - or if the 'sched_relax_domain_level' of any cpuset which has
+ * partition.
- *   that flag enabled and with non-empty 'cpus' changes,
- * - or if any cpuset with non-empty 'cpus' is removed,
- * - or if a cpu gets offlined.
- *
- * This routine builds a partial partition of the systems CPUs
- * (the set of non-overlappping cpumask_t's in the array 'part'
- * below), and passes that partial partition to the kernel/sched.c
- * partition_sched_domains() routine, which will rebuild the
- * schedulers load balancing domains (sched domains) as specified
- * by that partial partition.  A 'partial partition' is a set of
- * non-overlapping subsets whose union is a subset of that set.
 *
 * See "What is sched_load_balance" in Documentation/cpusets.txt
 * for a background explanation of this.
@@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 * domains when operating in the severe memory shortage situations
 * that could cause allocation failures below.
 *
- * Call with cgroup_mutex held.  May take callback_mutex during
+ * Must be called with cgroup_lock held.
- * call due to the kfifo_alloc() and kmalloc() calls.  May nest
- * a call to the get_online_cpus()/put_online_cpus() pair.
- * Must not be called holding callback_mutex, because we must not
- * call get_online_cpus() while holding callback_mutex.  Elsewhere
- * the kernel nests callback_mutex inside get_online_cpus() calls.
- * So the reverse nesting would risk an ABBA deadlock.
 *
 * The three key local variables below are:
 *    q  - a linked-list queue of cpuset pointers, used to implement a
@@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
 *      element of the partition (one sched domain) to be passed to
 *      partition_sched_domains().
 */
+static int generate_sched_domains(cpumask_t **domains,
-void rebuild_sched_domains(void)
+                        struct sched_domain_attr **attributes)
 {
-        LIST_HEAD(q);           /* queue of cpusets to be scanned*/
+        LIST_HEAD(q);           /* queue of cpusets to be scanned */
        struct cpuset *cp;      /* scans q */
        struct cpuset **csa;    /* array of all cpuset ptrs */
        int csn;                /* how many cpuset ptrs in csa so far */
@@ -601,23 +587,26 @@ void rebuild_sched_domains(void)
        int ndoms;              /* number of sched domains in result */
        int nslot;              /* next empty doms[] cpumask_t slot */
-        csa = NULL;
+        ndoms = 0;
        doms = NULL;
        dattr = NULL;
+        csa = NULL;
        /* Special case for the 99% of systems with one, full, sched domain */
        if (is_sched_load_balance(&top_cpuset)) {
-                ndoms = 1;
                doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
                if (!doms)
-                        goto rebuild;
+                        goto done;
                dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
                if (dattr) {
                        *dattr = SD_ATTR_INIT;
                        update_domain_attr_tree(dattr, &top_cpuset);
                }
                *doms = top_cpuset.cpus_allowed;
-                goto rebuild;
+                ndoms = 1;
+                goto done;
        }
        csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
@@ -680,61 +669,141 @@ restart:
                }
        }
-        /* Convert <csn, csa> to <ndoms, doms> */
+        /*
+         * Now we know how many domains to create.
+         * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
+         */
        doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
-        if (!doms)
+        if (!doms) {
-                goto rebuild;
+                ndoms = 0;
+                goto done;
+        }
+        /*
+         * The rest of the code, including the scheduler, can deal with
+         * dattr==NULL case. No need to abort if alloc fails.
+         */
        dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
        for (nslot = 0, i = 0; i < csn; i++) {
                struct cpuset *a = csa[i];
+                cpumask_t *dp;
                int apn = a->pn;
-                if (apn >= 0) {
+                if (apn < 0) {
-                        cpumask_t *dp = doms + nslot;
+                        /* Skip completed partitions */
+                        continue;
-                        if (nslot == ndoms) {
+                }
-                                static int warnings = 10;
-                                if (warnings) {
+                dp = doms + nslot;
-                                        printk(KERN_WARNING
-                                         "rebuild_sched_domains confused:"
+                if (nslot == ndoms) {
-                                          " nslot %d, ndoms %d, csn %d, i %d,"
+                        static int warnings = 10;
-                                          " apn %d\n",
+                        if (warnings) {
-                                          nslot, ndoms, csn, i, apn);
+                                printk(KERN_WARNING
-                                        warnings--;
+                                 "rebuild_sched_domains confused:"
-                                }
+                                  " nslot %d, ndoms %d, csn %d, i %d,"
-                                continue;
+                                  " apn %d\n",
+                                  nslot, ndoms, csn, i, apn);
+                                warnings--;
                        }
+                        continue;
+                }
-                        cpus_clear(*dp);
+                cpus_clear(*dp);
-                        if (dattr)
+                if (dattr)
-                                *(dattr + nslot) = SD_ATTR_INIT;
+                        *(dattr + nslot) = SD_ATTR_INIT;
-                        for (j = i; j < csn; j++) {
+                for (j = i; j < csn; j++) {
-                                struct cpuset *b = csa[j];
+                        struct cpuset *b = csa[j];
-                                if (apn == b->pn) {
+                        if (apn == b->pn) {
-                                        cpus_or(*dp, *dp, b->cpus_allowed);
+                                cpus_or(*dp, *dp, b->cpus_allowed);
-                                        b->pn = -1;
+                                if (dattr)
-                                        if (dattr)
+                                        update_domain_attr_tree(dattr + nslot, b);
-                                                update_domain_attr_tree(dattr
-                                                                   + nslot, b);
+                                /* Done with this partition */
-                                }
+                                b->pn = -1;
                        }
-                        nslot++;
                }
+                nslot++;
        }
        BUG_ON(nslot != ndoms);
-rebuild:
+done:
-        /* Have scheduler rebuild sched domains */
+        kfree(csa);
+        *domains    = doms;
+        *attributes = dattr;
+        return ndoms;
+}
+/*
+ * Rebuild scheduler domains.
+ *
+ * Must be called without cgroup_mutex held and outside of any
+ * get_online_cpus() section: this routine takes both itself.
+ *
+ * Cannot be directly called from cpuset code handling changes
+ * to the cpuset pseudo-filesystem, because it cannot be called
+ * from code that already holds cgroup_mutex.
+ */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        get_online_cpus();
-        partition_sched_domains(ndoms, doms, dattr);
+        /* Generate domain masks and attrs */
+        cgroup_lock();
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        put_online_cpus();
+}
-done:
+static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
-        kfree(csa);
-        /* Don't kfree(doms) -- partition_sched_domains() does that. */
+/*
-        /* Don't kfree(dattr) -- partition_sched_domains() does that. */
+ * Rebuild scheduler domains, asynchronously via workqueue.
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if the 'sched_relax_domain_level'
+ * of such a cpuset changes, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * The rebuild_sched_domains() and partition_sched_domains()
+ * routines must nest cgroup_lock() inside get_online_cpus(),
+ * but such cpuset changes as these must nest that locking the
+ * other way, holding cgroup_lock() for much of the code.
+ *
+ * So in order to avoid an ABBA deadlock, the cpuset code handling
+ * these user changes delegates the actual sched domain rebuilding
+ * to a separate workqueue thread, which ends up processing the
+ * above do_rebuild_sched_domains() function.
+ */
+static void async_rebuild_sched_domains(void)
+{
+        schedule_work(&rebuild_sched_domains_work);
+}
+/*
+ * Accomplishes the same scheduler domain rebuild as the above
+ * async_rebuild_sched_domains(), however it directly calls the
+ * rebuild routine synchronously rather than calling it via an
+ * asynchronous work thread.
+ *
+ * This can only be called from code that is not holding
+ * cgroup_mutex (i.e. not nested inside a cgroup_lock() call).
+ */
+void rebuild_sched_domains(void)
+{
+        do_rebuild_sched_domains(NULL);
 }
 /**
@@ -774,37 +843,25 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 /**
 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
 *
 * Called with cgroup_mutex held
 *
 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
 * calling callback functions for each.
 *
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
 */
-static int update_tasks_cpumask(struct cpuset *cs)
+static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 {
        struct cgroup_scanner scan;
-        struct ptr_heap heap;
-        int retval;
-        /*
-         * cgroup_scan_tasks() will initialize heap->gt for us.
-         * heap_init() is still needed here for we should not change
-         * cs->cpus_allowed when heap_init() fails.
-         */
-        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
-        if (retval)
-                return retval;
        scan.cg = cs->css.cgroup;
        scan.test_task = cpuset_test_cpumask;
        scan.process_task = cpuset_change_cpumask;
-        scan.heap = &heap;
+        scan.heap = heap;
-        retval = cgroup_scan_tasks(&scan);
+        cgroup_scan_tasks(&scan);
-        heap_free(&heap);
-        return retval;
 }
 /**
@@ -814,6 +871,7 @@ static int update_tasks_cpumask(struct cpuset *cs)
 */
 static int update_cpumask(struct cpuset *cs, const char *buf)
 {
+        struct ptr_heap heap;
        struct cpuset trialcs;
        int retval;
        int is_load_balanced;
@@ -848,6 +906,10 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
        if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
                return 0;
+        retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+        if (retval)
+                return retval;
        is_load_balanced = is_sched_load_balance(&trialcs);
        mutex_lock(&callback_mutex);
@@ -858,12 +920,12 @@ static int update_cpumask(struct cpuset *cs, const char *buf)
         * Scan tasks in the cpuset, and update the cpumasks of any
         * that need an update.
         */
-        retval = update_tasks_cpumask(cs);
+        update_tasks_cpumask(cs, &heap);
-        if (retval < 0)
-                return retval;
+        heap_free(&heap);
        if (is_load_balanced)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1090,7 +1152,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
        if (val != cs->relax_domain_level) {
                cs->relax_domain_level = val;
                if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
-                        rebuild_sched_domains();
+                        async_rebuild_sched_domains();
        }
        return 0;
@@ -1131,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
        mutex_unlock(&callback_mutex);
        if (cpus_nonempty && balance_flag_changed)
-                rebuild_sched_domains();
+                async_rebuild_sched_domains();
        return 0;
 }
@@ -1492,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
 static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1504,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
        default:
                BUG();
        }
+        /* Unreachable but makes gcc happy */
+        return 0;
 }
@@ -1692,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
 }
 /*
- * Locking note on the strange update_flag() call below:
- *
 * If the cpuset being removed has its flag 'sched_load_balance'
 * enabled, then simulate turning sched_load_balance off, which
- * will call rebuild_sched_domains().  The get_online_cpus()
+ * will call async_rebuild_sched_domains().
- * call in rebuild_sched_domains() must not be made while holding
- * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
- * get_online_cpus() calls.  So the reverse nesting would risk an
- * ABBA deadlock.
 */
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1719,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 struct cgroup_subsys cpuset_subsys = {
        .name = "cpuset",
        .create = cpuset_create,
-        .destroy  = cpuset_destroy,
+        .destroy = cpuset_destroy,
        .can_attach = cpuset_can_attach,
        .attach = cpuset_attach,
        .populate = cpuset_populate,
@@ -1811,7 +1873,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 }
 /*
- * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
+ * If CPU and/or memory hotplug handlers, below, unplug any CPUs
 * or memory nodes, we need to walk over the cpuset hierarchy,
 * removing that CPU or node from all cpusets.  If this removes the
 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1859,7 +1921,7 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
 * that has tasks along with an empty 'mems'.  But if we did see such
 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
 */
-static void scan_for_empty_cpusets(const struct cpuset *root)
+static void scan_for_empty_cpusets(struct cpuset *root)
 {
        LIST_HEAD(queue);
        struct cpuset *cp;      /* scans cpusets being updated */
@@ -1896,42 +1958,13 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
                     nodes_empty(cp->mems_allowed))
                        remove_tasks_in_empty_cpuset(cp);
                else {
-                        update_tasks_cpumask(cp);
+                        update_tasks_cpumask(cp, NULL);
                        update_tasks_nodemask(cp, &oldmems);
                }
        }
 }
 /*
- * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
- * cpu_online_map and node_states[N_HIGH_MEMORY].  Force the top cpuset to
- * track what's online after any CPU or memory node hotplug or unplug event.
- *
- * Since there are two callers of this routine, one for CPU hotplug
- * events and one for memory node hotplug events, we could have coded
- * two separate routines here.  We code it as a single common routine
- * in order to minimize text size.
- */
-static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
-{
-        cgroup_lock();
-        top_cpuset.cpus_allowed = cpu_online_map;
-        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        scan_for_empty_cpusets(&top_cpuset);
-        /*
-         * Scheduler destroys domains on hotplug events.
-         * Rebuild them based on the current settings.
-         */
-        if (rebuild_sd)
-                rebuild_sched_domains();
-        cgroup_unlock();
-}
-/*
 * The top_cpuset tracks what CPUs and Memory Nodes are online,
 * period.  This is necessary in order to make cpusets transparent
 * (of no affect) on systems that are actively using CPU hotplug
@@ -1939,40 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
 *
 * This routine ensures that top_cpuset.cpus_allowed tracks
 * cpu_online_map on each CPU hotplug (cpuhp) event.
+ *
+ * Called within get_online_cpus().  Needs to call cgroup_lock()
+ * before calling generate_sched_domains().
 */
+static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
-static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
                                unsigned long phase, void *unused_cpu)
 {
+        struct sched_domain_attr *attr;
+        cpumask_t *doms;
+        int ndoms;
        switch (phase) {
-        case CPU_UP_CANCELED:
-        case CPU_UP_CANCELED_FROZEN:
-        case CPU_DOWN_FAILED:
-        case CPU_DOWN_FAILED_FROZEN:
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
-                common_cpu_mem_hotplug_unplug(1);
                break;
        default:
                return NOTIFY_DONE;
        }
+        cgroup_lock();
+        top_cpuset.cpus_allowed = cpu_online_map;
+        scan_for_empty_cpusets(&top_cpuset);
+        ndoms = generate_sched_domains(&doms, &attr);
+        cgroup_unlock();
+        /* Have scheduler rebuild the domains */
+        partition_sched_domains(ndoms, doms, attr);
        return NOTIFY_OK;
 }
 #ifdef CONFIG_MEMORY_HOTPLUG
 /*
 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
- * Call this routine anytime after you change
+ * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
- * node_states[N_HIGH_MEMORY].
+ * See also the previous routine cpuset_track_online_cpus().
- * See also the previous routine cpuset_handle_cpuhp().
 */
 void cpuset_track_online_nodes(void)
 {
-        common_cpu_mem_hotplug_unplug(0);
+        cgroup_lock();
+        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
+        scan_for_empty_cpusets(&top_cpuset);
+        cgroup_unlock();
 }
 #endif
@@ -1987,7 +2032,7 @@ void __init cpuset_init_smp(void)
        top_cpuset.cpus_allowed = cpu_online_map;
        top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-        hotcpu_notifier(cpuset_handle_cpuhp, 0);
+        hotcpu_notifier(cpuset_track_online_cpus, 0);
 }
 /**