Diffstat (limited to 'kernel')
 -rw-r--r--  kernel/dma.c      |  10
 -rw-r--r--  kernel/kallsyms.c | 123
 -rw-r--r--  kernel/module.c   |   3
 -rw-r--r--  kernel/resource.c |  83
 -rw-r--r--  kernel/sched.c    | 321
 5 files changed, 335 insertions, 205 deletions
diff --git a/kernel/dma.c b/kernel/dma.c
index aef0a45b78..2020644c93 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -62,6 +62,11 @@ static struct dma_chan dma_chan_busy[MAX_DMA_CHANNELS] = {
 };
 
 
+/**
+ * request_dma - request and reserve a system DMA channel
+ * @dmanr: DMA channel number
+ * @device_id: reserving device ID string, used in /proc/dma
+ */
 int request_dma(unsigned int dmanr, const char * device_id)
 {
 	if (dmanr >= MAX_DMA_CHANNELS)
@@ -76,7 +81,10 @@ int request_dma(unsigned int dmanr, const char * device_id)
 	return 0;
 } /* request_dma */
 
-
+/**
+ * free_dma - free a reserved system DMA channel
+ * @dmanr: DMA channel number
+ */
 void free_dma(unsigned int dmanr)
 {
 	if (dmanr >= MAX_DMA_CHANNELS) {
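The new kernel-doc above covers both halves of the ISA DMA channel API. Purely as an illustration (the channel number, ID string and error-code comment are assumptions, not taken from this patch), a driver-style caller would look roughly like:

#include <linux/errno.h>
#include <asm/dma.h>

static int example_claim_dma(void)
{
	int err;

	err = request_dma(3, "example-driver");	/* listed in /proc/dma */
	if (err)
		return err;	/* typically -EINVAL or -EBUSY */

	/* ... program and use the channel ... */

	free_dma(3);
	return 0;
}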
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 342bca62c4..eeac3e313b 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -69,6 +69,15 @@ static inline int is_kernel(unsigned long addr)
 	return in_gate_area_no_task(addr);
 }
 
+static int is_ksym_addr(unsigned long addr)
+{
+	if (all_var)
+		return is_kernel(addr);
+
+	return is_kernel_text(addr) || is_kernel_inittext(addr) ||
+		is_kernel_extratext(addr);
+}
+
 /* expand a compressed symbol data into the resulting uncompressed string,
    given the offset to where the symbol is in the compressed stream */
 static unsigned int kallsyms_expand_symbol(unsigned int off, char *result)
@@ -155,6 +164,73 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+static unsigned long get_symbol_pos(unsigned long addr,
+				    unsigned long *symbolsize,
+				    unsigned long *offset)
+{
+	unsigned long symbol_start = 0, symbol_end = 0;
+	unsigned long i, low, high, mid;
+
+	/* This kernel should never had been booted. */
+	BUG_ON(!kallsyms_addresses);
+
+	/* do a binary search on the sorted kallsyms_addresses array */
+	low = 0;
+	high = kallsyms_num_syms;
+
+	while (high - low > 1) {
+		mid = (low + high) / 2;
+		if (kallsyms_addresses[mid] <= addr)
+			low = mid;
+		else
+			high = mid;
+	}
+
+	/*
+	 * search for the first aliased symbol. Aliased
+	 * symbols are symbols with the same address
+	 */
+	while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low])
+		--low;
+
+	symbol_start = kallsyms_addresses[low];
+
+	/* Search for next non-aliased symbol */
+	for (i = low + 1; i < kallsyms_num_syms; i++) {
+		if (kallsyms_addresses[i] > symbol_start) {
+			symbol_end = kallsyms_addresses[i];
+			break;
+		}
+	}
+
+	/* if we found no next symbol, we use the end of the section */
+	if (!symbol_end) {
+		if (is_kernel_inittext(addr))
+			symbol_end = (unsigned long)_einittext;
+		else if (all_var)
+			symbol_end = (unsigned long)_end;
+		else
+			symbol_end = (unsigned long)_etext;
+	}
+
+	*symbolsize = symbol_end - symbol_start;
+	*offset = addr - symbol_start;
+
+	return low;
+}
+
+/*
+ * Lookup an address but don't bother to find any names.
+ */
+int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
+				unsigned long *offset)
+{
+	if (is_ksym_addr(addr))
+		return !!get_symbol_pos(addr, symbolsize, offset);
+
+	return !!module_address_lookup(addr, symbolsize, offset, NULL);
+}
+
 /*
  * Lookup an address
  * - modname is set to NULL if it's in the kernel
@@ -167,57 +243,18 @@ const char *kallsyms_lookup(unsigned long addr,
 			    unsigned long *offset,
 			    char **modname, char *namebuf)
 {
-	unsigned long i, low, high, mid;
 	const char *msym;
 
-	/* This kernel should never had been booted. */
-	BUG_ON(!kallsyms_addresses);
-
 	namebuf[KSYM_NAME_LEN] = 0;
 	namebuf[0] = 0;
 
-	if ((all_var && is_kernel(addr)) ||
-	    (!all_var && (is_kernel_text(addr) || is_kernel_inittext(addr) ||
-				is_kernel_extratext(addr)))) {
-		unsigned long symbol_end = 0;
-
-		/* do a binary search on the sorted kallsyms_addresses array */
-		low = 0;
-		high = kallsyms_num_syms;
-
-		while (high-low > 1) {
-			mid = (low + high) / 2;
-			if (kallsyms_addresses[mid] <= addr) low = mid;
-			else high = mid;
-		}
-
-		/* search for the first aliased symbol. Aliased symbols are
-		   symbols with the same address */
-		while (low && kallsyms_addresses[low - 1] == kallsyms_addresses[low])
-			--low;
+	if (is_ksym_addr(addr)) {
+		unsigned long pos;
 
+		pos = get_symbol_pos(addr, symbolsize, offset);
 		/* Grab name */
-		kallsyms_expand_symbol(get_symbol_offset(low), namebuf);
-
-		/* Search for next non-aliased symbol */
-		for (i = low + 1; i < kallsyms_num_syms; i++) {
-			if (kallsyms_addresses[i] > kallsyms_addresses[low]) {
-				symbol_end = kallsyms_addresses[i];
-				break;
-			}
-		}
-
-		/* if we found no next symbol, we use the end of the section */
-		if (!symbol_end) {
-			if (is_kernel_inittext(addr))
-				symbol_end = (unsigned long)_einittext;
-			else
-				symbol_end = all_var ? (unsigned long)_end : (unsigned long)_etext;
-		}
-
-		*symbolsize = symbol_end - kallsyms_addresses[low];
+		kallsyms_expand_symbol(get_symbol_offset(pos), namebuf);
 		*modname = NULL;
-		*offset = addr - kallsyms_addresses[low];
 		return namebuf;
 	}
 
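The new kallsyms_lookup_size_offset() gives callers the size of the enclosing symbol and the offset into it without expanding the symbol name, which is all the factored-out get_symbol_pos() computes. A rough usage sketch follows; it assumes the matching declaration is exported through <linux/kallsyms.h>, which sits outside this kernel/-only diffstat:

#include <linux/kernel.h>
#include <linux/kallsyms.h>

static void example_report(unsigned long addr)
{
	unsigned long size, offset;

	if (kallsyms_lookup_size_offset(addr, &size, &offset))
		printk(KERN_DEBUG "addr is %lu bytes into a %lu byte symbol\n",
		       offset, size);
	else
		printk(KERN_DEBUG "addr not in any known symbol\n");
}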
diff --git a/kernel/module.c b/kernel/module.c
index 7c77a0a927..7f60e782de 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2040,7 +2040,8 @@ const char *module_address_lookup(unsigned long addr,
 	list_for_each_entry(mod, &modules, list) {
 		if (within(addr, mod->module_init, mod->init_size)
 		    || within(addr, mod->module_core, mod->core_size)) {
-			*modname = mod->name;
+			if (modname)
+				*modname = mod->name;
 			return get_ksymbol(mod, addr, size, offset);
 		}
 	}
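With this guard, modname becomes an optional output, which is exactly what the new kallsyms_lookup_size_offset() relies on when it passes NULL. A minimal, hypothetical caller that only cares whether an address falls inside a module's text:

static int example_addr_in_module(unsigned long addr)
{
	unsigned long size, offset;

	/* Passing NULL for modname is safe once the check above is applied. */
	return module_address_lookup(addr, &size, &offset, NULL) != NULL;
}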
diff --git a/kernel/resource.c b/kernel/resource.c
index 9db38a1a75..6de60c1214 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -193,6 +193,13 @@ static int __release_resource(struct resource *old)
 	return -EINVAL;
 }
 
+/**
+ * request_resource - request and reserve an I/O or memory resource
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns 0 for success, negative error code on error.
+ */
 int request_resource(struct resource *root, struct resource *new)
 {
 	struct resource *conflict;
@@ -205,6 +212,15 @@ int request_resource(struct resource *root, struct resource *new)
 
 EXPORT_SYMBOL(request_resource);
 
+/**
+ * ____request_resource - reserve a resource, with resource conflict returned
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ *
+ * Returns:
+ * On success, NULL is returned.
+ * On error, a pointer to the conflicting resource is returned.
+ */
 struct resource *____request_resource(struct resource *root, struct resource *new)
 {
 	struct resource *conflict;
@@ -217,6 +233,10 @@ struct resource *____request_resource(struct resource *root, struct resource *ne
 
 EXPORT_SYMBOL(____request_resource);
 
+/**
+ * release_resource - release a previously reserved resource
+ * @old: resource pointer
+ */
 int release_resource(struct resource *old)
 {
 	int retval;
@@ -315,8 +335,16 @@ static int find_resource(struct resource *root, struct resource *new,
 	return -EBUSY;
 }
 
-/*
- * Allocate empty slot in the resource tree given range and alignment.
+/**
+ * allocate_resource - allocate empty slot in the resource tree given range & alignment
+ * @root: root resource descriptor
+ * @new: resource descriptor desired by caller
+ * @size: requested resource region size
+ * @min: minimum size to allocate
+ * @max: maximum size to allocate
+ * @align: alignment requested, in bytes
+ * @alignf: alignment function, optional, called if not NULL
+ * @alignf_data: arbitrary data to pass to the @alignf function
  */
 int allocate_resource(struct resource *root, struct resource *new,
 		      resource_size_t size, resource_size_t min,
@@ -407,10 +435,15 @@ int insert_resource(struct resource *parent, struct resource *new)
 	return result;
 }
 
-/*
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
  * Given an existing resource, change its start and size to match the
- * arguments. Returns -EBUSY if it can't fit. Existing children of
- * the resource are assumed to be immutable.
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
  */
 int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
 {
@@ -456,11 +489,19 @@ EXPORT_SYMBOL(adjust_resource);
  * Note how this, unlike the above, knows about
  * the IO flag meanings (busy etc).
  *
- * Request-region creates a new busy region.
+ * request_region creates a new busy region.
  *
- * Check-region returns non-zero if the area is already busy
+ * check_region returns non-zero if the area is already busy.
  *
- * Release-region releases a matching busy region.
+ * release_region releases a matching busy region.
+ */
+
+/**
+ * __request_region - create a new busy resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ * @name: reserving caller's ID string
  */
 struct resource * __request_region(struct resource *parent,
 				   resource_size_t start, resource_size_t n,
@@ -497,9 +538,23 @@ struct resource * __request_region(struct resource *parent,
 	}
 	return res;
 }
-
 EXPORT_SYMBOL(__request_region);
 
+/**
+ * __check_region - check if a resource region is busy or free
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * Returns 0 if the region is free at the moment it is checked,
+ * returns %-EBUSY if the region is busy.
+ *
+ * NOTE:
+ * This function is deprecated because its use is racy.
+ * Even if it returns 0, a subsequent call to request_region()
+ * may fail because another driver etc. just allocated the region.
+ * Do NOT use it.  It will be removed from the kernel.
+ */
 int __check_region(struct resource *parent, resource_size_t start,
 		resource_size_t n)
 {
@@ -513,9 +568,16 @@ int __check_region(struct resource *parent, resource_size_t start,
 	kfree(res);
 	return 0;
 }
-
 EXPORT_SYMBOL(__check_region);
 
+/**
+ * __release_region - release a previously reserved resource region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @n: resource region size
+ *
+ * The described resource region must match a currently busy region.
+ */
 void __release_region(struct resource *parent, resource_size_t start,
 			resource_size_t n)
 {
@@ -553,7 +615,6 @@ void __release_region(struct resource *parent, resource_size_t start,
 			"<%016llx-%016llx>\n", (unsigned long long)start,
 			(unsigned long long)end);
 }
-
 EXPORT_SYMBOL(__release_region);
 
 /*
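For reference, the busy-region API documented above is normally reached through the request_region()/release_region() wrappers; a minimal illustrative I/O-port reservation (the base address, extent and name are invented for the example) might look like:

#include <linux/errno.h>
#include <linux/ioport.h>

#define EX_BASE		0x220	/* hypothetical I/O base */
#define EX_EXTENT	16

static int example_claim_ports(void)
{
	if (!request_region(EX_BASE, EX_EXTENT, "example"))
		return -EBUSY;		/* somebody already owns the range */

	/* ... talk to the hardware ... */

	release_region(EX_BASE, EX_EXTENT);
	return 0;
}

As the new __check_region() comment stresses, checking first and requesting later is racy, so new code should call request_region() directly and handle failure rather than probe with check_region().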
diff --git a/kernel/sched.c b/kernel/sched.c
index e4e54e86f4..53608a59d6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1232,7 +1232,7 @@ nextgroup:
 }
 
 /*
- * find_idlest_queue - find the idlest runqueue among the cpus in group.
+ * find_idlest_cpu - find the idlest cpu among the cpus in group.
  */
 static int
 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -1286,21 +1286,29 @@ static int sched_balance_self(int cpu, int flag)
 	while (sd) {
 		cpumask_t span;
 		struct sched_group *group;
-		int new_cpu;
-		int weight;
+		int new_cpu, weight;
+
+		if (!(sd->flags & flag)) {
+			sd = sd->child;
+			continue;
+		}
 
 		span = sd->span;
 		group = find_idlest_group(sd, t, cpu);
-		if (!group)
-			goto nextlevel;
+		if (!group) {
+			sd = sd->child;
+			continue;
+		}
 
 		new_cpu = find_idlest_cpu(group, t, cpu);
-		if (new_cpu == -1 || new_cpu == cpu)
-			goto nextlevel;
+		if (new_cpu == -1 || new_cpu == cpu) {
+			/* Now try balancing at a lower domain level of cpu */
+			sd = sd->child;
+			continue;
+		}
 
-		/* Now try balancing at a lower domain level */
+		/* Now try balancing at a lower domain level of new_cpu */
 		cpu = new_cpu;
-nextlevel:
 		sd = NULL;
 		weight = cpus_weight(span);
 		for_each_domain(cpu, tmp) {
@@ -2533,8 +2541,14 @@ static int load_balance(int this_cpu, struct rq *this_rq,
 	struct rq *busiest;
 	cpumask_t cpus = CPU_MASK_ALL;
 
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
 	if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[idle]);
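The sched_smt_power_savings tests in these hunks are replaced by test_sd_parent(sd, SD_POWERSAVINGS_BALANCE). That helper is not defined in this file; presumably it arrives with the rest of the series in include/linux/sched.h. A plausible definition, consistent only with how the hunks here use it, would be something like:

/* Assumed sketch: true if the parent domain exists and has @flag set. */
#define test_sd_parent(sd, flag)	((sd)->parent && \
					 ((sd)->parent->flags & (flag)) ? 1 : 0)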
@@ -2630,7 +2644,7 @@ redo:
 	}
 
 	if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return nr_moved;
 
@@ -2646,7 +2660,7 @@ out_one_pinned:
 		sd->balance_interval *= 2;
 
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	return 0;
 }
@@ -2668,7 +2682,14 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 	int sd_idle = 0;
 	cpumask_t cpus = CPU_MASK_ALL;
 
-	if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
+	/*
+	 * When power savings policy is enabled for the parent domain, idle
+	 * sibling can pick up load irrespective of busy siblings. In this case,
+	 * let the state of idle sibling percolate up as IDLE, instead of
+	 * portraying it as NOT_IDLE.
+	 */
+	if (sd->flags & SD_SHARE_CPUPOWER &&
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		sd_idle = 1;
 
 	schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2709,7 +2730,8 @@ redo:
 
 	if (!nr_moved) {
 		schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+		if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
+		    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 			return -1;
 	} else
 		sd->nr_balance_failed = 0;
@@ -2719,7 +2741,7 @@ out_balanced:
 
 	schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
 	if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
-	    !sched_smt_power_savings)
+	    !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
 		return -1;
 	sd->nr_balance_failed = 0;
 
@@ -4817,7 +4839,7 @@ void show_state(void)
  * NOTE: this function does not set the idle thread's NEED_RESCHED
  * flag, to make booting more robust.
  */
-void __devinit init_idle(struct task_struct *idle, int cpu)
+void __cpuinit init_idle(struct task_struct *idle, int cpu)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;
@@ -5392,7 +5414,9 @@ static int sd_degenerate(struct sched_domain *sd)
 	if (sd->flags & (SD_LOAD_BALANCE |
 			 SD_BALANCE_NEWIDLE |
 			 SD_BALANCE_FORK |
-			 SD_BALANCE_EXEC)) {
+			 SD_BALANCE_EXEC |
+			 SD_SHARE_CPUPOWER |
+			 SD_SHARE_PKG_RESOURCES)) {
 		if (sd->groups != sd->groups->next)
 			return 0;
 	}
@@ -5426,7 +5450,9 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
 		pflags &= ~(SD_LOAD_BALANCE |
 			    SD_BALANCE_NEWIDLE |
 			    SD_BALANCE_FORK |
-			    SD_BALANCE_EXEC);
+			    SD_BALANCE_EXEC |
+			    SD_SHARE_CPUPOWER |
+			    SD_SHARE_PKG_RESOURCES);
 	}
 	if (~cflags & pflags)
 		return 0;
@@ -5448,12 +5474,18 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 		struct sched_domain *parent = tmp->parent;
 		if (!parent)
 			break;
-		if (sd_parent_degenerate(tmp, parent))
+		if (sd_parent_degenerate(tmp, parent)) {
 			tmp->parent = parent->parent;
+			if (parent->parent)
+				parent->parent->child = tmp;
+		}
 	}
 
-	if (sd && sd_degenerate(sd))
+	if (sd && sd_degenerate(sd)) {
 		sd = sd->parent;
+		if (sd)
+			sd->child = NULL;
+	}
 
 	sched_domain_debug(sd, cpu);
 
@@ -5461,7 +5493,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
 }
 
 /* cpus with isolated domains */
-static cpumask_t __devinitdata cpu_isolated_map = CPU_MASK_NONE;
+static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE;
 
 /* Setup the mask of cpus configured for isolated domains */
 static int __init isolated_cpu_setup(char *str)
@@ -5489,15 +5521,17 @@ __setup ("isolcpus=", isolated_cpu_setup);
 * covered by the given span, and will set each group's ->cpumask correctly,
 * and ->cpu_power to 0.
 */
-static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
-				    int (*group_fn)(int cpu))
+static void
+init_sched_build_groups(struct sched_group groups[], cpumask_t span,
+			const cpumask_t *cpu_map,
+			int (*group_fn)(int cpu, const cpumask_t *cpu_map))
 {
 	struct sched_group *first = NULL, *last = NULL;
 	cpumask_t covered = CPU_MASK_NONE;
 	int i;
 
 	for_each_cpu_mask(i, span) {
-		int group = group_fn(i);
+		int group = group_fn(i, cpu_map);
 		struct sched_group *sg = &groups[group];
 		int j;
 
@@ -5508,7 +5542,7 @@ static void init_sched_build_groups(struct sched_group groups[], cpumask_t span,
 		sg->cpu_power = 0;
 
 		for_each_cpu_mask(j, span) {
-			if (group_fn(j) != group)
+			if (group_fn(j, cpu_map) != group)
 				continue;
 
 			cpu_set(j, covered);
@@ -5975,13 +6009,15 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map)
 #endif
 	);
 	if (system_state == SYSTEM_BOOTING) {
-		printk("migration_cost=");
-		for (distance = 0; distance <= max_distance; distance++) {
-			if (distance)
-				printk(",");
-			printk("%ld", (long)migration_cost[distance] / 1000);
+		if (num_online_cpus() > 1) {
+			printk("migration_cost=");
+			for (distance = 0; distance <= max_distance; distance++) {
+				if (distance)
+					printk(",");
+				printk("%ld", (long)migration_cost[distance] / 1000);
+			}
+			printk("\n");
 		}
-		printk("\n");
 	}
 	j1 = jiffies;
 	if (migration_debug)
@@ -6084,7 +6120,7 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
 static struct sched_group sched_group_cpus[NR_CPUS];
 
-static int cpu_to_cpu_group(int cpu)
+static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu;
 }
@@ -6095,31 +6131,36 @@ static int cpu_to_cpu_group(int cpu)
 */
 #ifdef CONFIG_SCHED_MC
 static DEFINE_PER_CPU(struct sched_domain, core_domains);
-static struct sched_group *sched_group_core_bycpu[NR_CPUS];
+static struct sched_group sched_group_core[NR_CPUS];
 #endif
 
 #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
-	return first_cpu(cpu_sibling_map[cpu]);
+	cpumask_t mask = cpu_sibling_map[cpu];
+	cpus_and(mask, mask, *cpu_map);
+	return first_cpu(mask);
 }
 #elif defined(CONFIG_SCHED_MC)
-static int cpu_to_core_group(int cpu)
+static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu;
 }
 #endif
 
 static DEFINE_PER_CPU(struct sched_domain, phys_domains);
-static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
+static struct sched_group sched_group_phys[NR_CPUS];
 
-static int cpu_to_phys_group(int cpu)
+static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map)
 {
 #ifdef CONFIG_SCHED_MC
 	cpumask_t mask = cpu_coregroup_map(cpu);
+	cpus_and(mask, mask, *cpu_map);
 	return first_cpu(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	return first_cpu(cpu_sibling_map[cpu]);
+	cpumask_t mask = cpu_sibling_map[cpu];
+	cpus_and(mask, mask, *cpu_map);
+	return first_cpu(mask);
 #else
 	return cpu;
 #endif
@@ -6137,7 +6178,7 @@ static struct sched_group **sched_group_nodes_bycpu[NR_CPUS];
 static DEFINE_PER_CPU(struct sched_domain, allnodes_domains);
 static struct sched_group *sched_group_allnodes_bycpu[NR_CPUS];
 
-static int cpu_to_allnodes_group(int cpu)
+static int cpu_to_allnodes_group(int cpu, const cpumask_t *cpu_map)
 {
 	return cpu_to_node(cpu);
 }
@@ -6169,12 +6210,11 @@ next_sg:
 }
 #endif
 
+#ifdef CONFIG_NUMA
 /* Free memory allocated for various sched_group structures */
 static void free_sched_groups(const cpumask_t *cpu_map)
 {
-	int cpu;
-#ifdef CONFIG_NUMA
-	int i;
+	int cpu, i;
 
 	for_each_cpu_mask(cpu, *cpu_map) {
 		struct sched_group *sched_group_allnodes
@@ -6211,19 +6251,63 @@ next_sg:
 		kfree(sched_group_nodes);
 		sched_group_nodes_bycpu[cpu] = NULL;
 	}
+}
+#else
+static void free_sched_groups(const cpumask_t *cpu_map)
+{
+}
 #endif
-	for_each_cpu_mask(cpu, *cpu_map) {
-		if (sched_group_phys_bycpu[cpu]) {
-			kfree(sched_group_phys_bycpu[cpu]);
-			sched_group_phys_bycpu[cpu] = NULL;
-		}
-#ifdef CONFIG_SCHED_MC
-		if (sched_group_core_bycpu[cpu]) {
-			kfree(sched_group_core_bycpu[cpu]);
-			sched_group_core_bycpu[cpu] = NULL;
-		}
-#endif
+
+/*
+ * Initialize sched groups cpu_power.
+ *
+ * cpu_power indicates the capacity of sched group, which is used while
+ * distributing the load between different sched groups in a sched domain.
+ * Typically cpu_power for all the groups in a sched domain will be same unless
+ * there are asymmetries in the topology. If there are asymmetries, group
+ * having more cpu_power will pickup more load compared to the group having
+ * less cpu_power.
+ *
+ * cpu_power will be a multiple of SCHED_LOAD_SCALE. This multiple represents
+ * the maximum number of tasks a group can handle in the presence of other idle
+ * or lightly loaded groups in the same sched domain.
+ */
+static void init_sched_groups_power(int cpu, struct sched_domain *sd)
+{
+	struct sched_domain *child;
+	struct sched_group *group;
+
+	WARN_ON(!sd || !sd->groups);
+
+	if (cpu != first_cpu(sd->groups->cpumask))
+		return;
+
+	child = sd->child;
+
+	/*
+	 * For perf policy, if the groups in child domain share resources
+	 * (for example cores sharing some portions of the cache hierarchy
+	 * or SMT), then set this domain groups cpu_power such that each group
+	 * can handle only one task, when there are other idle groups in the
+	 * same sched domain.
+	 */
+	if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) &&
+		       (child->flags &
+			(SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) {
+		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		return;
 	}
+
+	sd->groups->cpu_power = 0;
+
+	/*
+	 * add cpu_power of each child group to this groups cpu_power
+	 */
+	group = child->groups;
+	do {
+		sd->groups->cpu_power += group->cpu_power;
+		group = group->next;
+	} while (group != child->groups);
 }
 
 /*
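The comment block above describes how group cpu_power is meant to scale; a small worked example may make the arithmetic concrete (the topology is hypothetical):

/*
 * Hypothetical topology: one physical package containing two cores, each
 * core-level group already initialized to SCHED_LOAD_SCALE.
 *
 * Power savings policy (SD_POWERSAVINGS_BALANCE set on the package domain):
 * the summing loop runs, so
 *
 *	package group cpu_power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE
 *				= 2 * SCHED_LOAD_SCALE
 *
 * i.e. the package-level group may absorb two tasks before load spills over
 * to another (possibly idle) package.
 *
 * Default perf policy with the cores sharing package resources
 * (SD_SHARE_PKG_RESOURCES): the early return clamps the group to a single
 * SCHED_LOAD_SCALE, so only one task is placed there while other groups in
 * the same domain sit idle.
 */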
@@ -6233,10 +6317,7 @@ next_sg:
 static int build_sched_domains(const cpumask_t *cpu_map)
 {
 	int i;
-	struct sched_group *sched_group_phys = NULL;
-#ifdef CONFIG_SCHED_MC
-	struct sched_group *sched_group_core = NULL;
-#endif
+	struct sched_domain *sd;
 #ifdef CONFIG_NUMA
 	struct sched_group **sched_group_nodes = NULL;
 	struct sched_group *sched_group_allnodes = NULL;
@@ -6268,9 +6349,10 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			> SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) {
 			if (!sched_group_allnodes) {
 				sched_group_allnodes
-					= kmalloc(sizeof(struct sched_group)
+					= kmalloc_node(sizeof(struct sched_group)
 						  * MAX_NUMNODES,
-						  GFP_KERNEL);
+						  GFP_KERNEL,
+						  cpu_to_node(i));
 				if (!sched_group_allnodes) {
 					printk(KERN_WARNING
 					"Can not alloc allnodes sched group\n");
@@ -6282,7 +6364,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			sd = &per_cpu(allnodes_domains, i);
 			*sd = SD_ALLNODES_INIT;
 			sd->span = *cpu_map;
-			group = cpu_to_allnodes_group(i);
+			group = cpu_to_allnodes_group(i, cpu_map);
 			sd->groups = &sched_group_allnodes[group];
 			p = sd;
 		} else
@@ -6292,60 +6374,42 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		*sd = SD_NODE_INIT;
 		sd->span = sched_domain_node_span(cpu_to_node(i));
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		cpus_and(sd->span, sd->span, *cpu_map);
 #endif
 
-		if (!sched_group_phys) {
-			sched_group_phys
-				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
-					  GFP_KERNEL);
-			if (!sched_group_phys) {
-				printk (KERN_WARNING "Can not alloc phys sched"
-						"group\n");
-				goto error;
-			}
-			sched_group_phys_bycpu[i] = sched_group_phys;
-		}
-
 		p = sd;
 		sd = &per_cpu(phys_domains, i);
-		group = cpu_to_phys_group(i);
+		group = cpu_to_phys_group(i, cpu_map);
 		*sd = SD_CPU_INIT;
 		sd->span = nodemask;
 		sd->parent = p;
+		if (p)
+			p->child = sd;
 		sd->groups = &sched_group_phys[group];
 
 #ifdef CONFIG_SCHED_MC
-		if (!sched_group_core) {
-			sched_group_core
-				= kmalloc(sizeof(struct sched_group) * NR_CPUS,
-					  GFP_KERNEL);
-			if (!sched_group_core) {
-				printk (KERN_WARNING "Can not alloc core sched"
-						"group\n");
-				goto error;
-			}
-			sched_group_core_bycpu[i] = sched_group_core;
-		}
-
 		p = sd;
 		sd = &per_cpu(core_domains, i);
-		group = cpu_to_core_group(i);
+		group = cpu_to_core_group(i, cpu_map);
 		*sd = SD_MC_INIT;
 		sd->span = cpu_coregroup_map(i);
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_core[group];
 #endif
 
 #ifdef CONFIG_SCHED_SMT
 		p = sd;
 		sd = &per_cpu(cpu_domains, i);
-		group = cpu_to_cpu_group(i);
+		group = cpu_to_cpu_group(i, cpu_map);
 		*sd = SD_SIBLING_INIT;
 		sd->span = cpu_sibling_map[i];
 		cpus_and(sd->span, sd->span, *cpu_map);
 		sd->parent = p;
+		p->child = sd;
 		sd->groups = &sched_group_cpus[group];
 #endif
 	}
@@ -6359,7 +6423,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			continue;
 
 		init_sched_build_groups(sched_group_cpus, this_sibling_map,
-						&cpu_to_cpu_group);
+					cpu_map, &cpu_to_cpu_group);
 	}
 #endif
 
@@ -6371,7 +6435,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		if (i != first_cpu(this_core_map))
 			continue;
 		init_sched_build_groups(sched_group_core, this_core_map,
-						&cpu_to_core_group);
+					cpu_map, &cpu_to_core_group);
 	}
 #endif
 
@@ -6385,14 +6449,14 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 			continue;
 
 		init_sched_build_groups(sched_group_phys, nodemask,
-						&cpu_to_phys_group);
+					cpu_map, &cpu_to_phys_group);
 	}
 
 #ifdef CONFIG_NUMA
 	/* Set up node groups */
 	if (sched_group_allnodes)
 		init_sched_build_groups(sched_group_allnodes, *cpu_map,
-					&cpu_to_allnodes_group);
+					cpu_map, &cpu_to_allnodes_group);
 
 	for (i = 0; i < MAX_NUMNODES; i++) {
 		/* Set up node groups */
@@ -6464,72 +6528,20 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 	/* Calculate CPU power for physical packages and nodes */
 #ifdef CONFIG_SCHED_SMT
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
 		sd = &per_cpu(cpu_domains, i);
-		sd->groups->cpu_power = SCHED_LOAD_SCALE;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 #ifdef CONFIG_SCHED_MC
 	for_each_cpu_mask(i, *cpu_map) {
-		int power;
-		struct sched_domain *sd;
 		sd = &per_cpu(core_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
-						* SCHED_LOAD_SCALE / 10;
-		sd->groups->cpu_power = power;
+		init_sched_groups_power(i, sd);
 	}
 #endif
 
 	for_each_cpu_mask(i, *cpu_map) {
-		struct sched_domain *sd;
-#ifdef CONFIG_SCHED_MC
 		sd = &per_cpu(phys_domains, i);
-		if (i != first_cpu(sd->groups->cpumask))
-			continue;
-
-		sd->groups->cpu_power = 0;
-		if (sched_mc_power_savings || sched_smt_power_savings) {
-			int j;
-
-			for_each_cpu_mask(j, sd->groups->cpumask) {
-				struct sched_domain *sd1;
-				sd1 = &per_cpu(core_domains, j);
-				/*
-				 * for each core we will add once
-				 * to the group in physical domain
-				 */
-				if (j != first_cpu(sd1->groups->cpumask))
-					continue;
-
-				if (sched_smt_power_savings)
-					sd->groups->cpu_power += sd1->groups->cpu_power;
-				else
-					sd->groups->cpu_power += SCHED_LOAD_SCALE;
-			}
-		} else
-			/*
-			 * This has to be < 2 * SCHED_LOAD_SCALE
-			 * Lets keep it SCHED_LOAD_SCALE, so that
-			 * while calculating NUMA group's cpu_power
-			 * we can simply do
-			 *  numa_group->cpu_power += phys_group->cpu_power;
-			 *
-			 * See "only add power once for each physical pkg"
-			 * comment below
-			 */
-			sd->groups->cpu_power = SCHED_LOAD_SCALE;
-#else
-		int power;
-		sd = &per_cpu(phys_domains, i);
-		if (sched_smt_power_savings)
-			power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
-		else
-			power = SCHED_LOAD_SCALE;
-		sd->groups->cpu_power = power;
-#endif
+		init_sched_groups_power(i, sd);
 	}
 
 #ifdef CONFIG_NUMA
@@ -6537,7 +6549,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 		init_numa_sched_groups_power(sched_group_nodes[i]);
 
 	if (sched_group_allnodes) {
-		int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
+		int group = cpu_to_allnodes_group(first_cpu(*cpu_map), cpu_map);
 		struct sched_group *sg = &sched_group_allnodes[group];
 
 		init_numa_sched_groups_power(sg);
@@ -6563,9 +6575,11 @@ static int build_sched_domains(const cpumask_t *cpu_map)
 
 	return 0;
 
+#ifdef CONFIG_NUMA
 error:
 	free_sched_groups(cpu_map);
 	return -ENOMEM;
+#endif
 }
 /*
  * Set up scheduler domains and groups. Callers must hold the hotplug lock.
@@ -6747,11 +6761,20 @@ static int update_sched_domains(struct notifier_block *nfb,
 
 void __init sched_init_smp(void)
 {
+	cpumask_t non_isolated_cpus;
+
 	lock_cpu_hotplug();
 	arch_init_sched_domains(&cpu_online_map);
+	cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map);
+	if (cpus_empty(non_isolated_cpus))
+		cpu_set(smp_processor_id(), non_isolated_cpus);
 	unlock_cpu_hotplug();
 	/* XXX: Theoretical race here - CPU may be hotplugged now */
 	hotcpu_notifier(update_sched_domains, 0);
+
+	/* Move init over to a non-isolated CPU */
+	if (set_cpus_allowed(current, non_isolated_cpus) < 0)
+		BUG();
 }
 #else
 void __init sched_init_smp(void)