/* * arch/ia64/kernel/domain.c * Architecture specific sched-domains builder. * * Copyright (C) 2004 Jesse Barnes * Copyright (C) 2004 Silicon Graphics, Inc. */ #include <linux/sched.h> #include <linux/percpu.h> #include <linux/slab.h> #include <linux/cpumask.h> #include <linux/init.h> #include <linux/topology.h> #include <linux/nodemask.h> #define SD_NODES_PER_DOMAIN 16 #ifdef CONFIG_NUMA /** * find_next_best_node - find the next node to include in a sched_domain * @node: node whose sched_domain we're building * @used_nodes: nodes already in the sched_domain * * Find the next node to include in a given scheduling domain. Simply * finds the closest node not already in the @used_nodes map. * * Should use nodemask_t. */ static int find_next_best_node(int node, unsigned long *used_nodes) { int i, n, val, min_val, best_node = 0; min_val = INT_MAX; for (i = 0; i < MAX_NUMNODES; i++) { /* Start at @node */ n = (node + i) % MAX_NUMNODES; if (!nr_cpus_node(n)) continue; /* Skip already used nodes */ if (test_bit(n, used_nodes)) continue; /* Simple min distance search */ val = node_distance(node, n); if (val < min_val) { min_val = val; best_node = n; } } set_bit(best_node, used_nodes); return best_node; } /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing * @size: number of nodes to include in this span * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks * out optimally. */ static cpumask_t sched_domain_node_span(int node) { int i; cpumask_t span, nodemask; DECLARE_BITMAP(used_nodes, MAX_NUMNODES); cpus_clear(span); bitmap_zero(used_nodes, MAX_NUMNODES); nodemask = node_to_cpumask(node); cpus_or(span, span, nodemask); set_bit(node, used_nodes); for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { int next_node = find_next_best_node(node, used_nodes); nodemask = node_to_cpumask(next_node); cpus_or(span, span, nodemask); } return span; } #endif /* * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we * can switch it on easily if needed. */ #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; static int cpu_to_cpu_group(int cpu) { return cpu; } #endif static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); #else return cpu; #endif } #ifdef CONFIG_NUMA /* * The init_sched_build_groups can't handle what we want to do with node * groups, so roll our own. Now each node has its own list of groups which * gets dynamically allocated. */ static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group *sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, allnodes_domains); static struct sched_group sched_group_allnodes[MAX_NUMNODES]; static int cpu_to_allnodes_group(int cpu) { return cpu_to_node(cpu); } #endif /* * Build sched domains for a given set of cpus and attach the sched domains * to the individual cpus */ void build_sched_domains(const cpumask_t *cpu_map) { int i; /* * Set up domains for cpus specified by the cpu_map. */ for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA if (num_online_cpus() > SD_NODES_PER_DOMAIN*cpus_weight(nodemask)) { sd = &per_cpu(allnodes_domains, i); *sd = SD_ALLNODES_INIT; sd->span = *cpu_map; group = cpu_to_allnodes_group(i); sd->groups = &sched_group_allnodes[group]; p = sd; } else p = NULL; sd = &per_cpu(node_domains, i); *sd = SD_NODE_INIT; sd->span = sched_domain_node_span(cpu_to_node(i)); sd->parent = p; cpus_and(sd->span, sd->span, *cpu_map); #endif p = sd; sd = &per_cpu(phys_domains, i); group = cpu_to_phys_group(i); *sd = SD_CPU_INIT; sd->span = nodemask; sd->parent = p; sd->groups = &sched_group_phys[group]; #ifdef CONFIG_SCHED_SMT p = sd; sd = &per_cpu(cpu_domains, i); group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif } #ifdef CONFIG_SCHED_SMT /* Set up CPU (sibling) groups */ for_each_cpu_mask(i, *cpu_map) { cpumask_t this_sibling_map = cpu_sibling_map[i]; cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; init_sched_build_groups(sched_group_cpus, this_sibling_map, &cpu_to_cpu_group); } #endif /* Set up physical groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; init_sched_build_groups(sched_group_phys, nodemask, &cpu_to_phys_group); } #ifdef CONFIG_NUMA init_sched_build_groups(sched_group_allnodes, *cpu_map, &cpu_to_allnodes_group); for (i = 0; i < MAX_NUMNODES; i++) { /* Set up node groups */ struct sched_group *sg, *prev; cpumask_t nodemask = node_to_cpumask(i); cpumask_t domainspan; cpumask_t covered = CPU_MASK_NONE; int j; cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; domainspan = sched_domain_node_span(i); cpus_and(domainspan, domainspan, *cpu_map); sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); sched_group_nodes[i] = sg; for_each_cpu_mask(j, nodemask) { struct sched_domain *sd; sd = &per_cpu(node_domains, j); sd->groups = sg; if (sd->groups == NULL) { /* Turn off balancing if we have no groups */ sd->flags = 0; } } if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", i); continue; } sg->cpu_power = 0; sg->cpumask = nodemask; cpus_or(covered, covered, nodemask); prev = sg; for (j = 0; j < MAX_NUMNODES; j++) { cpumask_t tmp, notcovered; int n = (i + j) % MAX_NUMNODES; cpus_complement(notcovered, covered); cpus_and(tmp, notcovered, *cpu_map); cpus_and(tmp, tmp, domainspan); if (cpus_empty(tmp)) break; nodemask = node_to_cpumask(n); cpus_and(tmp, tmp, nodemask); if (cpus_empty(tmp)) continue; sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); if (!sg) { printk(KERN_WARNING "Can not alloc domain group for node %d\n", j); break; } sg->cpu_power = 0; sg->cpumask = tmp; cpus_or(covered, covered, tmp); prev->next = sg; prev = sg; } prev->next = sched_group_nodes[i]; } #endif /* Calculate CPU power for physical packages and nodes */ for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); power = SCHED_LOAD_SCALE; sd->groups->cpu_power = power; #endif sd = &per_cpu(phys_domains, i); power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; #ifdef CONFIG_NUMA sd = &per_cpu(allnodes_domains, i); if (sd->groups) { power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sd->groups->cpu_power = power; } #endif } #ifdef CONFIG_NUMA for (i = 0; i < MAX_NUMNODES; i++) { struct sched_group *sg = sched_group_nodes[i]; int j; if (sg == NULL) continue; next_sg: for_each_cpu_mask(j, sg->cpumask) { struct sched_domain *sd; int power; sd = &per_cpu(phys_domains, j); if (j != first_cpu(sd->groups->cpumask)) { /* * Only add "power" once for each * physical package. */ continue; } power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * (cpus_weight(sd->groups->cpumask)-1) / 10; sg->cpu_power += power; } sg = sg->next; if (sg != sched_group_nodes[i]) goto next_sg; } #endif /* Attach the domains */ for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); #else sd = &per_cpu(phys_domains, i); #endif cpu_attach_domain(sd, i); } } /* * Set up scheduler domains and groups. Callers must hold the hotplug lock. */ void arch_init_sched_domains(const cpumask_t *cpu_map) { cpumask_t cpu_default_map; /* * Setup mask for cpus without special case scheduling requirements. * For now this just excludes isolated cpus, but could be used to * exclude other special cases in the future. */ cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); build_sched_domains(&cpu_default_map); } void arch_destroy_sched_domains(const cpumask_t *cpu_map) { #ifdef CONFIG_NUMA int i; for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); struct sched_group *oldsg, *sg = sched_group_nodes[i]; cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; if (sg == NULL) continue; sg = sg->next; next_sg: oldsg = sg; sg = sg->next; kfree(oldsg); if (oldsg != sched_group_nodes[i]) goto next_sg; sched_group_nodes[i] = NULL; } #endif }