aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/cpusets.txt16
-rw-r--r--kernel/cpuset.c89
2 files changed, 92 insertions, 13 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt
index 2f8f24eaefd9..ad944c060312 100644
--- a/Documentation/cpusets.txt
+++ b/Documentation/cpusets.txt
@@ -51,6 +51,14 @@ mems_allowed vector.
51 51
52If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct 52If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct
53ancestor or descendent, may share any of the same CPUs or Memory Nodes. 53ancestor or descendent, may share any of the same CPUs or Memory Nodes.
54A cpuset that is cpu exclusive has a sched domain associated with it.
55The sched domain consists of all cpus in the current cpuset that are not
56part of any exclusive child cpusets.
57This ensures that the scheduler load balacing code only balances
58against the cpus that are in the sched domain as defined above and not
59all of the cpus in the system. This removes any overhead due to
60load balancing code trying to pull tasks outside of the cpu exclusive
61cpuset only to be prevented by the tasks' cpus_allowed mask.
54 62
55User level code may create and destroy cpusets by name in the cpuset 63User level code may create and destroy cpusets by name in the cpuset
56virtual file system, manage the attributes and permissions of these 64virtual file system, manage the attributes and permissions of these
@@ -84,6 +92,9 @@ This can be especially valuable on:
84 and a database), or 92 and a database), or
85 * NUMA systems running large HPC applications with demanding 93 * NUMA systems running large HPC applications with demanding
86 performance characteristics. 94 performance characteristics.
95 * Also cpu_exclusive cpusets are useful for servers running orthogonal
96 workloads such as RT applications requiring low latency and HPC
97 applications that are throughput sensitive
87 98
88These subsets, or "soft partitions" must be able to be dynamically 99These subsets, or "soft partitions" must be able to be dynamically
89adjusted, as the job mix changes, without impacting other concurrently 100adjusted, as the job mix changes, without impacting other concurrently
@@ -125,6 +136,8 @@ Cpusets extends these two mechanisms as follows:
125 - A cpuset may be marked exclusive, which ensures that no other 136 - A cpuset may be marked exclusive, which ensures that no other
126 cpuset (except direct ancestors and descendents) may contain 137 cpuset (except direct ancestors and descendents) may contain
127 any overlapping CPUs or Memory Nodes. 138 any overlapping CPUs or Memory Nodes.
139 Also a cpu_exclusive cpuset would be associated with a sched
140 domain.
128 - You can list all the tasks (by pid) attached to any cpuset. 141 - You can list all the tasks (by pid) attached to any cpuset.
129 142
130The implementation of cpusets requires a few, simple hooks 143The implementation of cpusets requires a few, simple hooks
@@ -136,6 +149,9 @@ into the rest of the kernel, none in performance critical paths:
136 allowed in that tasks cpuset. 149 allowed in that tasks cpuset.
137 - in sched.c migrate_all_tasks(), to keep migrating tasks within 150 - in sched.c migrate_all_tasks(), to keep migrating tasks within
138 the CPUs allowed by their cpuset, if possible. 151 the CPUs allowed by their cpuset, if possible.
152 - in sched.c, a new API partition_sched_domains for handling
153 sched domain changes associated with cpu_exclusive cpusets
154 and related changes in both sched.c and arch/ia64/kernel/domain.c
139 - in the mbind and set_mempolicy system calls, to mask the requested 155 - in the mbind and set_mempolicy system calls, to mask the requested
140 Memory Nodes by what's allowed in that tasks cpuset. 156 Memory Nodes by what's allowed in that tasks cpuset.
141 - in page_alloc, to restrict memory to allowed nodes. 157 - in page_alloc, to restrict memory to allowed nodes.
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 79dd929f4084..984c0bf3807f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
595 return 0; 595 return 0;
596} 596}
597 597
598/*
599 * For a given cpuset cur, partition the system as follows
600 * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
601 * exclusive child cpusets
602 * b. All cpus in the current cpuset's cpus_allowed that are not part of any
603 * exclusive child cpusets
604 * Build these two partitions by calling partition_sched_domains
605 *
606 * Call with cpuset_sem held. May nest a call to the
607 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
608 */
609static void update_cpu_domains(struct cpuset *cur)
610{
611 struct cpuset *c, *par = cur->parent;
612 cpumask_t pspan, cspan;
613
614 if (par == NULL || cpus_empty(cur->cpus_allowed))
615 return;
616
617 /*
618 * Get all cpus from parent's cpus_allowed not part of exclusive
619 * children
620 */
621 pspan = par->cpus_allowed;
622 list_for_each_entry(c, &par->children, sibling) {
623 if (is_cpu_exclusive(c))
624 cpus_andnot(pspan, pspan, c->cpus_allowed);
625 }
626 if (is_removed(cur) || !is_cpu_exclusive(cur)) {
627 cpus_or(pspan, pspan, cur->cpus_allowed);
628 if (cpus_equal(pspan, cur->cpus_allowed))
629 return;
630 cspan = CPU_MASK_NONE;
631 } else {
632 if (cpus_empty(pspan))
633 return;
634 cspan = cur->cpus_allowed;
635 /*
636 * Get all cpus from current cpuset's cpus_allowed not part
637 * of exclusive children
638 */
639 list_for_each_entry(c, &cur->children, sibling) {
640 if (is_cpu_exclusive(c))
641 cpus_andnot(cspan, cspan, c->cpus_allowed);
642 }
643 }
644
645 lock_cpu_hotplug();
646 partition_sched_domains(&pspan, &cspan);
647 unlock_cpu_hotplug();
648}
649
598static int update_cpumask(struct cpuset *cs, char *buf) 650static int update_cpumask(struct cpuset *cs, char *buf)
599{ 651{
600 struct cpuset trialcs; 652 struct cpuset trialcs;
601 int retval; 653 int retval, cpus_unchanged;
602 654
603 trialcs = *cs; 655 trialcs = *cs;
604 retval = cpulist_parse(buf, trialcs.cpus_allowed); 656 retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
608 if (cpus_empty(trialcs.cpus_allowed)) 660 if (cpus_empty(trialcs.cpus_allowed))
609 return -ENOSPC; 661 return -ENOSPC;
610 retval = validate_change(cs, &trialcs); 662 retval = validate_change(cs, &trialcs);
611 if (retval == 0) 663 if (retval < 0)
612 cs->cpus_allowed = trialcs.cpus_allowed; 664 return retval;
613 return retval; 665 cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
666 cs->cpus_allowed = trialcs.cpus_allowed;
667 if (is_cpu_exclusive(cs) && !cpus_unchanged)
668 update_cpu_domains(cs);
669 return 0;
614} 670}
615 671
616static int update_nodemask(struct cpuset *cs, char *buf) 672static int update_nodemask(struct cpuset *cs, char *buf)
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
646{ 702{
647 int turning_on; 703 int turning_on;
648 struct cpuset trialcs; 704 struct cpuset trialcs;
649 int err; 705 int err, cpu_exclusive_changed;
650 706
651 turning_on = (simple_strtoul(buf, NULL, 10) != 0); 707 turning_on = (simple_strtoul(buf, NULL, 10) != 0);
652 708
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
657 clear_bit(bit, &trialcs.flags); 713 clear_bit(bit, &trialcs.flags);
658 714
659 err = validate_change(cs, &trialcs); 715 err = validate_change(cs, &trialcs);
660 if (err == 0) { 716 if (err < 0)
661 if (turning_on) 717 return err;
662 set_bit(bit, &cs->flags); 718 cpu_exclusive_changed =
663 else 719 (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
664 clear_bit(bit, &cs->flags); 720 if (turning_on)
665 } 721 set_bit(bit, &cs->flags);
666 return err; 722 else
723 clear_bit(bit, &cs->flags);
724
725 if (cpu_exclusive_changed)
726 update_cpu_domains(cs);
727 return 0;
667} 728}
668 729
669static int attach_task(struct cpuset *cs, char *buf) 730static int attach_task(struct cpuset *cs, char *buf)
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1309 up(&cpuset_sem); 1370 up(&cpuset_sem);
1310 return -EBUSY; 1371 return -EBUSY;
1311 } 1372 }
1312 spin_lock(&cs->dentry->d_lock);
1313 parent = cs->parent; 1373 parent = cs->parent;
1314 set_bit(CS_REMOVED, &cs->flags); 1374 set_bit(CS_REMOVED, &cs->flags);
1375 if (is_cpu_exclusive(cs))
1376 update_cpu_domains(cs);
1315 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1377 list_del(&cs->sibling); /* delete my sibling from parent->children */
1316 if (list_empty(&parent->children)) 1378 if (list_empty(&parent->children))
1317 check_for_release(parent); 1379 check_for_release(parent);
1380 spin_lock(&cs->dentry->d_lock);
1318 d = dget(cs->dentry); 1381 d = dget(cs->dentry);
1319 cs->dentry = NULL; 1382 cs->dentry = NULL;
1320 spin_unlock(&d->d_lock); 1383 spin_unlock(&d->d_lock);