diff options
-rw-r--r-- | Documentation/cpusets.txt | 16 | ||||
-rw-r--r-- | kernel/cpuset.c | 89 |
2 files changed, 92 insertions, 13 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 2f8f24eaefd9..ad944c060312 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -51,6 +51,14 @@ mems_allowed vector. | |||
51 | 51 | ||
52 | If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct | 52 | If a cpuset is cpu or mem exclusive, no other cpuset, other than a direct |
53 | ancestor or descendent, may share any of the same CPUs or Memory Nodes. | 53 | ancestor or descendent, may share any of the same CPUs or Memory Nodes. |
54 | A cpuset that is cpu exclusive has a sched domain associated with it. | ||
55 | The sched domain consists of all cpus in the current cpuset that are not | ||
56 | part of any exclusive child cpusets. | ||
57 | This ensures that the scheduler load balancing code only balances | ||
58 | against the cpus that are in the sched domain as defined above and not | ||
59 | all of the cpus in the system. This removes any overhead due to | ||
60 | load balancing code trying to pull tasks outside of the cpu exclusive | ||
61 | cpuset only to be prevented by the tasks' cpus_allowed mask. | ||
54 | 62 | ||
55 | User level code may create and destroy cpusets by name in the cpuset | 63 | User level code may create and destroy cpusets by name in the cpuset |
56 | virtual file system, manage the attributes and permissions of these | 64 | virtual file system, manage the attributes and permissions of these |
@@ -84,6 +92,9 @@ This can be especially valuable on: | |||
84 | and a database), or | 92 | and a database), or |
85 | * NUMA systems running large HPC applications with demanding | 93 | * NUMA systems running large HPC applications with demanding |
86 | performance characteristics. | 94 | performance characteristics. |
95 | * Also cpu_exclusive cpusets are useful for servers running orthogonal | ||
96 | workloads such as RT applications requiring low latency and HPC | ||
97 | applications that are throughput sensitive. | ||
87 | 98 | ||
88 | These subsets, or "soft partitions" must be able to be dynamically | 99 | These subsets, or "soft partitions" must be able to be dynamically |
89 | adjusted, as the job mix changes, without impacting other concurrently | 100 | adjusted, as the job mix changes, without impacting other concurrently |
@@ -125,6 +136,8 @@ Cpusets extends these two mechanisms as follows: | |||
125 | - A cpuset may be marked exclusive, which ensures that no other | 136 | - A cpuset may be marked exclusive, which ensures that no other |
126 | cpuset (except direct ancestors and descendents) may contain | 137 | cpuset (except direct ancestors and descendents) may contain |
127 | any overlapping CPUs or Memory Nodes. | 138 | any overlapping CPUs or Memory Nodes. |
139 | Also a cpu_exclusive cpuset would be associated with a sched | ||
140 | domain. | ||
128 | - You can list all the tasks (by pid) attached to any cpuset. | 141 | - You can list all the tasks (by pid) attached to any cpuset. |
129 | 142 | ||
130 | The implementation of cpusets requires a few, simple hooks | 143 | The implementation of cpusets requires a few, simple hooks |
@@ -136,6 +149,9 @@ into the rest of the kernel, none in performance critical paths: | |||
136 | allowed in that tasks cpuset. | 149 | allowed in that tasks cpuset. |
137 | - in sched.c migrate_all_tasks(), to keep migrating tasks within | 150 | - in sched.c migrate_all_tasks(), to keep migrating tasks within |
138 | the CPUs allowed by their cpuset, if possible. | 151 | the CPUs allowed by their cpuset, if possible. |
152 | - in sched.c, a new API partition_sched_domains for handling | ||
153 | sched domain changes associated with cpu_exclusive cpusets | ||
154 | and related changes in both sched.c and arch/ia64/kernel/domain.c | ||
139 | - in the mbind and set_mempolicy system calls, to mask the requested | 155 | - in the mbind and set_mempolicy system calls, to mask the requested |
140 | Memory Nodes by what's allowed in that tasks cpuset. | 156 | Memory Nodes by what's allowed in that tasks cpuset. |
141 | - in page_alloc, to restrict memory to allowed nodes. | 157 | - in page_alloc, to restrict memory to allowed nodes. |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 79dd929f4084..984c0bf3807f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -595,10 +595,62 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
595 | return 0; | 595 | return 0; |
596 | } | 596 | } |
597 | 597 | ||
598 | /* | ||
599 | * For a given cpuset cur, partition the system as follows | ||
600 | * a. All cpus in the parent cpuset's cpus_allowed that are not part of any | ||
601 | * exclusive child cpusets | ||
602 | * b. All cpus in the current cpuset's cpus_allowed that are not part of any | ||
603 | * exclusive child cpusets | ||
604 | * Build these two partitions by calling partition_sched_domains | ||
605 | * | ||
606 | * Call with cpuset_sem held. May nest a call to the | ||
607 | * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | ||
608 | */ | ||
609 | static void update_cpu_domains(struct cpuset *cur) | ||
610 | { | ||
611 | struct cpuset *c, *par = cur->parent; | ||
612 | cpumask_t pspan, cspan; | ||
613 | |||
614 | if (par == NULL || cpus_empty(cur->cpus_allowed)) | ||
615 | return; | ||
616 | |||
617 | /* | ||
618 | * Get all cpus from parent's cpus_allowed not part of exclusive | ||
619 | * children | ||
620 | */ | ||
621 | pspan = par->cpus_allowed; | ||
622 | list_for_each_entry(c, &par->children, sibling) { | ||
623 | if (is_cpu_exclusive(c)) | ||
624 | cpus_andnot(pspan, pspan, c->cpus_allowed); | ||
625 | } | ||
626 | if (is_removed(cur) || !is_cpu_exclusive(cur)) { | ||
627 | cpus_or(pspan, pspan, cur->cpus_allowed); | ||
628 | if (cpus_equal(pspan, cur->cpus_allowed)) | ||
629 | return; | ||
630 | cspan = CPU_MASK_NONE; | ||
631 | } else { | ||
632 | if (cpus_empty(pspan)) | ||
633 | return; | ||
634 | cspan = cur->cpus_allowed; | ||
635 | /* | ||
636 | * Get all cpus from current cpuset's cpus_allowed not part | ||
637 | * of exclusive children | ||
638 | */ | ||
639 | list_for_each_entry(c, &cur->children, sibling) { | ||
640 | if (is_cpu_exclusive(c)) | ||
641 | cpus_andnot(cspan, cspan, c->cpus_allowed); | ||
642 | } | ||
643 | } | ||
644 | |||
645 | lock_cpu_hotplug(); | ||
646 | partition_sched_domains(&pspan, &cspan); | ||
647 | unlock_cpu_hotplug(); | ||
648 | } | ||
649 | |||
598 | static int update_cpumask(struct cpuset *cs, char *buf) | 650 | static int update_cpumask(struct cpuset *cs, char *buf) |
599 | { | 651 | { |
600 | struct cpuset trialcs; | 652 | struct cpuset trialcs; |
601 | int retval; | 653 | int retval, cpus_unchanged; |
602 | 654 | ||
603 | trialcs = *cs; | 655 | trialcs = *cs; |
604 | retval = cpulist_parse(buf, trialcs.cpus_allowed); | 656 | retval = cpulist_parse(buf, trialcs.cpus_allowed); |
@@ -608,9 +660,13 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
608 | if (cpus_empty(trialcs.cpus_allowed)) | 660 | if (cpus_empty(trialcs.cpus_allowed)) |
609 | return -ENOSPC; | 661 | return -ENOSPC; |
610 | retval = validate_change(cs, &trialcs); | 662 | retval = validate_change(cs, &trialcs); |
611 | if (retval == 0) | 663 | if (retval < 0) |
612 | cs->cpus_allowed = trialcs.cpus_allowed; | 664 | return retval; |
613 | return retval; | 665 | cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); |
666 | cs->cpus_allowed = trialcs.cpus_allowed; | ||
667 | if (is_cpu_exclusive(cs) && !cpus_unchanged) | ||
668 | update_cpu_domains(cs); | ||
669 | return 0; | ||
614 | } | 670 | } |
615 | 671 | ||
616 | static int update_nodemask(struct cpuset *cs, char *buf) | 672 | static int update_nodemask(struct cpuset *cs, char *buf) |
@@ -646,7 +702,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
646 | { | 702 | { |
647 | int turning_on; | 703 | int turning_on; |
648 | struct cpuset trialcs; | 704 | struct cpuset trialcs; |
649 | int err; | 705 | int err, cpu_exclusive_changed; |
650 | 706 | ||
651 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | 707 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); |
652 | 708 | ||
@@ -657,13 +713,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
657 | clear_bit(bit, &trialcs.flags); | 713 | clear_bit(bit, &trialcs.flags); |
658 | 714 | ||
659 | err = validate_change(cs, &trialcs); | 715 | err = validate_change(cs, &trialcs); |
660 | if (err == 0) { | 716 | if (err < 0) |
661 | if (turning_on) | 717 | return err; |
662 | set_bit(bit, &cs->flags); | 718 | cpu_exclusive_changed = |
663 | else | 719 | (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs)); |
664 | clear_bit(bit, &cs->flags); | 720 | if (turning_on) |
665 | } | 721 | set_bit(bit, &cs->flags); |
666 | return err; | 722 | else |
723 | clear_bit(bit, &cs->flags); | ||
724 | |||
725 | if (cpu_exclusive_changed) | ||
726 | update_cpu_domains(cs); | ||
727 | return 0; | ||
667 | } | 728 | } |
668 | 729 | ||
669 | static int attach_task(struct cpuset *cs, char *buf) | 730 | static int attach_task(struct cpuset *cs, char *buf) |
@@ -1309,12 +1370,14 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) | |||
1309 | up(&cpuset_sem); | 1370 | up(&cpuset_sem); |
1310 | return -EBUSY; | 1371 | return -EBUSY; |
1311 | } | 1372 | } |
1312 | spin_lock(&cs->dentry->d_lock); | ||
1313 | parent = cs->parent; | 1373 | parent = cs->parent; |
1314 | set_bit(CS_REMOVED, &cs->flags); | 1374 | set_bit(CS_REMOVED, &cs->flags); |
1375 | if (is_cpu_exclusive(cs)) | ||
1376 | update_cpu_domains(cs); | ||
1315 | list_del(&cs->sibling); /* delete my sibling from parent->children */ | 1377 | list_del(&cs->sibling); /* delete my sibling from parent->children */ |
1316 | if (list_empty(&parent->children)) | 1378 | if (list_empty(&parent->children)) |
1317 | check_for_release(parent); | 1379 | check_for_release(parent); |
1380 | spin_lock(&cs->dentry->d_lock); | ||
1318 | d = dget(cs->dentry); | 1381 | d = dget(cs->dentry); |
1319 | cs->dentry = NULL; | 1382 | cs->dentry = NULL; |
1320 | spin_unlock(&d->d_lock); | 1383 | spin_unlock(&d->d_lock); |