diff options
-rw-r--r-- | Documentation/cpusets.txt | 141 | ||||
-rw-r--r-- | include/linux/sched.h | 2 | ||||
-rw-r--r-- | kernel/cpuset.c | 275 | ||||
-rw-r--r-- | kernel/sched.c | 95 |
4 files changed, 492 insertions, 21 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 85eeab5e7e3..141bef1c859 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -19,7 +19,8 @@ CONTENTS: | |||
19 | 1.4 What are exclusive cpusets ? | 19 | 1.4 What are exclusive cpusets ? |
20 | 1.5 What is memory_pressure ? | 20 | 1.5 What is memory_pressure ? |
21 | 1.6 What is memory spread ? | 21 | 1.6 What is memory spread ? |
22 | 1.7 How do I use cpusets ? | 22 | 1.7 What is sched_load_balance ? |
23 | 1.8 How do I use cpusets ? | ||
23 | 2. Usage Examples and Syntax | 24 | 2. Usage Examples and Syntax |
24 | 2.1 Basic Usage | 25 | 2.1 Basic Usage |
25 | 2.2 Adding/removing cpus | 26 | 2.2 Adding/removing cpus |
@@ -359,8 +360,144 @@ policy, especially for jobs that might have one thread reading in the | |||
359 | data set, the memory allocation across the nodes in the jobs cpuset | 360 | data set, the memory allocation across the nodes in the jobs cpuset |
360 | can become very uneven. | 361 | can become very uneven. |
361 | 362 | ||
363 | 1.7 What is sched_load_balance ? | ||
364 | -------------------------------- | ||
362 | 365 | ||
363 | 1.7 How do I use cpusets ? | 366 | The kernel scheduler (kernel/sched.c) automatically load balances |
367 | tasks. If one CPU is underutilized, kernel code running on that | ||
368 | CPU will look for tasks on other more overloaded CPUs and move those | ||
369 | tasks to itself, within the constraints of such placement mechanisms | ||
370 | as cpusets and sched_setaffinity. | ||
371 | |||
372 | The algorithmic cost of load balancing and its impact on key shared | ||
373 | kernel data structures such as the task list increases more than | ||
374 | linearly with the number of CPUs being balanced. So the scheduler | ||
375 | has support to partition the system's CPUs into a number of sched | ||
376 | domains such that it only load balances within each sched domain. | ||
377 | Each sched domain covers some subset of the CPUs in the system; | ||
378 | no two sched domains overlap; some CPUs might not be in any sched | ||
379 | domain and hence won't be load balanced. | ||
380 | |||
381 | Put simply, it costs less to balance between two smaller sched domains | ||
382 | than one big one, but doing so means that overloads in one of the | ||
383 | two domains won't be load balanced to the other one. | ||
384 | |||
385 | By default, there is one sched domain covering all CPUs, except those | ||
386 | marked isolated using the kernel boot time "isolcpus=" argument. | ||
387 | |||
388 | This default load balancing across all CPUs is not well suited for | ||
389 | the following two situations: | ||
390 | 1) On large systems, load balancing across many CPUs is expensive. | ||
391 | If the system is managed using cpusets to place independent jobs | ||
392 | on separate sets of CPUs, full load balancing is unnecessary. | ||
393 | 2) Systems supporting realtime on some CPUs need to minimize | ||
394 | system overhead on those CPUs, including avoiding task load | ||
395 | balancing if that is not needed. | ||
396 | |||
397 | When the per-cpuset flag "sched_load_balance" is enabled (the default | ||
398 | setting), it requests that all the CPUs in that cpuset's allowed 'cpus' | ||
399 | be contained in a single sched domain, ensuring that load balancing | ||
400 | can move a task (not otherwise pinned, as by sched_setaffinity) | ||
401 | from any CPU in that cpuset to any other. | ||
402 | |||
403 | When the per-cpuset flag "sched_load_balance" is disabled, then the | ||
404 | scheduler will avoid load balancing across the CPUs in that cpuset, | ||
405 | --except-- in so far as is necessary because some overlapping cpuset | ||
406 | has "sched_load_balance" enabled. | ||
407 | |||
408 | So, for example, if the top cpuset has the flag "sched_load_balance" | ||
409 | enabled, then the scheduler will have one sched domain covering all | ||
410 | CPUs, and the setting of the "sched_load_balance" flag in any other | ||
411 | cpusets won't matter, as we're already fully load balancing. | ||
412 | |||
413 | Therefore in the above two situations, the top cpuset flag | ||
414 | "sched_load_balance" should be disabled, and only some of the smaller, | ||
415 | child cpusets have this flag enabled. | ||
416 | |||
417 | When doing this, you don't usually want to leave any unpinned tasks in | ||
418 | the top cpuset that might use non-trivial amounts of CPU, as such tasks | ||
419 | may be artificially constrained to some subset of CPUs, depending on | ||
420 | the particulars of this flag setting in descendent cpusets. Even if | ||
421 | such a task could use spare CPU cycles in some other CPUs, the kernel | ||
422 | scheduler might not consider the possibility of load balancing that | ||
423 | task to that underused CPU. | ||
424 | |||
425 | Of course, tasks pinned to a particular CPU can be left in a cpuset | ||
426 | that disables "sched_load_balance" as those tasks aren't going anywhere | ||
427 | else anyway. | ||
428 | |||
429 | There is an impedance mismatch here, between cpusets and sched domains. | ||
430 | Cpusets are hierarchical and nest. Sched domains are flat; they don't | ||
431 | overlap and each CPU is in at most one sched domain. | ||
432 | |||
433 | It is necessary for sched domains to be flat because load balancing | ||
434 | across partially overlapping sets of CPUs would risk unstable dynamics | ||
435 | that would be beyond our understanding. So if each of two partially | ||
436 | overlapping cpusets enables the flag 'sched_load_balance', then we | ||
437 | form a single sched domain that is a superset of both. We won't move | ||
438 | a task to a CPU outside its cpuset, but the scheduler load balancing | ||
439 | code might waste some compute cycles considering that possibility. | ||
440 | |||
441 | This mismatch is why there is not a simple one-to-one relation | ||
442 | between which cpusets have the flag "sched_load_balance" enabled, | ||
443 | and the sched domain configuration. If a cpuset enables the flag, it | ||
444 | will get balancing across all its CPUs, but if it disables the flag, | ||
445 | it will only be assured of no load balancing if no other overlapping | ||
446 | cpuset enables the flag. | ||
447 | |||
448 | If two cpusets have partially overlapping 'cpus' allowed, and only | ||
449 | one of them has this flag enabled, then the other may find its | ||
450 | tasks only partially load balanced, just on the overlapping CPUs. | ||
451 | This is just the general case of the top_cpuset example given a few | ||
452 | paragraphs above. In the general case, as in the top cpuset case, | ||
453 | don't leave tasks that might use non-trivial amounts of CPU in | ||
454 | such partially load balanced cpusets, as they may be artificially | ||
455 | constrained to some subset of the CPUs allowed to them, for lack of | ||
456 | load balancing to the other CPUs. | ||
457 | |||
458 | 1.7.1 sched_load_balance implementation details. | ||
459 | ------------------------------------------------ | ||
460 | |||
461 | The per-cpuset flag 'sched_load_balance' defaults to enabled (contrary | ||
462 | to most cpuset flags.) When enabled for a cpuset, the kernel will | ||
463 | ensure that it can load balance across all the CPUs in that cpuset | ||
464 | (makes sure that all the CPUs in the cpus_allowed of that cpuset are | ||
465 | in the same sched domain.) | ||
466 | |||
467 | If two overlapping cpusets both have 'sched_load_balance' enabled, | ||
468 | then they will be (must be) both in the same sched domain. | ||
469 | |||
470 | If, as is the default, the top cpuset has 'sched_load_balance' enabled, | ||
471 | then by the above that means there is a single sched domain covering | ||
472 | the whole system, regardless of any other cpuset settings. | ||
473 | |||
474 | The kernel commits to user space that it will avoid load balancing | ||
475 | where it can. It will pick as fine a granularity partition of sched | ||
476 | domains as it can while still providing load balancing for any set | ||
477 | of CPUs allowed to a cpuset having 'sched_load_balance' enabled. | ||
478 | |||
479 | The internal kernel cpuset to scheduler interface passes from the | ||
480 | cpuset code to the scheduler code a partition of the load balanced | ||
481 | CPUs in the system. This partition is a set of subsets (represented | ||
482 | as an array of cpumask_t) of CPUs, pairwise disjoint, that cover all | ||
483 | the CPUs that must be load balanced. | ||
484 | |||
485 | Whenever the 'sched_load_balance' flag changes, or CPUs come or go | ||
486 | from a cpuset with this flag enabled, or a cpuset with this flag | ||
487 | enabled is removed, the cpuset code builds a new such partition and | ||
488 | passes it to the scheduler sched domain setup code, to have the sched | ||
489 | domains rebuilt as necessary. | ||
490 | |||
491 | This partition exactly defines what sched domains the scheduler should | ||
492 | set up - one sched domain for each element (cpumask_t) in the partition. | ||
493 | |||
494 | The scheduler remembers the currently active sched domain partitions. | ||
495 | When the scheduler routine partition_sched_domains() is invoked from | ||
496 | the cpuset code to update these sched domains, it compares the new | ||
497 | partition requested with the current, and updates its sched domains, | ||
498 | removing the old and adding the new, for each change. | ||
499 | |||
500 | 1.8 How do I use cpusets ? | ||
364 | -------------------------- | 501 | -------------------------- |
365 | 502 | ||
366 | In order to minimize the impact of cpusets on critical kernel | 503 | In order to minimize the impact of cpusets on critical kernel |
diff --git a/include/linux/sched.h b/include/linux/sched.h index cbd8731a66e..4bbbe12880d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -737,6 +737,8 @@ struct sched_domain { | |||
737 | #endif | 737 | #endif |
738 | }; | 738 | }; |
739 | 739 | ||
740 | extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new); | ||
741 | |||
740 | #endif /* CONFIG_SMP */ | 742 | #endif /* CONFIG_SMP */ |
741 | 743 | ||
742 | /* | 744 | /* |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1133062395e..203ca52e78d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -4,7 +4,7 @@ | |||
4 | * Processor and Memory placement constraints for sets of tasks. | 4 | * Processor and Memory placement constraints for sets of tasks. |
5 | * | 5 | * |
6 | * Copyright (C) 2003 BULL SA. | 6 | * Copyright (C) 2003 BULL SA. |
7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. | 7 | * Copyright (C) 2004-2007 Silicon Graphics, Inc. |
8 | * Copyright (C) 2006 Google, Inc | 8 | * Copyright (C) 2006 Google, Inc |
9 | * | 9 | * |
10 | * Portions derived from Patrick Mochel's sysfs code. | 10 | * Portions derived from Patrick Mochel's sysfs code. |
@@ -54,6 +54,7 @@ | |||
54 | #include <asm/uaccess.h> | 54 | #include <asm/uaccess.h> |
55 | #include <asm/atomic.h> | 55 | #include <asm/atomic.h> |
56 | #include <linux/mutex.h> | 56 | #include <linux/mutex.h> |
57 | #include <linux/kfifo.h> | ||
57 | 58 | ||
58 | /* | 59 | /* |
59 | * Tracks how many cpusets are currently defined in system. | 60 | * Tracks how many cpusets are currently defined in system. |
@@ -91,6 +92,9 @@ struct cpuset { | |||
91 | int mems_generation; | 92 | int mems_generation; |
92 | 93 | ||
93 | struct fmeter fmeter; /* memory_pressure filter */ | 94 | struct fmeter fmeter; /* memory_pressure filter */ |
95 | |||
96 | /* partition number for rebuild_sched_domains() */ | ||
97 | int pn; | ||
94 | }; | 98 | }; |
95 | 99 | ||
96 | /* Retrieve the cpuset for a cgroup */ | 100 | /* Retrieve the cpuset for a cgroup */ |
@@ -113,6 +117,7 @@ typedef enum { | |||
113 | CS_CPU_EXCLUSIVE, | 117 | CS_CPU_EXCLUSIVE, |
114 | CS_MEM_EXCLUSIVE, | 118 | CS_MEM_EXCLUSIVE, |
115 | CS_MEMORY_MIGRATE, | 119 | CS_MEMORY_MIGRATE, |
120 | CS_SCHED_LOAD_BALANCE, | ||
116 | CS_SPREAD_PAGE, | 121 | CS_SPREAD_PAGE, |
117 | CS_SPREAD_SLAB, | 122 | CS_SPREAD_SLAB, |
118 | } cpuset_flagbits_t; | 123 | } cpuset_flagbits_t; |
@@ -128,6 +133,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs) | |||
128 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); | 133 | return test_bit(CS_MEM_EXCLUSIVE, &cs->flags); |
129 | } | 134 | } |
130 | 135 | ||
136 | static inline int is_sched_load_balance(const struct cpuset *cs) | ||
137 | { | ||
138 | return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | ||
139 | } | ||
140 | |||
131 | static inline int is_memory_migrate(const struct cpuset *cs) | 141 | static inline int is_memory_migrate(const struct cpuset *cs) |
132 | { | 142 | { |
133 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); | 143 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); |
@@ -482,6 +492,208 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
482 | } | 492 | } |
483 | 493 | ||
484 | /* | 494 | /* |
495 | * Helper routine for rebuild_sched_domains(). | ||
496 | * Do cpusets a, b have overlapping cpus_allowed masks? | ||
497 | */ | ||
498 | |||
499 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | ||
500 | { | ||
501 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * rebuild_sched_domains() | ||
506 | * | ||
507 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
508 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
509 | * which has that flag enabled, or if any cpuset with a non-empty | ||
510 | * 'cpus' is removed, then call this routine to rebuild the | ||
511 | * scheduler's dynamic sched domains. | ||
512 | * | ||
513 | * This routine builds a partial partition of the system's CPUs | ||
514 | * (the set of non-overlapping cpumask_t's in the array 'doms' | ||
515 | * below), and passes that partial partition to the kernel/sched.c | ||
516 | * partition_sched_domains() routine, which will rebuild the | ||
517 | * scheduler's load balancing domains (sched domains) as specified | ||
518 | * by that partial partition. A 'partial partition' is a set of | ||
519 | * non-overlapping subsets whose union is a subset of that set. | ||
520 | * | ||
521 | * See "What is sched_load_balance" in Documentation/cpusets.txt | ||
522 | * for a background explanation of this. | ||
523 | * | ||
524 | * Does not return errors, on the theory that the callers of this | ||
525 | * routine would rather not worry about failures to rebuild sched | ||
526 | * domains when operating in the severe memory shortage situations | ||
527 | * that could cause allocation failures below. | ||
528 | * | ||
529 | * Call with cgroup_mutex held. May take callback_mutex during | ||
530 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
531 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | ||
532 | * Must not be called holding callback_mutex, because we must not | ||
533 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | ||
534 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | ||
535 | * So the reverse nesting would risk an ABBA deadlock. | ||
536 | * | ||
537 | * The three key local variables below are: | ||
538 | * q - a kfifo queue of cpuset pointers, used to implement a | ||
539 | * top-down scan of all cpusets. This scan loads a pointer | ||
540 | * to each cpuset marked is_sched_load_balance into the | ||
541 | * array 'csa'. For our purposes, rebuilding the scheduler's | ||
542 | * sched domains, we can ignore !is_sched_load_balance cpusets. | ||
543 | * csa - (for CpuSet Array) Array of pointers to all the cpusets | ||
544 | * that need to be load balanced, for convenient iterative | ||
545 | * access by the subsequent code that finds the best partition, | ||
546 | * i.e. the set of domains (subsets) of CPUs such that the | ||
547 | * cpus_allowed of every cpuset marked is_sched_load_balance | ||
548 | * is a subset of one of these domains, while there are as | ||
549 | * many such domains as possible, each as small as possible. | ||
550 | * doms - Conversion of 'csa' to an array of cpumasks, for passing to | ||
551 | * the kernel/sched.c routine partition_sched_domains() in a | ||
552 | * convenient format, that can be easily compared to the prior | ||
553 | * value to determine what partition elements (sched domains) | ||
554 | * were changed (added or removed.) | ||
555 | * | ||
556 | * Finding the best partition (set of domains): | ||
557 | * The triple nested loops below over i, j, k scan over the | ||
558 | * load balanced cpusets (using the array of cpuset pointers in | ||
559 | * csa[]) looking for pairs of cpusets that have overlapping | ||
560 | * cpus_allowed, but which don't have the same 'pn' partition | ||
561 | * number and puts them in the same partition number. It keeps | ||
562 | * looping on the 'restart' label until it can no longer find | ||
563 | * any such pairs. | ||
564 | * | ||
565 | * The union of the cpus_allowed masks from the set of | ||
566 | * all cpusets having the same 'pn' value then form the one | ||
567 | * element of the partition (one sched domain) to be passed to | ||
568 | * partition_sched_domains(). | ||
569 | */ | ||
570 | |||
571 | static void rebuild_sched_domains(void) | ||
572 | { | ||
573 | struct kfifo *q; /* queue of cpusets to be scanned */ | ||
574 | struct cpuset *cp; /* scans q */ | ||
575 | struct cpuset **csa; /* array of all cpuset ptrs */ | ||
576 | int csn; /* how many cpuset ptrs in csa so far */ | ||
577 | int i, j, k; /* indices for partition finding loops */ | ||
578 | cpumask_t *doms; /* resulting partition; i.e. sched domains */ | ||
579 | int ndoms; /* number of sched domains in result */ | ||
580 | int nslot; /* next empty doms[] cpumask_t slot */ | ||
581 | |||
582 | q = NULL; | ||
583 | csa = NULL; | ||
584 | doms = NULL; | ||
585 | |||
586 | /* Special case for the 99% of systems with one, full, sched domain */ | ||
587 | if (is_sched_load_balance(&top_cpuset)) { | ||
588 | ndoms = 1; | ||
589 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | ||
590 | if (!doms) | ||
591 | goto rebuild; | ||
592 | *doms = top_cpuset.cpus_allowed; | ||
593 | goto rebuild; | ||
594 | } | ||
595 | |||
596 | q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); | ||
597 | if (IS_ERR(q)) | ||
598 | goto done; | ||
599 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | ||
600 | if (!csa) | ||
601 | goto done; | ||
602 | csn = 0; | ||
603 | |||
604 | cp = &top_cpuset; | ||
605 | __kfifo_put(q, (void *)&cp, sizeof(cp)); | ||
606 | while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { | ||
607 | struct cgroup *cont; | ||
608 | struct cpuset *child; /* scans child cpusets of cp */ | ||
609 | if (is_sched_load_balance(cp)) | ||
610 | csa[csn++] = cp; | ||
611 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
612 | child = cgroup_cs(cont); | ||
613 | __kfifo_put(q, (void *)&child, sizeof(cp)); | ||
614 | } | ||
615 | } | ||
616 | |||
617 | for (i = 0; i < csn; i++) | ||
618 | csa[i]->pn = i; | ||
619 | ndoms = csn; | ||
620 | |||
621 | restart: | ||
622 | /* Find the best partition (set of sched domains) */ | ||
623 | for (i = 0; i < csn; i++) { | ||
624 | struct cpuset *a = csa[i]; | ||
625 | int apn = a->pn; | ||
626 | |||
627 | for (j = 0; j < csn; j++) { | ||
628 | struct cpuset *b = csa[j]; | ||
629 | int bpn = b->pn; | ||
630 | |||
631 | if (apn != bpn && cpusets_overlap(a, b)) { | ||
632 | for (k = 0; k < csn; k++) { | ||
633 | struct cpuset *c = csa[k]; | ||
634 | |||
635 | if (c->pn == bpn) | ||
636 | c->pn = apn; | ||
637 | } | ||
638 | ndoms--; /* one less element */ | ||
639 | goto restart; | ||
640 | } | ||
641 | } | ||
642 | } | ||
643 | |||
644 | /* Convert <csn, csa> to <ndoms, doms> */ | ||
645 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | ||
646 | if (!doms) | ||
647 | goto rebuild; | ||
648 | |||
649 | for (nslot = 0, i = 0; i < csn; i++) { | ||
650 | struct cpuset *a = csa[i]; | ||
651 | int apn = a->pn; | ||
652 | |||
653 | if (apn >= 0) { | ||
654 | cpumask_t *dp = doms + nslot; | ||
655 | |||
656 | if (nslot == ndoms) { | ||
657 | static int warnings = 10; | ||
658 | if (warnings) { | ||
659 | printk(KERN_WARNING | ||
660 | "rebuild_sched_domains confused:" | ||
661 | " nslot %d, ndoms %d, csn %d, i %d," | ||
662 | " apn %d\n", | ||
663 | nslot, ndoms, csn, i, apn); | ||
664 | warnings--; | ||
665 | } | ||
666 | continue; | ||
667 | } | ||
668 | |||
669 | cpus_clear(*dp); | ||
670 | for (j = i; j < csn; j++) { | ||
671 | struct cpuset *b = csa[j]; | ||
672 | |||
673 | if (apn == b->pn) { | ||
674 | cpus_or(*dp, *dp, b->cpus_allowed); | ||
675 | b->pn = -1; | ||
676 | } | ||
677 | } | ||
678 | nslot++; | ||
679 | } | ||
680 | } | ||
681 | BUG_ON(nslot != ndoms); | ||
682 | |||
683 | rebuild: | ||
684 | /* Have scheduler rebuild sched domains */ | ||
685 | lock_cpu_hotplug(); | ||
686 | partition_sched_domains(ndoms, doms); | ||
687 | unlock_cpu_hotplug(); | ||
688 | |||
689 | done: | ||
690 | if (q && !IS_ERR(q)) | ||
691 | kfifo_free(q); | ||
692 | kfree(csa); | ||
693 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | ||
694 | } | ||
695 | |||
696 | /* | ||
485 | * Call with manage_mutex held. May take callback_mutex during call. | 697 | * Call with manage_mutex held. May take callback_mutex during call. |
486 | */ | 698 | */ |
487 | 699 | ||
@@ -489,6 +701,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
489 | { | 701 | { |
490 | struct cpuset trialcs; | 702 | struct cpuset trialcs; |
491 | int retval; | 703 | int retval; |
704 | int cpus_changed, is_load_balanced; | ||
492 | 705 | ||
493 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ | 706 | /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ |
494 | if (cs == &top_cpuset) | 707 | if (cs == &top_cpuset) |
@@ -516,9 +729,17 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
516 | retval = validate_change(cs, &trialcs); | 729 | retval = validate_change(cs, &trialcs); |
517 | if (retval < 0) | 730 | if (retval < 0) |
518 | return retval; | 731 | return retval; |
732 | |||
733 | cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed); | ||
734 | is_load_balanced = is_sched_load_balance(&trialcs); | ||
735 | |||
519 | mutex_lock(&callback_mutex); | 736 | mutex_lock(&callback_mutex); |
520 | cs->cpus_allowed = trialcs.cpus_allowed; | 737 | cs->cpus_allowed = trialcs.cpus_allowed; |
521 | mutex_unlock(&callback_mutex); | 738 | mutex_unlock(&callback_mutex); |
739 | |||
740 | if (cpus_changed && is_load_balanced) | ||
741 | rebuild_sched_domains(); | ||
742 | |||
522 | return 0; | 743 | return 0; |
523 | } | 744 | } |
524 | 745 | ||
@@ -752,6 +973,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
752 | /* | 973 | /* |
753 | * update_flag - read a 0 or a 1 in a file and update associated flag | 974 | * update_flag - read a 0 or a 1 in a file and update associated flag |
754 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 975 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
976 | * CS_SCHED_LOAD_BALANCE, | ||
755 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, | 977 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, |
756 | * CS_SPREAD_PAGE, CS_SPREAD_SLAB) | 978 | * CS_SPREAD_PAGE, CS_SPREAD_SLAB) |
757 | * cs: the cpuset to update | 979 | * cs: the cpuset to update |
@@ -765,6 +987,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
765 | int turning_on; | 987 | int turning_on; |
766 | struct cpuset trialcs; | 988 | struct cpuset trialcs; |
767 | int err; | 989 | int err; |
990 | int cpus_nonempty, balance_flag_changed; | ||
768 | 991 | ||
769 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); | 992 | turning_on = (simple_strtoul(buf, NULL, 10) != 0); |
770 | 993 | ||
@@ -777,10 +1000,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
777 | err = validate_change(cs, &trialcs); | 1000 | err = validate_change(cs, &trialcs); |
778 | if (err < 0) | 1001 | if (err < 0) |
779 | return err; | 1002 | return err; |
1003 | |||
1004 | cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); | ||
1005 | balance_flag_changed = (is_sched_load_balance(cs) != | ||
1006 | is_sched_load_balance(&trialcs)); | ||
1007 | |||
780 | mutex_lock(&callback_mutex); | 1008 | mutex_lock(&callback_mutex); |
781 | cs->flags = trialcs.flags; | 1009 | cs->flags = trialcs.flags; |
782 | mutex_unlock(&callback_mutex); | 1010 | mutex_unlock(&callback_mutex); |
783 | 1011 | ||
1012 | if (cpus_nonempty && balance_flag_changed) | ||
1013 | rebuild_sched_domains(); | ||
1014 | |||
784 | return 0; | 1015 | return 0; |
785 | } | 1016 | } |
786 | 1017 | ||
@@ -928,6 +1159,7 @@ typedef enum { | |||
928 | FILE_MEMLIST, | 1159 | FILE_MEMLIST, |
929 | FILE_CPU_EXCLUSIVE, | 1160 | FILE_CPU_EXCLUSIVE, |
930 | FILE_MEM_EXCLUSIVE, | 1161 | FILE_MEM_EXCLUSIVE, |
1162 | FILE_SCHED_LOAD_BALANCE, | ||
931 | FILE_MEMORY_PRESSURE_ENABLED, | 1163 | FILE_MEMORY_PRESSURE_ENABLED, |
932 | FILE_MEMORY_PRESSURE, | 1164 | FILE_MEMORY_PRESSURE, |
933 | FILE_SPREAD_PAGE, | 1165 | FILE_SPREAD_PAGE, |
@@ -946,7 +1178,7 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
946 | int retval = 0; | 1178 | int retval = 0; |
947 | 1179 | ||
948 | /* Crude upper limit on largest legitimate cpulist user might write. */ | 1180 | /* Crude upper limit on largest legitimate cpulist user might write. */ |
949 | if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES)) | 1181 | if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) |
950 | return -E2BIG; | 1182 | return -E2BIG; |
951 | 1183 | ||
952 | /* +1 for nul-terminator */ | 1184 | /* +1 for nul-terminator */ |
@@ -979,6 +1211,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont, | |||
979 | case FILE_MEM_EXCLUSIVE: | 1211 | case FILE_MEM_EXCLUSIVE: |
980 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); | 1212 | retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer); |
981 | break; | 1213 | break; |
1214 | case FILE_SCHED_LOAD_BALANCE: | ||
1215 | retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer); | ||
1216 | break; | ||
982 | case FILE_MEMORY_MIGRATE: | 1217 | case FILE_MEMORY_MIGRATE: |
983 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1218 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); |
984 | break; | 1219 | break; |
@@ -1074,6 +1309,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont, | |||
1074 | case FILE_MEM_EXCLUSIVE: | 1309 | case FILE_MEM_EXCLUSIVE: |
1075 | *s++ = is_mem_exclusive(cs) ? '1' : '0'; | 1310 | *s++ = is_mem_exclusive(cs) ? '1' : '0'; |
1076 | break; | 1311 | break; |
1312 | case FILE_SCHED_LOAD_BALANCE: | ||
1313 | *s++ = is_sched_load_balance(cs) ? '1' : '0'; | ||
1314 | break; | ||
1077 | case FILE_MEMORY_MIGRATE: | 1315 | case FILE_MEMORY_MIGRATE: |
1078 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | 1316 | *s++ = is_memory_migrate(cs) ? '1' : '0'; |
1079 | break; | 1317 | break; |
@@ -1137,6 +1375,13 @@ static struct cftype cft_mem_exclusive = { | |||
1137 | .private = FILE_MEM_EXCLUSIVE, | 1375 | .private = FILE_MEM_EXCLUSIVE, |
1138 | }; | 1376 | }; |
1139 | 1377 | ||
1378 | static struct cftype cft_sched_load_balance = { | ||
1379 | .name = "sched_load_balance", | ||
1380 | .read = cpuset_common_file_read, | ||
1381 | .write = cpuset_common_file_write, | ||
1382 | .private = FILE_SCHED_LOAD_BALANCE, | ||
1383 | }; | ||
1384 | |||
1140 | static struct cftype cft_memory_migrate = { | 1385 | static struct cftype cft_memory_migrate = { |
1141 | .name = "memory_migrate", | 1386 | .name = "memory_migrate", |
1142 | .read = cpuset_common_file_read, | 1387 | .read = cpuset_common_file_read, |
@@ -1186,6 +1431,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1186 | return err; | 1431 | return err; |
1187 | if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) | 1432 | if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0) |
1188 | return err; | 1433 | return err; |
1434 | if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0) | ||
1435 | return err; | ||
1189 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) | 1436 | if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0) |
1190 | return err; | 1437 | return err; |
1191 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) | 1438 | if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0) |
@@ -1267,6 +1514,7 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1267 | set_bit(CS_SPREAD_PAGE, &cs->flags); | 1514 | set_bit(CS_SPREAD_PAGE, &cs->flags); |
1268 | if (is_spread_slab(parent)) | 1515 | if (is_spread_slab(parent)) |
1269 | set_bit(CS_SPREAD_SLAB, &cs->flags); | 1516 | set_bit(CS_SPREAD_SLAB, &cs->flags); |
1517 | set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags); | ||
1270 | cs->cpus_allowed = CPU_MASK_NONE; | 1518 | cs->cpus_allowed = CPU_MASK_NONE; |
1271 | cs->mems_allowed = NODE_MASK_NONE; | 1519 | cs->mems_allowed = NODE_MASK_NONE; |
1272 | cs->mems_generation = cpuset_mems_generation++; | 1520 | cs->mems_generation = cpuset_mems_generation++; |
@@ -1277,11 +1525,27 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1277 | return &cs->css ; | 1525 | return &cs->css ; |
1278 | } | 1526 | } |
1279 | 1527 | ||
1528 | /* | ||
1529 | * Locking note on the strange update_flag() call below: | ||
1530 | * | ||
1531 | * If the cpuset being removed has its flag 'sched_load_balance' | ||
1532 | * enabled, then simulate turning sched_load_balance off, which | ||
1533 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | ||
1534 | * call in rebuild_sched_domains() must not be made while holding | ||
1535 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1536 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | ||
1537 | * ABBA deadlock. | ||
1538 | */ | ||
1539 | |||
1280 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1540 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
1281 | { | 1541 | { |
1282 | struct cpuset *cs = cgroup_cs(cont); | 1542 | struct cpuset *cs = cgroup_cs(cont); |
1283 | 1543 | ||
1284 | cpuset_update_task_memory_state(); | 1544 | cpuset_update_task_memory_state(); |
1545 | |||
1546 | if (is_sched_load_balance(cs)) | ||
1547 | update_flag(CS_SCHED_LOAD_BALANCE, cs, "0"); | ||
1548 | |||
1285 | number_of_cpusets--; | 1549 | number_of_cpusets--; |
1286 | kfree(cs); | 1550 | kfree(cs); |
1287 | } | 1551 | } |
@@ -1326,6 +1590,7 @@ int __init cpuset_init(void) | |||
1326 | 1590 | ||
1327 | fmeter_init(&top_cpuset.fmeter); | 1591 | fmeter_init(&top_cpuset.fmeter); |
1328 | top_cpuset.mems_generation = cpuset_mems_generation++; | 1592 | top_cpuset.mems_generation = cpuset_mems_generation++; |
1593 | set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags); | ||
1329 | 1594 | ||
1330 | err = register_filesystem(&cpuset_fs_type); | 1595 | err = register_filesystem(&cpuset_fs_type); |
1331 | if (err < 0) | 1596 | if (err < 0) |
@@ -1412,8 +1677,8 @@ static void common_cpu_mem_hotplug_unplug(void) | |||
1412 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1677 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1413 | */ | 1678 | */ |
1414 | 1679 | ||
1415 | static int cpuset_handle_cpuhp(struct notifier_block *nb, | 1680 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, |
1416 | unsigned long phase, void *cpu) | 1681 | unsigned long phase, void *unused_cpu) |
1417 | { | 1682 | { |
1418 | if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) | 1683 | if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) |
1419 | return NOTIFY_DONE; | 1684 | return NOTIFY_DONE; |
@@ -1803,7 +2068,7 @@ void __cpuset_memory_pressure_bump(void) | |||
1803 | * the_top_cpuset_hack in cpuset_exit(), which sets an exiting task's | 2068 | * the_top_cpuset_hack in cpuset_exit(), which sets an exiting task's |
1804 | * cpuset to top_cpuset. | 2069 | * cpuset to top_cpuset. |
1805 | */ | 2070 | */ |
1806 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2071 | static int proc_cpuset_show(struct seq_file *m, void *unused_v) |
1807 | { | 2072 | { |
1808 | struct pid *pid; | 2073 | struct pid *pid; |
1809 | struct task_struct *tsk; | 2074 | struct task_struct *tsk; |
diff --git a/kernel/sched.c b/kernel/sched.c index 5d5e107ebc4..39d6354af48 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -6376,26 +6376,31 @@ error: | |||
6376 | return -ENOMEM; | 6376 | return -ENOMEM; |
6377 | #endif | 6377 | #endif |
6378 | } | 6378 | } |
6379 | |||
6380 | static cpumask_t *doms_cur; /* current sched domains */ | ||
6381 | static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | ||
6382 | |||
6383 | /* | ||
6384 | * Special case: If a kmalloc of a doms_cur partition (array of | ||
6385 | * cpumask_t) fails, then fall back to a single sched domain, | ||
6386 | * as determined by the single cpumask_t fallback_doms. | ||
6387 | */ | ||
6388 | static cpumask_t fallback_doms; | ||
6389 | |||
6379 | /* | 6390 | /* |
6380 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6391 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
6392 | * For now this just excludes isolated cpus, but could be used to | ||
6393 | * exclude other special cases in the future. | ||
6381 | */ | 6394 | */ |
6382 | static int arch_init_sched_domains(const cpumask_t *cpu_map) | 6395 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
6383 | { | 6396 | { |
6384 | cpumask_t cpu_default_map; | 6397 | ndoms_cur = 1; |
6385 | int err; | 6398 | doms_cur = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
6386 | 6399 | if (!doms_cur) | |
6387 | /* | 6400 | doms_cur = &fallback_doms; |
6388 | * Setup mask for cpus without special case scheduling requirements. | 6401 | cpus_andnot(*doms_cur, *cpu_map, cpu_isolated_map); |
6389 | * For now this just excludes isolated cpus, but could be used to | ||
6390 | * exclude other special cases in the future. | ||
6391 | */ | ||
6392 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | ||
6393 | |||
6394 | err = build_sched_domains(&cpu_default_map); | ||
6395 | |||
6396 | register_sched_domain_sysctl(); | 6402 | register_sched_domain_sysctl(); |
6397 | 6403 | return build_sched_domains(doms_cur); | |
6398 | return err; | ||
6399 | } | 6404 | } |
6400 | 6405 | ||
6401 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6406 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
@@ -6419,6 +6424,68 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6419 | arch_destroy_sched_domains(cpu_map); | 6424 | arch_destroy_sched_domains(cpu_map); |
6420 | } | 6425 | } |
6421 | 6426 | ||
6427 | /* | ||
6428 | * Partition sched domains as specified by the 'ndoms_new' | ||
6429 | * cpumasks in the array doms_new[] of cpumasks. This compares | ||
6430 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | ||
6431 | * It destroys each deleted domain and builds each new domain. | ||
6432 | * | ||
6433 | * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. | ||
6434 | * The masks don't intersect (don't overlap.) We should set up one | ||
6435 | * sched domain for each mask. CPUs not in any of the cpumasks will | ||
6436 | * not be load balanced. If the same cpumask appears both in the | ||
6437 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | ||
6438 | * it as it is. | ||
6439 | * | ||
6440 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | ||
6441 | * ownership of it and will kfree it when done with it. If the caller | ||
6442 | * failed the kmalloc call, then it can pass in doms_new == NULL, | ||
6443 | * and partition_sched_domains() will fall back to the single partition | ||
6444 | * 'fallback_doms'. | ||
6445 | * | ||
6446 | * Call with hotplug lock held | ||
6447 | */ | ||
6448 | void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | ||
6449 | { | ||
6450 | int i, j; | ||
6451 | |||
6452 | if (doms_new == NULL) { | ||
6453 | ndoms_new = 1; | ||
6454 | doms_new = &fallback_doms; | ||
6455 | cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); | ||
6456 | } | ||
6457 | |||
6458 | /* Destroy deleted domains */ | ||
6459 | for (i = 0; i < ndoms_cur; i++) { | ||
6460 | for (j = 0; j < ndoms_new; j++) { | ||
6461 | if (cpus_equal(doms_cur[i], doms_new[j])) | ||
6462 | goto match1; | ||
6463 | } | ||
6464 | /* no match - a current sched domain not in new doms_new[] */ | ||
6465 | detach_destroy_domains(doms_cur + i); | ||
6466 | match1: | ||
6467 | ; | ||
6468 | } | ||
6469 | |||
6470 | /* Build new domains */ | ||
6471 | for (i = 0; i < ndoms_new; i++) { | ||
6472 | for (j = 0; j < ndoms_cur; j++) { | ||
6473 | if (cpus_equal(doms_new[i], doms_cur[j])) | ||
6474 | goto match2; | ||
6475 | } | ||
6476 | /* no match - add a new doms_new */ | ||
6477 | build_sched_domains(doms_new + i); | ||
6478 | match2: | ||
6479 | ; | ||
6480 | } | ||
6481 | |||
6482 | /* Remember the new sched domains */ | ||
6483 | if (doms_cur != &fallback_doms) | ||
6484 | kfree(doms_cur); | ||
6485 | doms_cur = doms_new; | ||
6486 | ndoms_cur = ndoms_new; | ||
6487 | } | ||
6488 | |||
6422 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6489 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
6423 | static int arch_reinit_sched_domains(void) | 6490 | static int arch_reinit_sched_domains(void) |
6424 | { | 6491 | { |