author      Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
committer   Ingo Molnar <mingo@elte.hu>    2008-10-28 11:26:12 -0400
commit      7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree        e730a4565e0318140d2fbd2f0415d18a339d7336 /kernel/cpuset.c
parent      41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent      0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--   kernel/cpuset.c   702
1 file changed, 380 insertions(+), 322 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3e00526f52ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@ | |||
14 | * 2003-10-22 Updates by Stephen Hemminger. | 14 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson. | 15 | * 2004 May-July Rework by Paul Jackson. |
16 | * 2006 Rework by Paul Menage to use generic cgroups | 16 | * 2006 Rework by Paul Menage to use generic cgroups |
17 | * 2008 Rework of the scheduler domains and CPU hotplug handling | ||
18 | * by Max Krasnyansky | ||
17 | * | 19 | * |
18 | * This file is subject to the terms and conditions of the GNU General Public | 20 | * This file is subject to the terms and conditions of the GNU General Public |
19 | * License. See the file COPYING in the main directory of the Linux | 21 | * License. See the file COPYING in the main directory of the Linux |
@@ -54,7 +56,6 @@ | |||
54 | #include <asm/uaccess.h> | 56 | #include <asm/uaccess.h> |
55 | #include <asm/atomic.h> | 57 | #include <asm/atomic.h> |
56 | #include <linux/mutex.h> | 58 | #include <linux/mutex.h> |
57 | #include <linux/kfifo.h> | ||
58 | #include <linux/workqueue.h> | 59 | #include <linux/workqueue.h> |
59 | #include <linux/cgroup.h> | 60 | #include <linux/cgroup.h> |
60 | 61 | ||
@@ -227,10 +228,6 @@ static struct cpuset top_cpuset = { | |||
227 | * The task_struct fields mems_allowed and mems_generation may only | 228 | * The task_struct fields mems_allowed and mems_generation may only |
228 | * be accessed in the context of that task, so require no locks. | 229 | * be accessed in the context of that task, so require no locks. |
229 | * | 230 | * |
230 | * The cpuset_common_file_write handler for operations that modify | ||
231 | * the cpuset hierarchy holds cgroup_mutex across the entire operation, | ||
232 | * single threading all such cpuset modifications across the system. | ||
233 | * | ||
234 | * The cpuset_common_file_read() handlers only hold callback_mutex across | 231 | * The cpuset_common_file_read() handlers only hold callback_mutex across |
235 | * small pieces of code, such as when reading out possibly multi-word | 232 | * small pieces of code, such as when reading out possibly multi-word |
236 | * cpumasks and nodemasks. | 233 | * cpumasks and nodemasks. |
@@ -241,9 +238,11 @@ static struct cpuset top_cpuset = { | |||
241 | 238 | ||
242 | static DEFINE_MUTEX(callback_mutex); | 239 | static DEFINE_MUTEX(callback_mutex); |
243 | 240 | ||
244 | /* This is ugly, but preserves the userspace API for existing cpuset | 241 | /* |
242 | * This is ugly, but preserves the userspace API for existing cpuset | ||
245 | * users. If someone tries to mount the "cpuset" filesystem, we | 243 | * users. If someone tries to mount the "cpuset" filesystem, we |
246 | * silently switch it to mount "cgroup" instead */ | 244 | * silently switch it to mount "cgroup" instead |
245 | */ | ||
247 | static int cpuset_get_sb(struct file_system_type *fs_type, | 246 | static int cpuset_get_sb(struct file_system_type *fs_type, |
248 | int flags, const char *unused_dev_name, | 247 | int flags, const char *unused_dev_name, |
249 | void *data, struct vfsmount *mnt) | 248 | void *data, struct vfsmount *mnt) |
@@ -369,7 +368,7 @@ void cpuset_update_task_memory_state(void) | |||
369 | my_cpusets_mem_gen = top_cpuset.mems_generation; | 368 | my_cpusets_mem_gen = top_cpuset.mems_generation; |
370 | } else { | 369 | } else { |
371 | rcu_read_lock(); | 370 | rcu_read_lock(); |
372 | my_cpusets_mem_gen = task_cs(current)->mems_generation; | 371 | my_cpusets_mem_gen = task_cs(tsk)->mems_generation; |
373 | rcu_read_unlock(); | 372 | rcu_read_unlock(); |
374 | } | 373 | } |
375 | 374 | ||
@@ -478,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) | |||
478 | } | 477 | } |
479 | 478 | ||
480 | /* | 479 | /* |
481 | * Helper routine for rebuild_sched_domains(). | 480 | * Helper routine for generate_sched_domains(). |
482 | * Do cpusets a, b have overlapping cpus_allowed masks? | 481 | * Do cpusets a, b have overlapping cpus_allowed masks? |
483 | */ | 482 | */ |
484 | |||
485 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | 483 | static int cpusets_overlap(struct cpuset *a, struct cpuset *b) |
486 | { | 484 | { |
487 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); | 485 | return cpus_intersects(a->cpus_allowed, b->cpus_allowed); |
@@ -490,29 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
490 | static void | 488 | static void |
491 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | 489 | update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) |
492 | { | 490 | { |
493 | if (!dattr) | ||
494 | return; | ||
495 | if (dattr->relax_domain_level < c->relax_domain_level) | 491 | if (dattr->relax_domain_level < c->relax_domain_level) |
496 | dattr->relax_domain_level = c->relax_domain_level; | 492 | dattr->relax_domain_level = c->relax_domain_level; |
497 | return; | 493 | return; |
498 | } | 494 | } |
499 | 495 | ||
496 | static void | ||
497 | update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) | ||
498 | { | ||
499 | LIST_HEAD(q); | ||
500 | |||
501 | list_add(&c->stack_list, &q); | ||
502 | while (!list_empty(&q)) { | ||
503 | struct cpuset *cp; | ||
504 | struct cgroup *cont; | ||
505 | struct cpuset *child; | ||
506 | |||
507 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
508 | list_del(q.next); | ||
509 | |||
510 | if (cpus_empty(cp->cpus_allowed)) | ||
511 | continue; | ||
512 | |||
513 | if (is_sched_load_balance(cp)) | ||
514 | update_domain_attr(dattr, cp); | ||
515 | |||
516 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | ||
517 | child = cgroup_cs(cont); | ||
518 | list_add_tail(&child->stack_list, &q); | ||
519 | } | ||
520 | } | ||
521 | } | ||
522 | |||
500 | /* | 523 | /* |
501 | * rebuild_sched_domains() | 524 | * generate_sched_domains() |
502 | * | 525 | * |
503 | * If the flag 'sched_load_balance' of any cpuset with non-empty | 526 | * This function builds a partial partition of the systems CPUs |
504 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | 527 | * A 'partial partition' is a set of non-overlapping subsets whose |
505 | * which has that flag enabled, or if any cpuset with a non-empty | 528 | * union is a subset of that set. |
506 | * 'cpus' is removed, then call this routine to rebuild the | 529 | * The output of this function needs to be passed to kernel/sched.c |
507 | * scheduler's dynamic sched domains. | 530 | * partition_sched_domains() routine, which will rebuild the scheduler's |
508 | * | 531 | * load balancing domains (sched domains) as specified by that partial |
509 | * This routine builds a partial partition of the systems CPUs | 532 | * partition. |
510 | * (the set of non-overlappping cpumask_t's in the array 'part' | ||
511 | * below), and passes that partial partition to the kernel/sched.c | ||
512 | * partition_sched_domains() routine, which will rebuild the | ||
513 | * schedulers load balancing domains (sched domains) as specified | ||
514 | * by that partial partition. A 'partial partition' is a set of | ||
515 | * non-overlapping subsets whose union is a subset of that set. | ||
516 | * | 533 | * |
517 | * See "What is sched_load_balance" in Documentation/cpusets.txt | 534 | * See "What is sched_load_balance" in Documentation/cpusets.txt |
518 | * for a background explanation of this. | 535 | * for a background explanation of this. |
@@ -522,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
522 | * domains when operating in the severe memory shortage situations | 539 | * domains when operating in the severe memory shortage situations |
523 | * that could cause allocation failures below. | 540 | * that could cause allocation failures below. |
524 | * | 541 | * |
525 | * Call with cgroup_mutex held. May take callback_mutex during | 542 | * Must be called with cgroup_lock held. |
526 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | ||
527 | * a call to the get_online_cpus()/put_online_cpus() pair. | ||
528 | * Must not be called holding callback_mutex, because we must not | ||
529 | * call get_online_cpus() while holding callback_mutex. Elsewhere | ||
530 | * the kernel nests callback_mutex inside get_online_cpus() calls. | ||
531 | * So the reverse nesting would risk an ABBA deadlock. | ||
532 | * | 543 | * |
533 | * The three key local variables below are: | 544 | * The three key local variables below are: |
534 | * q - a kfifo queue of cpuset pointers, used to implement a | 545 | * q - a linked-list queue of cpuset pointers, used to implement a |
535 | * top-down scan of all cpusets. This scan loads a pointer | 546 | * top-down scan of all cpusets. This scan loads a pointer |
536 | * to each cpuset marked is_sched_load_balance into the | 547 | * to each cpuset marked is_sched_load_balance into the |
537 | * array 'csa'. For our purposes, rebuilding the schedulers | 548 | * array 'csa'. For our purposes, rebuilding the schedulers |
@@ -563,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) | |||
563 | * element of the partition (one sched domain) to be passed to | 574 | * element of the partition (one sched domain) to be passed to |
564 | * partition_sched_domains(). | 575 | * partition_sched_domains(). |
565 | */ | 576 | */ |
566 | 577 | static int generate_sched_domains(cpumask_t **domains, | |
567 | static void rebuild_sched_domains(void) | 578 | struct sched_domain_attr **attributes) |
568 | { | 579 | { |
569 | struct kfifo *q; /* queue of cpusets to be scanned */ | 580 | LIST_HEAD(q); /* queue of cpusets to be scanned */ |
570 | struct cpuset *cp; /* scans q */ | 581 | struct cpuset *cp; /* scans q */ |
571 | struct cpuset **csa; /* array of all cpuset ptrs */ | 582 | struct cpuset **csa; /* array of all cpuset ptrs */ |
572 | int csn; /* how many cpuset ptrs in csa so far */ | 583 | int csn; /* how many cpuset ptrs in csa so far */ |
@@ -576,44 +587,58 @@ static void rebuild_sched_domains(void) | |||
576 | int ndoms; /* number of sched domains in result */ | 587 | int ndoms; /* number of sched domains in result */ |
577 | int nslot; /* next empty doms[] cpumask_t slot */ | 588 | int nslot; /* next empty doms[] cpumask_t slot */ |
578 | 589 | ||
579 | q = NULL; | 590 | ndoms = 0; |
580 | csa = NULL; | ||
581 | doms = NULL; | 591 | doms = NULL; |
582 | dattr = NULL; | 592 | dattr = NULL; |
593 | csa = NULL; | ||
583 | 594 | ||
584 | /* Special case for the 99% of systems with one, full, sched domain */ | 595 | /* Special case for the 99% of systems with one, full, sched domain */ |
585 | if (is_sched_load_balance(&top_cpuset)) { | 596 | if (is_sched_load_balance(&top_cpuset)) { |
586 | ndoms = 1; | ||
587 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); | 597 | doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); |
588 | if (!doms) | 598 | if (!doms) |
589 | goto rebuild; | 599 | goto done; |
600 | |||
590 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); | 601 | dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); |
591 | if (dattr) { | 602 | if (dattr) { |
592 | *dattr = SD_ATTR_INIT; | 603 | *dattr = SD_ATTR_INIT; |
593 | update_domain_attr(dattr, &top_cpuset); | 604 | update_domain_attr_tree(dattr, &top_cpuset); |
594 | } | 605 | } |
595 | *doms = top_cpuset.cpus_allowed; | 606 | *doms = top_cpuset.cpus_allowed; |
596 | goto rebuild; | ||
597 | } | ||
598 | 607 | ||
599 | q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); | 608 | ndoms = 1; |
600 | if (IS_ERR(q)) | ||
601 | goto done; | 609 | goto done; |
610 | } | ||
611 | |||
602 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); | 612 | csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); |
603 | if (!csa) | 613 | if (!csa) |
604 | goto done; | 614 | goto done; |
605 | csn = 0; | 615 | csn = 0; |
606 | 616 | ||
607 | cp = &top_cpuset; | 617 | list_add(&top_cpuset.stack_list, &q); |
608 | __kfifo_put(q, (void *)&cp, sizeof(cp)); | 618 | while (!list_empty(&q)) { |
609 | while (__kfifo_get(q, (void *)&cp, sizeof(cp))) { | ||
610 | struct cgroup *cont; | 619 | struct cgroup *cont; |
611 | struct cpuset *child; /* scans child cpusets of cp */ | 620 | struct cpuset *child; /* scans child cpusets of cp */ |
612 | if (is_sched_load_balance(cp)) | 621 | |
622 | cp = list_first_entry(&q, struct cpuset, stack_list); | ||
623 | list_del(q.next); | ||
624 | |||
625 | if (cpus_empty(cp->cpus_allowed)) | ||
626 | continue; | ||
627 | |||
628 | /* | ||
629 | * All child cpusets contain a subset of the parent's cpus, so | ||
630 | * just skip them, and then we call update_domain_attr_tree() | ||
631 | * to calc relax_domain_level of the corresponding sched | ||
632 | * domain. | ||
633 | */ | ||
634 | if (is_sched_load_balance(cp)) { | ||
613 | csa[csn++] = cp; | 635 | csa[csn++] = cp; |
636 | continue; | ||
637 | } | ||
638 | |||
614 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 639 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
615 | child = cgroup_cs(cont); | 640 | child = cgroup_cs(cont); |
616 | __kfifo_put(q, (void *)&child, sizeof(cp)); | 641 | list_add_tail(&child->stack_list, &q); |
617 | } | 642 | } |
618 | } | 643 | } |
619 | 644 | ||
@@ -644,91 +669,141 @@ restart: | |||
644 | } | 669 | } |
645 | } | 670 | } |
646 | 671 | ||
647 | /* Convert <csn, csa> to <ndoms, doms> */ | 672 | /* |
673 | * Now we know how many domains to create. | ||
674 | * Convert <csn, csa> to <ndoms, doms> and populate cpu masks. | ||
675 | */ | ||
648 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); | 676 | doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); |
649 | if (!doms) | 677 | if (!doms) { |
650 | goto rebuild; | 678 | ndoms = 0; |
679 | goto done; | ||
680 | } | ||
681 | |||
682 | /* | ||
683 | * The rest of the code, including the scheduler, can deal with | ||
684 | * dattr==NULL case. No need to abort if alloc fails. | ||
685 | */ | ||
651 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); | 686 | dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); |
652 | 687 | ||
653 | for (nslot = 0, i = 0; i < csn; i++) { | 688 | for (nslot = 0, i = 0; i < csn; i++) { |
654 | struct cpuset *a = csa[i]; | 689 | struct cpuset *a = csa[i]; |
690 | cpumask_t *dp; | ||
655 | int apn = a->pn; | 691 | int apn = a->pn; |
656 | 692 | ||
657 | if (apn >= 0) { | 693 | if (apn < 0) { |
658 | cpumask_t *dp = doms + nslot; | 694 | /* Skip completed partitions */ |
659 | 695 | continue; | |
660 | if (nslot == ndoms) { | 696 | } |
661 | static int warnings = 10; | 697 | |
662 | if (warnings) { | 698 | dp = doms + nslot; |
663 | printk(KERN_WARNING | 699 | |
664 | "rebuild_sched_domains confused:" | 700 | if (nslot == ndoms) { |
665 | " nslot %d, ndoms %d, csn %d, i %d," | 701 | static int warnings = 10; |
666 | " apn %d\n", | 702 | if (warnings) { |
667 | nslot, ndoms, csn, i, apn); | 703 | printk(KERN_WARNING |
668 | warnings--; | 704 | "rebuild_sched_domains confused:" |
669 | } | 705 | " nslot %d, ndoms %d, csn %d, i %d," |
670 | continue; | 706 | " apn %d\n", |
707 | nslot, ndoms, csn, i, apn); | ||
708 | warnings--; | ||
671 | } | 709 | } |
710 | continue; | ||
711 | } | ||
672 | 712 | ||
673 | cpus_clear(*dp); | 713 | cpus_clear(*dp); |
674 | if (dattr) | 714 | if (dattr) |
675 | *(dattr + nslot) = SD_ATTR_INIT; | 715 | *(dattr + nslot) = SD_ATTR_INIT; |
676 | for (j = i; j < csn; j++) { | 716 | for (j = i; j < csn; j++) { |
677 | struct cpuset *b = csa[j]; | 717 | struct cpuset *b = csa[j]; |
678 | 718 | ||
679 | if (apn == b->pn) { | 719 | if (apn == b->pn) { |
680 | cpus_or(*dp, *dp, b->cpus_allowed); | 720 | cpus_or(*dp, *dp, b->cpus_allowed); |
681 | b->pn = -1; | 721 | if (dattr) |
682 | update_domain_attr(dattr, b); | 722 | update_domain_attr_tree(dattr + nslot, b); |
683 | } | 723 | |
724 | /* Done with this partition */ | ||
725 | b->pn = -1; | ||
684 | } | 726 | } |
685 | nslot++; | ||
686 | } | 727 | } |
728 | nslot++; | ||
687 | } | 729 | } |
688 | BUG_ON(nslot != ndoms); | 730 | BUG_ON(nslot != ndoms); |
689 | 731 | ||
690 | rebuild: | ||
691 | /* Have scheduler rebuild sched domains */ | ||
692 | get_online_cpus(); | ||
693 | partition_sched_domains(ndoms, doms, dattr); | ||
694 | put_online_cpus(); | ||
695 | |||
696 | done: | 732 | done: |
697 | if (q && !IS_ERR(q)) | ||
698 | kfifo_free(q); | ||
699 | kfree(csa); | 733 | kfree(csa); |
700 | /* Don't kfree(doms) -- partition_sched_domains() does that. */ | 734 | |
701 | /* Don't kfree(dattr) -- partition_sched_domains() does that. */ | 735 | *domains = doms; |
736 | *attributes = dattr; | ||
737 | return ndoms; | ||
702 | } | 738 | } |
703 | 739 | ||
704 | static inline int started_after_time(struct task_struct *t1, | 740 | /* |
705 | struct timespec *time, | 741 | * Rebuild scheduler domains. |
706 | struct task_struct *t2) | 742 | * |
743 | * Call with neither cgroup_mutex held nor within get_online_cpus(). | ||
744 | * Takes both cgroup_mutex and get_online_cpus(). | ||
745 | * | ||
746 | * Cannot be directly called from cpuset code handling changes | ||
747 | * to the cpuset pseudo-filesystem, because it cannot be called | ||
748 | * from code that already holds cgroup_mutex. | ||
749 | */ | ||
750 | static void do_rebuild_sched_domains(struct work_struct *unused) | ||
707 | { | 751 | { |
708 | int start_diff = timespec_compare(&t1->start_time, time); | 752 | struct sched_domain_attr *attr; |
709 | if (start_diff > 0) { | 753 | cpumask_t *doms; |
710 | return 1; | 754 | int ndoms; |
711 | } else if (start_diff < 0) { | 755 | |
712 | return 0; | 756 | get_online_cpus(); |
713 | } else { | 757 | |
714 | /* | 758 | /* Generate domain masks and attrs */ |
715 | * Arbitrarily, if two processes started at the same | 759 | cgroup_lock(); |
716 | * time, we'll say that the lower pointer value | 760 | ndoms = generate_sched_domains(&doms, &attr); |
717 | * started first. Note that t2 may have exited by now | 761 | cgroup_unlock(); |
718 | * so this may not be a valid pointer any longer, but | 762 | |
719 | * that's fine - it still serves to distinguish | 763 | /* Have scheduler rebuild the domains */ |
720 | * between two tasks started (effectively) | 764 | partition_sched_domains(ndoms, doms, attr); |
721 | * simultaneously. | 765 | |
722 | */ | 766 | put_online_cpus(); |
723 | return t1 > t2; | 767 | } |
724 | } | 768 | |
769 | static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); | ||
770 | |||
771 | /* | ||
772 | * Rebuild scheduler domains, asynchronously via workqueue. | ||
773 | * | ||
774 | * If the flag 'sched_load_balance' of any cpuset with non-empty | ||
775 | * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset | ||
776 | * which has that flag enabled, or if any cpuset with a non-empty | ||
777 | * 'cpus' is removed, then call this routine to rebuild the | ||
778 | * scheduler's dynamic sched domains. | ||
779 | * | ||
780 | * The rebuild_sched_domains() and partition_sched_domains() | ||
781 | * routines must nest cgroup_lock() inside get_online_cpus(), | ||
782 | * but such cpuset changes as these must nest that locking the | ||
783 | * other way, holding cgroup_lock() for much of the code. | ||
784 | * | ||
785 | * So in order to avoid an ABBA deadlock, the cpuset code handling | ||
786 | * these user changes delegates the actual sched domain rebuilding | ||
787 | * to a separate workqueue thread, which ends up processing the | ||
788 | * above do_rebuild_sched_domains() function. | ||
789 | */ | ||
790 | static void async_rebuild_sched_domains(void) | ||
791 | { | ||
792 | schedule_work(&rebuild_sched_domains_work); | ||
725 | } | 793 | } |
726 | 794 | ||
727 | static inline int started_after(void *p1, void *p2) | 795 | /* |
796 | * Accomplishes the same scheduler domain rebuild as the above | ||
797 | * async_rebuild_sched_domains(), however it directly calls the | ||
798 | * rebuild routine synchronously rather than calling it via an | ||
799 | * asynchronous work thread. | ||
800 | * | ||
801 | * This can only be called from code that is not holding | ||
802 | * cgroup_mutex (not nested in a cgroup_lock() call.) | ||
803 | */ | ||
804 | void rebuild_sched_domains(void) | ||
728 | { | 805 | { |
729 | struct task_struct *t1 = p1; | 806 | do_rebuild_sched_domains(NULL); |
730 | struct task_struct *t2 = p2; | ||
731 | return started_after_time(t1, &t2->start_time, t2); | ||
732 | } | 807 | } |
733 | 808 | ||
734 | /** | 809 | /** |
@@ -766,15 +841,38 @@ static void cpuset_change_cpumask(struct task_struct *tsk, | |||
766 | } | 841 | } |
767 | 842 | ||
768 | /** | 843 | /** |
844 | * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset. | ||
845 | * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed | ||
846 | * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks() | ||
847 | * | ||
848 | * Called with cgroup_mutex held | ||
849 | * | ||
850 | * The cgroup_scan_tasks() function will scan all the tasks in a cgroup, | ||
851 | * calling callback functions for each. | ||
852 | * | ||
853 | * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0 | ||
854 | * if @heap != NULL. | ||
855 | */ | ||
856 | static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap) | ||
857 | { | ||
858 | struct cgroup_scanner scan; | ||
859 | |||
860 | scan.cg = cs->css.cgroup; | ||
861 | scan.test_task = cpuset_test_cpumask; | ||
862 | scan.process_task = cpuset_change_cpumask; | ||
863 | scan.heap = heap; | ||
864 | cgroup_scan_tasks(&scan); | ||
865 | } | ||
866 | |||
867 | /** | ||
769 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it | 868 | * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it |
770 | * @cs: the cpuset to consider | 869 | * @cs: the cpuset to consider |
771 | * @buf: buffer of cpu numbers written to this cpuset | 870 | * @buf: buffer of cpu numbers written to this cpuset |
772 | */ | 871 | */ |
773 | static int update_cpumask(struct cpuset *cs, char *buf) | 872 | static int update_cpumask(struct cpuset *cs, const char *buf) |
774 | { | 873 | { |
775 | struct cpuset trialcs; | ||
776 | struct cgroup_scanner scan; | ||
777 | struct ptr_heap heap; | 874 | struct ptr_heap heap; |
875 | struct cpuset trialcs; | ||
778 | int retval; | 876 | int retval; |
779 | int is_load_balanced; | 877 | int is_load_balanced; |
780 | 878 | ||
@@ -790,7 +888,6 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
790 | * that parsing. The validate_change() call ensures that cpusets | 888 | * that parsing. The validate_change() call ensures that cpusets |
791 | * with tasks have cpus. | 889 | * with tasks have cpus. |
792 | */ | 890 | */ |
793 | buf = strstrip(buf); | ||
794 | if (!*buf) { | 891 | if (!*buf) { |
795 | cpus_clear(trialcs.cpus_allowed); | 892 | cpus_clear(trialcs.cpus_allowed); |
796 | } else { | 893 | } else { |
@@ -809,7 +906,7 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
809 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) | 906 | if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) |
810 | return 0; | 907 | return 0; |
811 | 908 | ||
812 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); | 909 | retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL); |
813 | if (retval) | 910 | if (retval) |
814 | return retval; | 911 | return retval; |
815 | 912 | ||
@@ -823,15 +920,12 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
823 | * Scan tasks in the cpuset, and update the cpumasks of any | 920 | * Scan tasks in the cpuset, and update the cpumasks of any |
824 | * that need an update. | 921 | * that need an update. |
825 | */ | 922 | */ |
826 | scan.cg = cs->css.cgroup; | 923 | update_tasks_cpumask(cs, &heap); |
827 | scan.test_task = cpuset_test_cpumask; | 924 | |
828 | scan.process_task = cpuset_change_cpumask; | ||
829 | scan.heap = &heap; | ||
830 | cgroup_scan_tasks(&scan); | ||
831 | heap_free(&heap); | 925 | heap_free(&heap); |
832 | 926 | ||
833 | if (is_load_balanced) | 927 | if (is_load_balanced) |
834 | rebuild_sched_domains(); | 928 | async_rebuild_sched_domains(); |
835 | return 0; | 929 | return 0; |
836 | } | 930 | } |
837 | 931 | ||
@@ -884,74 +978,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, | |||
884 | mutex_unlock(&callback_mutex); | 978 | mutex_unlock(&callback_mutex); |
885 | } | 979 | } |
886 | 980 | ||
887 | /* | ||
888 | * Handle user request to change the 'mems' memory placement | ||
889 | * of a cpuset. Needs to validate the request, update the | ||
890 | * cpusets mems_allowed and mems_generation, and for each | ||
891 | * task in the cpuset, rebind any vma mempolicies and if | ||
892 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
893 | * pages to the new memory. | ||
894 | * | ||
895 | * Call with cgroup_mutex held. May take callback_mutex during call. | ||
896 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
897 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
898 | * their mempolicies to the cpusets new mems_allowed. | ||
899 | */ | ||
900 | |||
901 | static void *cpuset_being_rebound; | 981 | static void *cpuset_being_rebound; |
902 | 982 | ||
903 | static int update_nodemask(struct cpuset *cs, char *buf) | 983 | /** |
984 | * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset. | ||
985 | * @cs: the cpuset in which each task's mems_allowed mask needs to be changed | ||
986 | * @oldmem: old mems_allowed of cpuset cs | ||
987 | * | ||
988 | * Called with cgroup_mutex held | ||
989 | * Return 0 if successful, -errno if not. | ||
990 | */ | ||
991 | static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem) | ||
904 | { | 992 | { |
905 | struct cpuset trialcs; | ||
906 | nodemask_t oldmem; | ||
907 | struct task_struct *p; | 993 | struct task_struct *p; |
908 | struct mm_struct **mmarray; | 994 | struct mm_struct **mmarray; |
909 | int i, n, ntasks; | 995 | int i, n, ntasks; |
910 | int migrate; | 996 | int migrate; |
911 | int fudge; | 997 | int fudge; |
912 | int retval; | ||
913 | struct cgroup_iter it; | 998 | struct cgroup_iter it; |
914 | 999 | int retval; | |
915 | /* | ||
916 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | ||
917 | * it's read-only | ||
918 | */ | ||
919 | if (cs == &top_cpuset) | ||
920 | return -EACCES; | ||
921 | |||
922 | trialcs = *cs; | ||
923 | |||
924 | /* | ||
925 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | ||
926 | * Since nodelist_parse() fails on an empty mask, we special case | ||
927 | * that parsing. The validate_change() call ensures that cpusets | ||
928 | * with tasks have memory. | ||
929 | */ | ||
930 | buf = strstrip(buf); | ||
931 | if (!*buf) { | ||
932 | nodes_clear(trialcs.mems_allowed); | ||
933 | } else { | ||
934 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
935 | if (retval < 0) | ||
936 | goto done; | ||
937 | |||
938 | if (!nodes_subset(trialcs.mems_allowed, | ||
939 | node_states[N_HIGH_MEMORY])) | ||
940 | return -EINVAL; | ||
941 | } | ||
942 | oldmem = cs->mems_allowed; | ||
943 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | ||
944 | retval = 0; /* Too easy - nothing to do */ | ||
945 | goto done; | ||
946 | } | ||
947 | retval = validate_change(cs, &trialcs); | ||
948 | if (retval < 0) | ||
949 | goto done; | ||
950 | |||
951 | mutex_lock(&callback_mutex); | ||
952 | cs->mems_allowed = trialcs.mems_allowed; | ||
953 | cs->mems_generation = cpuset_mems_generation++; | ||
954 | mutex_unlock(&callback_mutex); | ||
955 | 1000 | ||
956 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ | 1001 | cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ |
957 | 1002 | ||
@@ -1018,7 +1063,7 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
1018 | 1063 | ||
1019 | mpol_rebind_mm(mm, &cs->mems_allowed); | 1064 | mpol_rebind_mm(mm, &cs->mems_allowed); |
1020 | if (migrate) | 1065 | if (migrate) |
1021 | cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); | 1066 | cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed); |
1022 | mmput(mm); | 1067 | mmput(mm); |
1023 | } | 1068 | } |
1024 | 1069 | ||
@@ -1030,6 +1075,70 @@ done: | |||
1030 | return retval; | 1075 | return retval; |
1031 | } | 1076 | } |
1032 | 1077 | ||
1078 | /* | ||
1079 | * Handle user request to change the 'mems' memory placement | ||
1080 | * of a cpuset. Needs to validate the request, update the | ||
1081 | * cpusets mems_allowed and mems_generation, and for each | ||
1082 | * task in the cpuset, rebind any vma mempolicies and if | ||
1083 | * the cpuset is marked 'memory_migrate', migrate the tasks | ||
1084 | * pages to the new memory. | ||
1085 | * | ||
1086 | * Call with cgroup_mutex held. May take callback_mutex during call. | ||
1087 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
1088 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
1089 | * their mempolicies to the cpusets new mems_allowed. | ||
1090 | */ | ||
1091 | static int update_nodemask(struct cpuset *cs, const char *buf) | ||
1092 | { | ||
1093 | struct cpuset trialcs; | ||
1094 | nodemask_t oldmem; | ||
1095 | int retval; | ||
1096 | |||
1097 | /* | ||
1098 | * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY]; | ||
1099 | * it's read-only | ||
1100 | */ | ||
1101 | if (cs == &top_cpuset) | ||
1102 | return -EACCES; | ||
1103 | |||
1104 | trialcs = *cs; | ||
1105 | |||
1106 | /* | ||
1107 | * An empty mems_allowed is ok iff there are no tasks in the cpuset. | ||
1108 | * Since nodelist_parse() fails on an empty mask, we special case | ||
1109 | * that parsing. The validate_change() call ensures that cpusets | ||
1110 | * with tasks have memory. | ||
1111 | */ | ||
1112 | if (!*buf) { | ||
1113 | nodes_clear(trialcs.mems_allowed); | ||
1114 | } else { | ||
1115 | retval = nodelist_parse(buf, trialcs.mems_allowed); | ||
1116 | if (retval < 0) | ||
1117 | goto done; | ||
1118 | |||
1119 | if (!nodes_subset(trialcs.mems_allowed, | ||
1120 | node_states[N_HIGH_MEMORY])) | ||
1121 | return -EINVAL; | ||
1122 | } | ||
1123 | oldmem = cs->mems_allowed; | ||
1124 | if (nodes_equal(oldmem, trialcs.mems_allowed)) { | ||
1125 | retval = 0; /* Too easy - nothing to do */ | ||
1126 | goto done; | ||
1127 | } | ||
1128 | retval = validate_change(cs, &trialcs); | ||
1129 | if (retval < 0) | ||
1130 | goto done; | ||
1131 | |||
1132 | mutex_lock(&callback_mutex); | ||
1133 | cs->mems_allowed = trialcs.mems_allowed; | ||
1134 | cs->mems_generation = cpuset_mems_generation++; | ||
1135 | mutex_unlock(&callback_mutex); | ||
1136 | |||
1137 | retval = update_tasks_nodemask(cs, &oldmem); | ||
1138 | done: | ||
1139 | return retval; | ||
1140 | } | ||
1141 | |||
1033 | int current_cpuset_is_being_rebound(void) | 1142 | int current_cpuset_is_being_rebound(void) |
1034 | { | 1143 | { |
1035 | return task_cs(current) == cpuset_being_rebound; | 1144 | return task_cs(current) == cpuset_being_rebound; |
@@ -1042,7 +1151,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) | |||
1042 | 1151 | ||
1043 | if (val != cs->relax_domain_level) { | 1152 | if (val != cs->relax_domain_level) { |
1044 | cs->relax_domain_level = val; | 1153 | cs->relax_domain_level = val; |
1045 | rebuild_sched_domains(); | 1154 | if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) |
1155 | async_rebuild_sched_domains(); | ||
1046 | } | 1156 | } |
1047 | 1157 | ||
1048 | return 0; | 1158 | return 0; |
@@ -1062,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1062 | { | 1172 | { |
1063 | struct cpuset trialcs; | 1173 | struct cpuset trialcs; |
1064 | int err; | 1174 | int err; |
1065 | int cpus_nonempty, balance_flag_changed; | 1175 | int balance_flag_changed; |
1066 | 1176 | ||
1067 | trialcs = *cs; | 1177 | trialcs = *cs; |
1068 | if (turning_on) | 1178 | if (turning_on) |
@@ -1074,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1074 | if (err < 0) | 1184 | if (err < 0) |
1075 | return err; | 1185 | return err; |
1076 | 1186 | ||
1077 | cpus_nonempty = !cpus_empty(trialcs.cpus_allowed); | ||
1078 | balance_flag_changed = (is_sched_load_balance(cs) != | 1187 | balance_flag_changed = (is_sched_load_balance(cs) != |
1079 | is_sched_load_balance(&trialcs)); | 1188 | is_sched_load_balance(&trialcs)); |
1080 | 1189 | ||
@@ -1082,8 +1191,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, | |||
1082 | cs->flags = trialcs.flags; | 1191 | cs->flags = trialcs.flags; |
1083 | mutex_unlock(&callback_mutex); | 1192 | mutex_unlock(&callback_mutex); |
1084 | 1193 | ||
1085 | if (cpus_nonempty && balance_flag_changed) | 1194 | if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed) |
1086 | rebuild_sched_domains(); | 1195 | async_rebuild_sched_domains(); |
1087 | 1196 | ||
1088 | return 0; | 1197 | return 0; |
1089 | } | 1198 | } |
@@ -1254,72 +1363,14 @@ typedef enum { | |||
1254 | FILE_SPREAD_SLAB, | 1363 | FILE_SPREAD_SLAB, |
1255 | } cpuset_filetype_t; | 1364 | } cpuset_filetype_t; |
1256 | 1365 | ||
1257 | static ssize_t cpuset_common_file_write(struct cgroup *cont, | ||
1258 | struct cftype *cft, | ||
1259 | struct file *file, | ||
1260 | const char __user *userbuf, | ||
1261 | size_t nbytes, loff_t *unused_ppos) | ||
1262 | { | ||
1263 | struct cpuset *cs = cgroup_cs(cont); | ||
1264 | cpuset_filetype_t type = cft->private; | ||
1265 | char *buffer; | ||
1266 | int retval = 0; | ||
1267 | |||
1268 | /* Crude upper limit on largest legitimate cpulist user might write. */ | ||
1269 | if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES)) | ||
1270 | return -E2BIG; | ||
1271 | |||
1272 | /* +1 for nul-terminator */ | ||
1273 | buffer = kmalloc(nbytes + 1, GFP_KERNEL); | ||
1274 | if (!buffer) | ||
1275 | return -ENOMEM; | ||
1276 | |||
1277 | if (copy_from_user(buffer, userbuf, nbytes)) { | ||
1278 | retval = -EFAULT; | ||
1279 | goto out1; | ||
1280 | } | ||
1281 | buffer[nbytes] = 0; /* nul-terminate */ | ||
1282 | |||
1283 | cgroup_lock(); | ||
1284 | |||
1285 | if (cgroup_is_removed(cont)) { | ||
1286 | retval = -ENODEV; | ||
1287 | goto out2; | ||
1288 | } | ||
1289 | |||
1290 | switch (type) { | ||
1291 | case FILE_CPULIST: | ||
1292 | retval = update_cpumask(cs, buffer); | ||
1293 | break; | ||
1294 | case FILE_MEMLIST: | ||
1295 | retval = update_nodemask(cs, buffer); | ||
1296 | break; | ||
1297 | default: | ||
1298 | retval = -EINVAL; | ||
1299 | goto out2; | ||
1300 | } | ||
1301 | |||
1302 | if (retval == 0) | ||
1303 | retval = nbytes; | ||
1304 | out2: | ||
1305 | cgroup_unlock(); | ||
1306 | out1: | ||
1307 | kfree(buffer); | ||
1308 | return retval; | ||
1309 | } | ||
1310 | |||
1311 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) | 1366 | static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) |
1312 | { | 1367 | { |
1313 | int retval = 0; | 1368 | int retval = 0; |
1314 | struct cpuset *cs = cgroup_cs(cgrp); | 1369 | struct cpuset *cs = cgroup_cs(cgrp); |
1315 | cpuset_filetype_t type = cft->private; | 1370 | cpuset_filetype_t type = cft->private; |
1316 | 1371 | ||
1317 | cgroup_lock(); | 1372 | if (!cgroup_lock_live_group(cgrp)) |
1318 | |||
1319 | if (cgroup_is_removed(cgrp)) { | ||
1320 | cgroup_unlock(); | ||
1321 | return -ENODEV; | 1373 | return -ENODEV; |
1322 | } | ||
1323 | 1374 | ||
1324 | switch (type) { | 1375 | switch (type) { |
1325 | case FILE_CPU_EXCLUSIVE: | 1376 | case FILE_CPU_EXCLUSIVE: |
@@ -1365,12 +1416,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1365 | struct cpuset *cs = cgroup_cs(cgrp); | 1416 | struct cpuset *cs = cgroup_cs(cgrp); |
1366 | cpuset_filetype_t type = cft->private; | 1417 | cpuset_filetype_t type = cft->private; |
1367 | 1418 | ||
1368 | cgroup_lock(); | 1419 | if (!cgroup_lock_live_group(cgrp)) |
1369 | |||
1370 | if (cgroup_is_removed(cgrp)) { | ||
1371 | cgroup_unlock(); | ||
1372 | return -ENODEV; | 1420 | return -ENODEV; |
1373 | } | 1421 | |
1374 | switch (type) { | 1422 | switch (type) { |
1375 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: | 1423 | case FILE_SCHED_RELAX_DOMAIN_LEVEL: |
1376 | retval = update_relax_domain_level(cs, val); | 1424 | retval = update_relax_domain_level(cs, val); |
@@ -1384,6 +1432,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val) | |||
1384 | } | 1432 | } |
1385 | 1433 | ||
1386 | /* | 1434 | /* |
1435 | * Common handling for a write to a "cpus" or "mems" file. | ||
1436 | */ | ||
1437 | static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft, | ||
1438 | const char *buf) | ||
1439 | { | ||
1440 | int retval = 0; | ||
1441 | |||
1442 | if (!cgroup_lock_live_group(cgrp)) | ||
1443 | return -ENODEV; | ||
1444 | |||
1445 | switch (cft->private) { | ||
1446 | case FILE_CPULIST: | ||
1447 | retval = update_cpumask(cgroup_cs(cgrp), buf); | ||
1448 | break; | ||
1449 | case FILE_MEMLIST: | ||
1450 | retval = update_nodemask(cgroup_cs(cgrp), buf); | ||
1451 | break; | ||
1452 | default: | ||
1453 | retval = -EINVAL; | ||
1454 | break; | ||
1455 | } | ||
1456 | cgroup_unlock(); | ||
1457 | return retval; | ||
1458 | } | ||
1459 | |||
1460 | /* | ||
1387 | * These ascii lists should be read in a single call, by using a user | 1461 | * These ascii lists should be read in a single call, by using a user |
1388 | * buffer large enough to hold the entire map. If read in smaller | 1462 | * buffer large enough to hold the entire map. If read in smaller |
1389 | * chunks, there is no guarantee of atomicity. Since the display format | 1463 | * chunks, there is no guarantee of atomicity. Since the display format |
@@ -1479,6 +1553,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) | |||
1479 | default: | 1553 | default: |
1480 | BUG(); | 1554 | BUG(); |
1481 | } | 1555 | } |
1556 | |||
1557 | /* Unreachable but makes gcc happy */ | ||
1558 | return 0; | ||
1482 | } | 1559 | } |
1483 | 1560 | ||
1484 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | 1561 | static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) |
@@ -1491,6 +1568,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) | |||
1491 | default: | 1568 | default: |
1492 | BUG(); | 1569 | BUG(); |
1493 | } | 1570 | } |
1571 | |||
1572 | /* Unrechable but makes gcc happy */ | ||
1573 | return 0; | ||
1494 | } | 1574 | } |
1495 | 1575 | ||
1496 | 1576 | ||
@@ -1502,14 +1582,16 @@ static struct cftype files[] = { | |||
1502 | { | 1582 | { |
1503 | .name = "cpus", | 1583 | .name = "cpus", |
1504 | .read = cpuset_common_file_read, | 1584 | .read = cpuset_common_file_read, |
1505 | .write = cpuset_common_file_write, | 1585 | .write_string = cpuset_write_resmask, |
1586 | .max_write_len = (100U + 6 * NR_CPUS), | ||
1506 | .private = FILE_CPULIST, | 1587 | .private = FILE_CPULIST, |
1507 | }, | 1588 | }, |
1508 | 1589 | ||
1509 | { | 1590 | { |
1510 | .name = "mems", | 1591 | .name = "mems", |
1511 | .read = cpuset_common_file_read, | 1592 | .read = cpuset_common_file_read, |
1512 | .write = cpuset_common_file_write, | 1593 | .write_string = cpuset_write_resmask, |
1594 | .max_write_len = (100U + 6 * MAX_NUMNODES), | ||
1513 | .private = FILE_MEMLIST, | 1595 | .private = FILE_MEMLIST, |
1514 | }, | 1596 | }, |
1515 | 1597 | ||
@@ -1677,15 +1759,9 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1677 | } | 1759 | } |
1678 | 1760 | ||
1679 | /* | 1761 | /* |
1680 | * Locking note on the strange update_flag() call below: | ||
1681 | * | ||
1682 | * If the cpuset being removed has its flag 'sched_load_balance' | 1762 | * If the cpuset being removed has its flag 'sched_load_balance' |
1683 | * enabled, then simulate turning sched_load_balance off, which | 1763 | * enabled, then simulate turning sched_load_balance off, which |
1684 | * will call rebuild_sched_domains(). The get_online_cpus() | 1764 | * will call async_rebuild_sched_domains(). |
1685 | * call in rebuild_sched_domains() must not be made while holding | ||
1686 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | ||
1687 | * get_online_cpus() calls. So the reverse nesting would risk an | ||
1688 | * ABBA deadlock. | ||
1689 | */ | 1765 | */ |
1690 | 1766 | ||
1691 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | 1767 | static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) |
@@ -1704,7 +1780,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | |||
1704 | struct cgroup_subsys cpuset_subsys = { | 1780 | struct cgroup_subsys cpuset_subsys = { |
1705 | .name = "cpuset", | 1781 | .name = "cpuset", |
1706 | .create = cpuset_create, | 1782 | .create = cpuset_create, |
1707 | .destroy = cpuset_destroy, | 1783 | .destroy = cpuset_destroy, |
1708 | .can_attach = cpuset_can_attach, | 1784 | .can_attach = cpuset_can_attach, |
1709 | .attach = cpuset_attach, | 1785 | .attach = cpuset_attach, |
1710 | .populate = cpuset_populate, | 1786 | .populate = cpuset_populate, |
@@ -1790,13 +1866,13 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) | |||
1790 | scan.scan.heap = NULL; | 1866 | scan.scan.heap = NULL; |
1791 | scan.to = to->css.cgroup; | 1867 | scan.to = to->css.cgroup; |
1792 | 1868 | ||
1793 | if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) | 1869 | if (cgroup_scan_tasks(&scan.scan)) |
1794 | printk(KERN_ERR "move_member_tasks_to_cpuset: " | 1870 | printk(KERN_ERR "move_member_tasks_to_cpuset: " |
1795 | "cgroup_scan_tasks failed\n"); | 1871 | "cgroup_scan_tasks failed\n"); |
1796 | } | 1872 | } |
1797 | 1873 | ||
1798 | /* | 1874 | /* |
1799 | * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs | 1875 | * If CPU and/or memory hotplug handlers, below, unplug any CPUs |
1800 | * or memory nodes, we need to walk over the cpuset hierarchy, | 1876 | * or memory nodes, we need to walk over the cpuset hierarchy, |
1801 | * removing that CPU or node from all cpusets. If this removes the | 1877 | * removing that CPU or node from all cpusets. If this removes the |
1802 | * last CPU or node from a cpuset, then move the tasks in the empty | 1878 | * last CPU or node from a cpuset, then move the tasks in the empty |
@@ -1844,31 +1920,31 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs) | |||
1844 | * that has tasks along with an empty 'mems'. But if we did see such | 1920 | * that has tasks along with an empty 'mems'. But if we did see such |
1845 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. | 1921 | * a cpuset, we'd handle it just like we do if its 'cpus' was empty. |
1846 | */ | 1922 | */ |
1847 | static void scan_for_empty_cpusets(const struct cpuset *root) | 1923 | static void scan_for_empty_cpusets(struct cpuset *root) |
1848 | { | 1924 | { |
1925 | LIST_HEAD(queue); | ||
1849 | struct cpuset *cp; /* scans cpusets being updated */ | 1926 | struct cpuset *cp; /* scans cpusets being updated */ |
1850 | struct cpuset *child; /* scans child cpusets of cp */ | 1927 | struct cpuset *child; /* scans child cpusets of cp */ |
1851 | struct list_head queue; | ||
1852 | struct cgroup *cont; | 1928 | struct cgroup *cont; |
1853 | 1929 | nodemask_t oldmems; | |
1854 | INIT_LIST_HEAD(&queue); | ||
1855 | 1930 | ||
1856 | list_add_tail((struct list_head *)&root->stack_list, &queue); | 1931 | list_add_tail((struct list_head *)&root->stack_list, &queue); |
1857 | 1932 | ||
1858 | while (!list_empty(&queue)) { | 1933 | while (!list_empty(&queue)) { |
1859 | cp = container_of(queue.next, struct cpuset, stack_list); | 1934 | cp = list_first_entry(&queue, struct cpuset, stack_list); |
1860 | list_del(queue.next); | 1935 | list_del(queue.next); |
1861 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { | 1936 | list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { |
1862 | child = cgroup_cs(cont); | 1937 | child = cgroup_cs(cont); |
1863 | list_add_tail(&child->stack_list, &queue); | 1938 | list_add_tail(&child->stack_list, &queue); |
1864 | } | 1939 | } |
1865 | cont = cp->css.cgroup; | ||
1866 | 1940 | ||
1867 | /* Continue past cpusets with all cpus, mems online */ | 1941 | /* Continue past cpusets with all cpus, mems online */ |
1868 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && | 1942 | if (cpus_subset(cp->cpus_allowed, cpu_online_map) && |
1869 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) | 1943 | nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) |
1870 | continue; | 1944 | continue; |
1871 | 1945 | ||
1946 | oldmems = cp->mems_allowed; | ||
1947 | |||
1872 | /* Remove offline cpus and mems from this cpuset. */ | 1948 | /* Remove offline cpus and mems from this cpuset. */ |
1873 | mutex_lock(&callback_mutex); | 1949 | mutex_lock(&callback_mutex); |
1874 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); | 1950 | cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); |
@@ -1880,39 +1956,14 @@ static void scan_for_empty_cpusets(const struct cpuset *root) | |||
1880 | if (cpus_empty(cp->cpus_allowed) || | 1956 | if (cpus_empty(cp->cpus_allowed) || |
1881 | nodes_empty(cp->mems_allowed)) | 1957 | nodes_empty(cp->mems_allowed)) |
1882 | remove_tasks_in_empty_cpuset(cp); | 1958 | remove_tasks_in_empty_cpuset(cp); |
1959 | else { | ||
1960 | update_tasks_cpumask(cp, NULL); | ||
1961 | update_tasks_nodemask(cp, &oldmems); | ||
1962 | } | ||
1883 | } | 1963 | } |
1884 | } | 1964 | } |
1885 | 1965 | ||
1886 | /* | 1966 | /* |
1887 | * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track | ||
1888 | * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to | ||
1889 | * track what's online after any CPU or memory node hotplug or unplug event. | ||
1890 | * | ||
1891 | * Since there are two callers of this routine, one for CPU hotplug | ||
1892 | * events and one for memory node hotplug events, we could have coded | ||
1893 | * two separate routines here. We code it as a single common routine | ||
1894 | * in order to minimize text size. | ||
1895 | */ | ||
1896 | |||
1897 | static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | ||
1898 | { | ||
1899 | cgroup_lock(); | ||
1900 | |||
1901 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1902 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
1903 | scan_for_empty_cpusets(&top_cpuset); | ||
1904 | |||
1905 | /* | ||
1906 | * Scheduler destroys domains on hotplug events. | ||
1907 | * Rebuild them based on the current settings. | ||
1908 | */ | ||
1909 | if (rebuild_sd) | ||
1910 | rebuild_sched_domains(); | ||
1911 | |||
1912 | cgroup_unlock(); | ||
1913 | } | ||
1914 | |||
1915 | /* | ||
1916 | * The top_cpuset tracks what CPUs and Memory Nodes are online, | 1967 | * The top_cpuset tracks what CPUs and Memory Nodes are online, |
1917 | * period. This is necessary in order to make cpusets transparent | 1968 | * period. This is necessary in order to make cpusets transparent |
1918 | * (of no affect) on systems that are actively using CPU hotplug | 1969 | * (of no affect) on systems that are actively using CPU hotplug |
@@ -1920,40 +1971,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) | |||
1920 | * | 1971 | * |
1921 | * This routine ensures that top_cpuset.cpus_allowed tracks | 1972 | * This routine ensures that top_cpuset.cpus_allowed tracks |
1922 | * cpu_online_map on each CPU hotplug (cpuhp) event. | 1973 | * cpu_online_map on each CPU hotplug (cpuhp) event. |
1974 | * | ||
1975 | * Called within get_online_cpus(). Needs to call cgroup_lock() | ||
1976 | * before calling generate_sched_domains(). | ||
1923 | */ | 1977 | */ |
1924 | 1978 | static int cpuset_track_online_cpus(struct notifier_block *unused_nb, | |
1925 | static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, | ||
1926 | unsigned long phase, void *unused_cpu) | 1979 | unsigned long phase, void *unused_cpu) |
1927 | { | 1980 | { |
1981 | struct sched_domain_attr *attr; | ||
1982 | cpumask_t *doms; | ||
1983 | int ndoms; | ||
1984 | |||
1928 | switch (phase) { | 1985 | switch (phase) { |
1929 | case CPU_UP_CANCELED: | ||
1930 | case CPU_UP_CANCELED_FROZEN: | ||
1931 | case CPU_DOWN_FAILED: | ||
1932 | case CPU_DOWN_FAILED_FROZEN: | ||
1933 | case CPU_ONLINE: | 1986 | case CPU_ONLINE: |
1934 | case CPU_ONLINE_FROZEN: | 1987 | case CPU_ONLINE_FROZEN: |
1935 | case CPU_DEAD: | 1988 | case CPU_DEAD: |
1936 | case CPU_DEAD_FROZEN: | 1989 | case CPU_DEAD_FROZEN: |
1937 | common_cpu_mem_hotplug_unplug(1); | ||
1938 | break; | 1990 | break; |
1991 | |||
1939 | default: | 1992 | default: |
1940 | return NOTIFY_DONE; | 1993 | return NOTIFY_DONE; |
1941 | } | 1994 | } |
1942 | 1995 | ||
1996 | cgroup_lock(); | ||
1997 | top_cpuset.cpus_allowed = cpu_online_map; | ||
1998 | scan_for_empty_cpusets(&top_cpuset); | ||
1999 | ndoms = generate_sched_domains(&doms, &attr); | ||
2000 | cgroup_unlock(); | ||
2001 | |||
2002 | /* Have scheduler rebuild the domains */ | ||
2003 | partition_sched_domains(ndoms, doms, attr); | ||
2004 | |||
1943 | return NOTIFY_OK; | 2005 | return NOTIFY_OK; |
1944 | } | 2006 | } |
1945 | 2007 | ||
1946 | #ifdef CONFIG_MEMORY_HOTPLUG | 2008 | #ifdef CONFIG_MEMORY_HOTPLUG |
1947 | /* | 2009 | /* |
1948 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. | 2010 | * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. |
1949 | * Call this routine anytime after you change | 2011 | * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. |
1950 | * node_states[N_HIGH_MEMORY]. | 2012 | * See also the previous routine cpuset_track_online_cpus(). |
1951 | * See also the previous routine cpuset_handle_cpuhp(). | ||
1952 | */ | 2013 | */ |
1953 | |||
1954 | void cpuset_track_online_nodes(void) | 2014 | void cpuset_track_online_nodes(void) |
1955 | { | 2015 | { |
1956 | common_cpu_mem_hotplug_unplug(0); | 2016 | cgroup_lock(); |
2017 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | ||
2018 | scan_for_empty_cpusets(&top_cpuset); | ||
2019 | cgroup_unlock(); | ||
1957 | } | 2020 | } |
1958 | #endif | 2021 | #endif |
1959 | 2022 | ||
@@ -1968,11 +2031,10 @@ void __init cpuset_init_smp(void) | |||
1968 | top_cpuset.cpus_allowed = cpu_online_map; | 2031 | top_cpuset.cpus_allowed = cpu_online_map; |
1969 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; | 2032 | top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; |
1970 | 2033 | ||
1971 | hotcpu_notifier(cpuset_handle_cpuhp, 0); | 2034 | hotcpu_notifier(cpuset_track_online_cpus, 0); |
1972 | } | 2035 | } |
1973 | 2036 | ||
1974 | /** | 2037 | /** |
1975 | |||
1976 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. | 2038 | * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. |
1977 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. | 2039 | * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. |
1978 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. | 2040 | * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. |
@@ -2374,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = { | |||
2374 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) | 2436 | void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) |
2375 | { | 2437 | { |
2376 | seq_printf(m, "Cpus_allowed:\t"); | 2438 | seq_printf(m, "Cpus_allowed:\t"); |
2377 | m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, | 2439 | seq_cpumask(m, &task->cpus_allowed); |
2378 | task->cpus_allowed); | ||
2379 | seq_printf(m, "\n"); | 2440 | seq_printf(m, "\n"); |
2380 | seq_printf(m, "Cpus_allowed_list:\t"); | 2441 | seq_printf(m, "Cpus_allowed_list:\t"); |
2381 | m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, | 2442 | seq_cpumask_list(m, &task->cpus_allowed); |
2382 | task->cpus_allowed); | ||
2383 | seq_printf(m, "\n"); | 2443 | seq_printf(m, "\n"); |
2384 | seq_printf(m, "Mems_allowed:\t"); | 2444 | seq_printf(m, "Mems_allowed:\t"); |
2385 | m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, | 2445 | seq_nodemask(m, &task->mems_allowed); |
2386 | task->mems_allowed); | ||
2387 | seq_printf(m, "\n"); | 2446 | seq_printf(m, "\n"); |
2388 | seq_printf(m, "Mems_allowed_list:\t"); | 2447 | seq_printf(m, "Mems_allowed_list:\t"); |
2389 | m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, | 2448 | seq_nodemask_list(m, &task->mems_allowed); |
2390 | task->mems_allowed); | ||
2391 | seq_printf(m, "\n"); | 2449 | seq_printf(m, "\n"); |
2392 | } | 2450 | } |
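
The comment block introduced above for async_rebuild_sched_domains() describes the locking rationale behind this diff: partition_sched_domains() must nest cgroup_lock() inside get_online_cpus(), while the cpuset write handlers already hold cgroup_lock(), so the actual rebuild is delegated to a workqueue rather than taking the locks in the reverse (ABBA) order. The sketch below illustrates that hand-off pattern in plain user-space C with pthreads. It is an analogy only, not kernel code, and every name in it (hotplug_lock, cgroup_lock, schedule_rebuild, and so on) is invented for the example.

/* abba_handoff.c - illustrative sketch only; see the note above. */
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER; /* stands in for get_online_cpus() */
static pthread_mutex_t cgroup_lock  = PTHREAD_MUTEX_INITIALIZER; /* stands in for cgroup_mutex */

static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  work_cv   = PTHREAD_COND_INITIALIZER;
static bool rebuild_pending = false;

/* Worker thread: takes the locks in the required order, hotplug first, then cgroup. */
static void *rebuild_worker(void *unused)
{
	(void)unused;
	pthread_mutex_lock(&work_lock);
	while (!rebuild_pending)
		pthread_cond_wait(&work_cv, &work_lock);
	rebuild_pending = false;
	pthread_mutex_unlock(&work_lock);

	pthread_mutex_lock(&hotplug_lock);
	pthread_mutex_lock(&cgroup_lock);
	printf("rebuilding sched domains (hotplug -> cgroup order)\n");
	pthread_mutex_unlock(&cgroup_lock);
	pthread_mutex_unlock(&hotplug_lock);
	return NULL;
}

/* Analogous to async_rebuild_sched_domains(): queue the rebuild and return. */
static void schedule_rebuild(void)
{
	pthread_mutex_lock(&work_lock);
	rebuild_pending = true;
	pthread_cond_signal(&work_cv);
	pthread_mutex_unlock(&work_lock);
}

/*
 * Analogous to a "cpus"/"mems" write handler: it already holds the cgroup
 * lock, so taking hotplug_lock here directly would invert the lock order.
 */
static void write_handler(void)
{
	pthread_mutex_lock(&cgroup_lock);
	/* ... modify the cpuset ... */
	schedule_rebuild();	/* no hotplug_lock taken while cgroup_lock is held */
	pthread_mutex_unlock(&cgroup_lock);
}

int main(void)
{
	pthread_t worker;

	pthread_create(&worker, NULL, rebuild_worker, NULL);
	write_handler();
	pthread_join(worker, NULL);
	return 0;
}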