Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  432
1 files changed, 225 insertions, 207 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..d5ab79cf516d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
@@ -227,10 +226,6 @@ static struct cpuset top_cpuset = {
  * The task_struct fields mems_allowed and mems_generation may only
  * be accessed in the context of that task, so require no locks.
  *
- * The cpuset_common_file_write handler for operations that modify
- * the cpuset hierarchy holds cgroup_mutex across the entire operation,
- * single threading all such cpuset modifications across the system.
- *
  * The cpuset_common_file_read() handlers only hold callback_mutex across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
@@ -369,7 +364,7 @@ void cpuset_update_task_memory_state(void)
 		my_cpusets_mem_gen = top_cpuset.mems_generation;
 	} else {
 		rcu_read_lock();
-		my_cpusets_mem_gen = task_cs(current)->mems_generation;
+		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
 		rcu_read_unlock();
 	}
 
@@ -490,21 +485,51 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-	if (!dattr)
-		return;
 	if (dattr->relax_domain_level < c->relax_domain_level)
 		dattr->relax_domain_level = c->relax_domain_level;
 	return;
 }
 
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+	LIST_HEAD(q);
+
+	list_add(&c->stack_list, &q);
+	while (!list_empty(&q)) {
+		struct cpuset *cp;
+		struct cgroup *cont;
+		struct cpuset *child;
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		if (is_sched_load_balance(cp))
+			update_domain_attr(dattr, cp);
+
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &q);
+		}
+	}
+}
+
 /*
  * rebuild_sched_domains()
  *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ *   'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
  *
  * This routine builds a partial partition of the systems CPUs
 * (the set of non-overlappping cpumask_t's in the array 'part'
@@ -531,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
- *    q  - a kfifo queue of cpuset pointers, used to implement a
+ *    q  - a linked-list queue of cpuset pointers, used to implement a
  *	  top-down scan of all cpusets. This scan loads a pointer
  *	  to each cpuset marked is_sched_load_balance into the
  *	  array 'csa'. For our purposes, rebuilding the schedulers
@@ -564,9 +589,9 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  *	partition_sched_domains().
  */
 
-static void rebuild_sched_domains(void)
+void rebuild_sched_domains(void)
 {
-	struct kfifo *q;	/* queue of cpusets to be scanned */
+	LIST_HEAD(q);		/* queue of cpusets to be scanned*/
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -576,7 +601,6 @@ static void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	q = NULL;
 	csa = NULL;
 	doms = NULL;
 	dattr = NULL;
@@ -590,30 +614,42 @@ static void rebuild_sched_domains(void)
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
-			update_domain_attr(dattr, &top_cpuset);
+			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
 		goto rebuild;
 	}
 
-	q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-	if (IS_ERR(q))
-		goto done;
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
 	if (!csa)
 		goto done;
 	csn = 0;
 
-	cp = &top_cpuset;
-	__kfifo_put(q, (void *)&cp, sizeof(cp));
-	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+	list_add(&top_cpuset.stack_list, &q);
+	while (!list_empty(&q)) {
 		struct cgroup *cont;
 		struct cpuset *child;	/* scans child cpusets of cp */
-		if (is_sched_load_balance(cp))
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		/*
+		 * All child cpusets contain a subset of the parent's cpus, so
+		 * just skip them, and then we call update_domain_attr_tree()
+		 * to calc relax_domain_level of the corresponding sched
+		 * domain.
+		 */
+		if (is_sched_load_balance(cp)) {
 			csa[csn++] = cp;
+			continue;
+		}
+
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
-			__kfifo_put(q, (void *)&child, sizeof(cp));
+			list_add_tail(&child->stack_list, &q);
 		}
 	}
 
@@ -679,7 +715,9 @@ restart:
 			if (apn == b->pn) {
 				cpus_or(*dp, *dp, b->cpus_allowed);
 				b->pn = -1;
-				update_domain_attr(dattr, b);
+				if (dattr)
+					update_domain_attr_tree(dattr
+								+ nslot, b);
 			}
 		}
 		nslot++;
@@ -694,43 +732,11 @@ rebuild:
 	put_online_cpus();
 
 done:
-	if (q && !IS_ERR(q))
-		kfifo_free(q);
 	kfree(csa);
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
 
-static inline int started_after_time(struct task_struct *t1,
-				     struct timespec *time,
-				     struct task_struct *t2)
-{
-	int start_diff = timespec_compare(&t1->start_time, time);
-	if (start_diff > 0) {
-		return 1;
-	} else if (start_diff < 0) {
-		return 0;
-	} else {
-		/*
-		 * Arbitrarily, if two processes started at the same
-		 * time, we'll say that the lower pointer value
-		 * started first. Note that t2 may have exited by now
-		 * so this may not be a valid pointer any longer, but
-		 * that's fine - it still serves to distinguish
-		 * between two tasks started (effectively)
-		 * simultaneously.
-		 */
-		return t1 > t2;
-	}
-}
-
-static inline int started_after(void *p1, void *p2)
-{
-	struct task_struct *t1 = p1;
-	struct task_struct *t2 = p2;
-	return started_after_time(t1, &t2->start_time, t2);
-}
-
 /**
  * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
  * @tsk: task to test
@@ -766,15 +772,49 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
 }
 
 /**
+ * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
+ *
+ * Called with cgroup_mutex held
+ *
+ * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
+ * calling callback functions for each.
+ *
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_cpumask(struct cpuset *cs)
+{
+	struct cgroup_scanner scan;
+	struct ptr_heap heap;
+	int retval;
+
+	/*
+	 * cgroup_scan_tasks() will initialize heap->gt for us.
+	 * heap_init() is still needed here for we should not change
+	 * cs->cpus_allowed when heap_init() fails.
+	 */
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval)
+		return retval;
+
+	scan.cg = cs->css.cgroup;
+	scan.test_task = cpuset_test_cpumask;
+	scan.process_task = cpuset_change_cpumask;
+	scan.heap = &heap;
+	retval = cgroup_scan_tasks(&scan);
+
+	heap_free(&heap);
+	return retval;
+}
+
+/**
  * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
  * @cs: the cpuset to consider
  * @buf: buffer of cpu numbers written to this cpuset
  */
-static int update_cpumask(struct cpuset *cs, char *buf)
+static int update_cpumask(struct cpuset *cs, const char *buf)
 {
 	struct cpuset trialcs;
-	struct cgroup_scanner scan;
-	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
 
@@ -790,7 +830,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	 * that parsing. The validate_change() call ensures that cpusets
 	 * with tasks have cpus.
 	 */
-	buf = strstrip(buf);
 	if (!*buf) {
 		cpus_clear(trialcs.cpus_allowed);
 	} else {
@@ -809,10 +848,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
 		return 0;
 
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
-	if (retval)
-		return retval;
-
 	is_load_balanced = is_sched_load_balance(&trialcs);
 
 	mutex_lock(&callback_mutex);
@@ -823,12 +858,9 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	 * Scan tasks in the cpuset, and update the cpumasks of any
 	 * that need an update.
 	 */
-	scan.cg = cs->css.cgroup;
-	scan.test_task = cpuset_test_cpumask;
-	scan.process_task = cpuset_change_cpumask;
-	scan.heap = &heap;
-	cgroup_scan_tasks(&scan);
-	heap_free(&heap);
+	retval = update_tasks_cpumask(cs);
+	if (retval < 0)
+		return retval;
 
 	if (is_load_balanced)
 		rebuild_sched_domains();
@@ -884,74 +916,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
-/*
- * Handle user request to change the 'mems' memory placement
- * of a cpuset. Needs to validate the request, update the
- * cpusets mems_allowed and mems_generation, and for each
- * task in the cpuset, rebind any vma mempolicies and if
- * the cpuset is marked 'memory_migrate', migrate the tasks
- * pages to the new memory.
- *
- * Call with cgroup_mutex held. May take callback_mutex during call.
- * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
- * lock each such tasks mm->mmap_sem, scan its vma's and rebind
- * their mempolicies to the cpusets new mems_allowed.
- */
-
 static void *cpuset_being_rebound;
 
-static int update_nodemask(struct cpuset *cs, char *buf)
+/**
+ * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
+ * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
+ * @oldmem: old mems_allowed of cpuset cs
+ *
+ * Called with cgroup_mutex held
+ * Return 0 if successful, -errno if not.
+ */
+static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct cpuset trialcs;
-	nodemask_t oldmem;
 	struct task_struct *p;
 	struct mm_struct **mmarray;
 	int i, n, ntasks;
 	int migrate;
 	int fudge;
-	int retval;
 	struct cgroup_iter it;
-
-	/*
-	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
-	 * it's read-only
-	 */
-	if (cs == &top_cpuset)
-		return -EACCES;
-
-	trialcs = *cs;
-
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing. The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	buf = strstrip(buf);
-	if (!*buf) {
-		nodes_clear(trialcs.mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs.mems_allowed);
-		if (retval < 0)
-			goto done;
-
-		if (!nodes_subset(trialcs.mems_allowed,
-				node_states[N_HIGH_MEMORY]))
-			return -EINVAL;
-	}
-	oldmem = cs->mems_allowed;
-	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
-		retval = 0;		/* Too easy - nothing to do */
-		goto done;
-	}
-	retval = validate_change(cs, &trialcs);
-	if (retval < 0)
-		goto done;
-
-	mutex_lock(&callback_mutex);
-	cs->mems_allowed = trialcs.mems_allowed;
-	cs->mems_generation = cpuset_mems_generation++;
-	mutex_unlock(&callback_mutex);
+	int retval;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
@@ -1018,7 +1001,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 		mpol_rebind_mm(mm, &cs->mems_allowed);
 		if (migrate)
-			cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed);
+			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
 		mmput(mm);
 	}
 
@@ -1030,6 +1013,70 @@ done:
 	return retval;
 }
 
+/*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset. Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
+ * Call with cgroup_mutex held. May take callback_mutex during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
+ */
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+	struct cpuset trialcs;
+	nodemask_t oldmem;
+	int retval;
+
+	/*
+	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
+	 * it's read-only
+	 */
+	if (cs == &top_cpuset)
+		return -EACCES;
+
+	trialcs = *cs;
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing. The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf) {
+		nodes_clear(trialcs.mems_allowed);
+	} else {
+		retval = nodelist_parse(buf, trialcs.mems_allowed);
+		if (retval < 0)
+			goto done;
+
+		if (!nodes_subset(trialcs.mems_allowed,
+				node_states[N_HIGH_MEMORY]))
+			return -EINVAL;
+	}
+	oldmem = cs->mems_allowed;
+	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+		retval = 0;		/* Too easy - nothing to do */
+		goto done;
+	}
+	retval = validate_change(cs, &trialcs);
+	if (retval < 0)
+		goto done;
+
+	mutex_lock(&callback_mutex);
+	cs->mems_allowed = trialcs.mems_allowed;
+	cs->mems_generation = cpuset_mems_generation++;
+	mutex_unlock(&callback_mutex);
+
+	retval = update_tasks_nodemask(cs, &oldmem);
+done:
+	return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
 	return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1089,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-		rebuild_sched_domains();
+		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+			rebuild_sched_domains();
 	}
 
 	return 0;
@@ -1254,72 +1302,14 @@ typedef enum {
 	FILE_SPREAD_SLAB,
 } cpuset_filetype_t;
 
-static ssize_t cpuset_common_file_write(struct cgroup *cont,
-					struct cftype *cft,
-					struct file *file,
-					const char __user *userbuf,
-					size_t nbytes, loff_t *unused_ppos)
-{
-	struct cpuset *cs = cgroup_cs(cont);
-	cpuset_filetype_t type = cft->private;
-	char *buffer;
-	int retval = 0;
-
-	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
-		return -E2BIG;
-
-	/* +1 for nul-terminator */
-	buffer = kmalloc(nbytes + 1, GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	if (copy_from_user(buffer, userbuf, nbytes)) {
-		retval = -EFAULT;
-		goto out1;
-	}
-	buffer[nbytes] = 0;	/* nul-terminate */
-
-	cgroup_lock();
-
-	if (cgroup_is_removed(cont)) {
-		retval = -ENODEV;
-		goto out2;
-	}
-
-	switch (type) {
-	case FILE_CPULIST:
-		retval = update_cpumask(cs, buffer);
-		break;
-	case FILE_MEMLIST:
-		retval = update_nodemask(cs, buffer);
-		break;
-	default:
-		retval = -EINVAL;
-		goto out2;
-	}
-
-	if (retval == 0)
-		retval = nbytes;
-out2:
-	cgroup_unlock();
-out1:
-	kfree(buffer);
-	return retval;
-}
-
 static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	int retval = 0;
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
 
 	switch (type) {
 	case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1355,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 	struct cpuset *cs = cgroup_cs(cgrp);
 	cpuset_filetype_t type = cft->private;
 
-	cgroup_lock();
-
-	if (cgroup_is_removed(cgrp)) {
-		cgroup_unlock();
+	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
-	}
+
 	switch (type) {
 	case FILE_SCHED_RELAX_DOMAIN_LEVEL:
 		retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1371,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
 }
 
 /*
+ * Common handling for a write to a "cpus" or "mems" file.
+ */
+static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
+				const char *buf)
+{
+	int retval = 0;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+
+	switch (cft->private) {
+	case FILE_CPULIST:
+		retval = update_cpumask(cgroup_cs(cgrp), buf);
+		break;
+	case FILE_MEMLIST:
+		retval = update_nodemask(cgroup_cs(cgrp), buf);
+		break;
+	default:
+		retval = -EINVAL;
+		break;
+	}
+	cgroup_unlock();
+	return retval;
+}
+
+/*
  * These ascii lists should be read in a single call, by using a user
  * buffer large enough to hold the entire map. If read in smaller
  * chunks, there is no guarantee of atomicity. Since the display format
@@ -1502,14 +1515,16 @@ static struct cftype files[] = {
 	{
 		.name = "cpus",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * NR_CPUS),
 		.private = FILE_CPULIST,
 	},
 
 	{
 		.name = "mems",
 		.read = cpuset_common_file_read,
-		.write = cpuset_common_file_write,
+		.write_string = cpuset_write_resmask,
+		.max_write_len = (100U + 6 * MAX_NUMNODES),
 		.private = FILE_MEMLIST,
 	},
 
@@ -1790,7 +1805,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 	scan.scan.heap = NULL;
 	scan.to = to->css.cgroup;
 
-	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+	if (cgroup_scan_tasks(&scan.scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
@@ -1846,29 +1861,29 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
-	struct list_head queue;
 	struct cgroup *cont;
-
-	INIT_LIST_HEAD(&queue);
+	nodemask_t oldmems;
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
 	while (!list_empty(&queue)) {
-		cp = container_of(queue.next, struct cpuset, stack_list);
+		cp = list_first_entry(&queue, struct cpuset, stack_list);
 		list_del(queue.next);
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
 			list_add_tail(&child->stack_list, &queue);
 		}
-		cont = cp->css.cgroup;
 
 		/* Continue past cpusets with all cpus, mems online */
 		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
+		oldmems = cp->mems_allowed;
+
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,6 +1895,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		if (cpus_empty(cp->cpus_allowed) ||
 		    nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
+		else {
+			update_tasks_cpumask(cp);
+			update_tasks_nodemask(cp, &oldmems);
+		}
 	}
 }
 
@@ -1972,7 +1991,6 @@ void __init cpuset_init_smp(void)
 }
 
 /**
- *
  * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
  * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
  * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.