Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r--  kernel/cpuset.c  275
1 files changed, 270 insertions, 5 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 1133062395e2..203ca52e78dd 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,7 +4,7 @@
  *  Processor and Memory placement constraints for sets of tasks.
  *
  *  Copyright (C) 2003 BULL SA.
- *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2007 Silicon Graphics, Inc.
  *  Copyright (C) 2006 Google, Inc
  *
  *  Portions derived from Patrick Mochel's sysfs code.
@@ -54,6 +54,7 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
+#include <linux/kfifo.h>
 
 /*
  * Tracks how many cpusets are currently defined in system.
@@ -91,6 +92,9 @@ struct cpuset {
 	int mems_generation;
 
 	struct fmeter fmeter;		/* memory_pressure filter */
+
+	/* partition number for rebuild_sched_domains() */
+	int pn;
 };
 
 /* Retrieve the cpuset for a cgroup */
@@ -113,6 +117,7 @@ typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
+	CS_SCHED_LOAD_BALANCE,
 	CS_SPREAD_PAGE,
 	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
@@ -128,6 +133,11 @@ static inline int is_mem_exclusive(const struct cpuset *cs)
 	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
+static inline int is_sched_load_balance(const struct cpuset *cs)
+{
+	return test_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
+}
+
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
 	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
@@ -482,6 +492,208 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 }
 
 /*
+ * Helper routine for rebuild_sched_domains().
+ * Do cpusets a, b have overlapping cpus_allowed masks?
+ */
+
+static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
+{
+	return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
+}
+
+/*
+ * rebuild_sched_domains()
+ *
+ * If the flag 'sched_load_balance' of any cpuset with non-empty
+ * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
+ * which has that flag enabled, or if any cpuset with a non-empty
+ * 'cpus' is removed, then call this routine to rebuild the
+ * scheduler's dynamic sched domains.
+ *
+ * This routine builds a partial partition of the system's CPUs
+ * (the set of non-overlapping cpumask_t's in the array 'doms'
+ * below), and passes that partial partition to the kernel/sched.c
+ * partition_sched_domains() routine, which will rebuild the
+ * scheduler's load balancing domains (sched domains) as specified
+ * by that partial partition.  A 'partial partition' is a set of
+ * non-overlapping subsets whose union is a subset of the system's CPUs.
+ *
+ * See "What is sched_load_balance" in Documentation/cpusets.txt
+ * for a background explanation of this.
+ *
+ * Does not return errors, on the theory that the callers of this
+ * routine would rather not worry about failures to rebuild sched
+ * domains when operating in the severe memory shortage situations
+ * that could cause allocation failures below.
+ *
+ * Call with cgroup_mutex held.  May take callback_mutex during
+ * call due to the kfifo_alloc() and kmalloc() calls.  May nest
+ * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must not
+ * call lock_cpu_hotplug() while holding callback_mutex.  Elsewhere
+ * the kernel nests callback_mutex inside lock_cpu_hotplug() calls.
+ * So the reverse nesting would risk an ABBA deadlock.
+ *
+ * The three key local variables below are:
+ *    q  - a kfifo queue of cpuset pointers, used to implement a
+ *	   top-down scan of all cpusets.  This scan loads a pointer
+ *	   to each cpuset marked is_sched_load_balance into the
+ *	   array 'csa'.  For our purposes, rebuilding the scheduler's
+ *	   sched domains, we can ignore !is_sched_load_balance cpusets.
+ *  csa  - (for CpuSet Array) Array of pointers to all the cpusets
+ *	   that need to be load balanced, for convenient iterative
+ *	   access by the subsequent code that finds the best partition,
+ *	   i.e. the set of domains (subsets) of CPUs such that the
+ *	   cpus_allowed of every cpuset marked is_sched_load_balance
+ *	   is a subset of one of these domains, while there are as
+ *	   many such domains as possible, each as small as possible.
+ * doms  - Conversion of 'csa' to an array of cpumasks, for passing to
+ *	   the kernel/sched.c routine partition_sched_domains() in a
+ *	   convenient format, that can be easily compared to the prior
+ *	   value to determine what partition elements (sched domains)
+ *	   were changed (added or removed).
+ *
+ * Finding the best partition (set of domains):
+ *	The triple nested loops below over i, j, k scan over the
+ *	load balanced cpusets (using the array of cpuset pointers in
+ *	csa[]) looking for pairs of cpusets that have overlapping
+ *	cpus_allowed, but which don't have the same 'pn' partition
+ *	number, and merges them into the same partition number.  It
+ *	keeps looping on the 'restart' label until it can no longer
+ *	find any such pairs.
+ *
+ *	The union of the cpus_allowed masks from the set of
+ *	all cpusets having the same 'pn' value then form the one
+ *	element of the partition (one sched domain) to be passed to
+ *	partition_sched_domains().
+ */
+
+static void rebuild_sched_domains(void)
+{
+	struct kfifo *q;	/* queue of cpusets to be scanned */
+	struct cpuset *cp;	/* scans q */
+	struct cpuset **csa;	/* array of all cpuset ptrs */
+	int csn;		/* how many cpuset ptrs in csa so far */
+	int i, j, k;		/* indices for partition finding loops */
+	cpumask_t *doms;	/* resulting partition; i.e. sched domains */
+	int ndoms;		/* number of sched domains in result */
+	int nslot;		/* next empty doms[] cpumask_t slot */
+
+	q = NULL;
+	csa = NULL;
+	doms = NULL;
+
+	/* Special case for the 99% of systems with one, full, sched domain */
+	if (is_sched_load_balance(&top_cpuset)) {
+		ndoms = 1;
+		doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
+		if (!doms)
+			goto rebuild;
+		*doms = top_cpuset.cpus_allowed;
+		goto rebuild;
+	}
+
+	q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
+	if (IS_ERR(q))
+		goto done;
+	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
+	if (!csa)
+		goto done;
+	csn = 0;
+
+	cp = &top_cpuset;
+	__kfifo_put(q, (void *)&cp, sizeof(cp));
+	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+		struct cgroup *cont;
+		struct cpuset *child;	/* scans child cpusets of cp */
+		if (is_sched_load_balance(cp))
+			csa[csn++] = cp;
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			__kfifo_put(q, (void *)&child, sizeof(cp));
+		}
+	}
+
+	for (i = 0; i < csn; i++)
+		csa[i]->pn = i;
+	ndoms = csn;
+
+restart:
+	/* Find the best partition (set of sched domains) */
+	for (i = 0; i < csn; i++) {
+		struct cpuset *a = csa[i];
+		int apn = a->pn;
+
+		for (j = 0; j < csn; j++) {
+			struct cpuset *b = csa[j];
+			int bpn = b->pn;
+
+			if (apn != bpn && cpusets_overlap(a, b)) {
+				for (k = 0; k < csn; k++) {
+					struct cpuset *c = csa[k];
+
+					if (c->pn == bpn)
+						c->pn = apn;
+				}
+				ndoms--;	/* one less element */
+				goto restart;
+			}
+		}
+	}
+
+	/* Convert <csn, csa> to <ndoms, doms> */
+	doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
+	if (!doms)
+		goto rebuild;
+
+	for (nslot = 0, i = 0; i < csn; i++) {
+		struct cpuset *a = csa[i];
+		int apn = a->pn;
+
+		if (apn >= 0) {
+			cpumask_t *dp = doms + nslot;
+
+			if (nslot == ndoms) {
+				static int warnings = 10;
+				if (warnings) {
+					printk(KERN_WARNING
+					 "rebuild_sched_domains confused:"
+					 " nslot %d, ndoms %d, csn %d, i %d,"
+					 " apn %d\n",
+					 nslot, ndoms, csn, i, apn);
+					warnings--;
+				}
+				continue;
+			}
+
+			cpus_clear(*dp);
+			for (j = i; j < csn; j++) {
+				struct cpuset *b = csa[j];
+
+				if (apn == b->pn) {
+					cpus_or(*dp, *dp, b->cpus_allowed);
+					b->pn = -1;
+				}
+			}
+			nslot++;
+		}
+	}
+	BUG_ON(nslot != ndoms);
+
+rebuild:
+	/* Have scheduler rebuild sched domains */
+	lock_cpu_hotplug();
+	partition_sched_domains(ndoms, doms);
+	unlock_cpu_hotplug();
+
+done:
+	if (q && !IS_ERR(q))
+		kfifo_free(q);
+	kfree(csa);
+	/* Don't kfree(doms) -- partition_sched_domains() does that. */
+}
+
+/*
  * Call with manage_mutex held.  May take callback_mutex during call.
  */
 
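The block comment above describes the partition-finding loops in prose. As a purely illustrative, standalone sketch (not kernel code: plain unsigned long bitmasks stand in for cpumask_t and struct cpuset, and the three toy masks are invented), the same merge-by-partition-number idea can be exercised in userspace:

/*
 * Toy model of the partition-finding step in rebuild_sched_domains():
 * any two "cpusets" whose CPU masks overlap are merged into the same
 * partition number 'pn', then the per-partition unions are printed.
 * These unions correspond to the sched domains handed to
 * partition_sched_domains() in the real code.
 */
#include <stdio.h>

struct toy_cpuset {
	unsigned long cpus;	/* stand-in for cpus_allowed */
	int pn;			/* partition number */
};

int main(void)
{
	/* three load-balanced "cpusets": CPUs {0,1}, {1,2} and {6,7} */
	struct toy_cpuset csa[] = {
		{ 0x03, 0 }, { 0x06, 1 }, { 0xc0, 2 },
	};
	int csn = sizeof(csa) / sizeof(csa[0]);
	int ndoms = csn;
	int i, j, k;

restart:
	for (i = 0; i < csn; i++) {
		for (j = 0; j < csn; j++) {
			if (csa[i].pn != csa[j].pn &&
			    (csa[i].cpus & csa[j].cpus)) {
				/* overlap: fold j's partition into i's */
				int bpn = csa[j].pn;
				for (k = 0; k < csn; k++)
					if (csa[k].pn == bpn)
						csa[k].pn = csa[i].pn;
				ndoms--;	/* one less element */
				goto restart;
			}
		}
	}

	/* Union the masks of each surviving partition number. */
	printf("%d sched domain(s):\n", ndoms);
	for (i = 0; i < csn; i++) {
		unsigned long dom = 0;
		int apn = csa[i].pn;

		if (apn < 0)
			continue;
		for (j = i; j < csn; j++) {
			if (csa[j].pn == apn) {
				dom |= csa[j].cpus;
				csa[j].pn = -1;	/* consumed */
			}
		}
		printf("  domain cpumask 0x%lx\n", dom);
	}
	return 0;
}

With the toy masks above, the first two cpusets overlap on CPU 1 and collapse into one partition, so the program reports two domains, 0x7 and 0xc0, which is exactly how overlapping load-balanced cpusets end up sharing a single sched domain.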
@@ -489,6 +701,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
 	int retval;
+	int cpus_changed, is_load_balanced;
 
 	/* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
 	if (cs == &top_cpuset)
@@ -516,9 +729,17 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 	retval = validate_change(cs, &trialcs);
 	if (retval < 0)
 		return retval;
+
+	cpus_changed = !cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+	is_load_balanced = is_sched_load_balance(&trialcs);
+
 	mutex_lock(&callback_mutex);
 	cs->cpus_allowed = trialcs.cpus_allowed;
 	mutex_unlock(&callback_mutex);
+
+	if (cpus_changed && is_load_balanced)
+		rebuild_sched_domains();
+
 	return 0;
 }
 
@@ -752,6 +973,7 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
+ *				CS_SCHED_LOAD_BALANCE,
  *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
  *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
@@ -765,6 +987,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	int turning_on;
 	struct cpuset trialcs;
 	int err;
+	int cpus_nonempty, balance_flag_changed;
 
 	turning_on = (simple_strtoul(buf, NULL, 10) != 0);
 
@@ -777,10 +1000,18 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 	err = validate_change(cs, &trialcs);
 	if (err < 0)
 		return err;
+
+	cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
+	balance_flag_changed = (is_sched_load_balance(cs) !=
+					is_sched_load_balance(&trialcs));
+
 	mutex_lock(&callback_mutex);
 	cs->flags = trialcs.flags;
 	mutex_unlock(&callback_mutex);
 
+	if (cpus_nonempty && balance_flag_changed)
+		rebuild_sched_domains();
+
 	return 0;
 }
 
@@ -928,6 +1159,7 @@ typedef enum {
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
+	FILE_SCHED_LOAD_BALANCE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
 	FILE_SPREAD_PAGE,
@@ -946,7 +1178,7 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 	int retval = 0;
 
 	/* Crude upper limit on largest legitimate cpulist user might write. */
-	if (nbytes > 100 + 6 * max(NR_CPUS, MAX_NUMNODES))
+	if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
 		return -E2BIG;
 
 	/* +1 for nul-terminator */
@@ -979,6 +1211,9 @@ static ssize_t cpuset_common_file_write(struct cgroup *cont,
 	case FILE_MEM_EXCLUSIVE:
 		retval = update_flag(CS_MEM_EXCLUSIVE, cs, buffer);
 		break;
+	case FILE_SCHED_LOAD_BALANCE:
+		retval = update_flag(CS_SCHED_LOAD_BALANCE, cs, buffer);
+		break;
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
@@ -1074,6 +1309,9 @@ static ssize_t cpuset_common_file_read(struct cgroup *cont,
 	case FILE_MEM_EXCLUSIVE:
 		*s++ = is_mem_exclusive(cs) ? '1' : '0';
 		break;
+	case FILE_SCHED_LOAD_BALANCE:
+		*s++ = is_sched_load_balance(cs) ? '1' : '0';
+		break;
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
@@ -1137,6 +1375,13 @@ static struct cftype cft_mem_exclusive = {
 	.private = FILE_MEM_EXCLUSIVE,
 };
 
+static struct cftype cft_sched_load_balance = {
+	.name = "sched_load_balance",
+	.read = cpuset_common_file_read,
+	.write = cpuset_common_file_write,
+	.private = FILE_SCHED_LOAD_BALANCE,
+};
+
 static struct cftype cft_memory_migrate = {
 	.name = "memory_migrate",
 	.read = cpuset_common_file_read,
@@ -1186,6 +1431,8 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
 		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_memory_migrate)) < 0)
 		return err;
+	if ((err = cgroup_add_file(cont, ss, &cft_sched_load_balance)) < 0)
+		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_memory_pressure)) < 0)
 		return err;
 	if ((err = cgroup_add_file(cont, ss, &cft_spread_page)) < 0)
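For context, this is how the new per-cpuset file registered above might be driven from userspace. It is a hypothetical example, not part of the patch: the /dev/cpuset mount point and the child cpuset named "batch" are assumptions.

/*
 * Hypothetical userspace example: toggle the sched_load_balance file.
 * Assumes the cpuset filesystem is mounted at /dev/cpuset and that a
 * child cpuset named "batch" already exists.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>

static int write_flag(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return -1;
	}
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}

int main(void)
{
	/* Stop load balancing across the whole machine... */
	write_flag("/dev/cpuset/sched_load_balance", "0");
	/* ...and keep balancing only within the "batch" cpuset. */
	write_flag("/dev/cpuset/batch/sched_load_balance", "1");
	return 0;
}

Turning the flag off at the top level while leaving it on in a smaller child cpuset is the usage pattern the patch is aimed at: the scheduler then load balances only within the child's CPUs instead of across all online CPUs.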
@@ -1267,6 +1514,7 @@ static struct cgroup_subsys_state *cpuset_create(
 		set_bit(CS_SPREAD_PAGE, &cs->flags);
 	if (is_spread_slab(parent))
 		set_bit(CS_SPREAD_SLAB, &cs->flags);
+	set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	cs->mems_generation = cpuset_mems_generation++;
@@ -1277,11 +1525,27 @@ static struct cgroup_subsys_state *cpuset_create(
 	return &cs->css ;
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed has its flag 'sched_load_balance'
+ * enabled, then simulate turning sched_load_balance off, which
+ * will call rebuild_sched_domains().  The lock_cpu_hotplug()
+ * call in rebuild_sched_domains() must not be made while holding
+ * callback_mutex.  Elsewhere the kernel nests callback_mutex inside
+ * lock_cpu_hotplug() calls.  So the reverse nesting would risk an
+ * ABBA deadlock.
+ */
+
 static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
 {
 	struct cpuset *cs = cgroup_cs(cont);
 
 	cpuset_update_task_memory_state();
+
+	if (is_sched_load_balance(cs))
+		update_flag(CS_SCHED_LOAD_BALANCE, cs, "0");
+
 	number_of_cpusets--;
 	kfree(cs);
 }
@@ -1326,6 +1590,7 @@ int __init cpuset_init(void)
 
 	fmeter_init(&top_cpuset.fmeter);
 	top_cpuset.mems_generation = cpuset_mems_generation++;
+	set_bit(CS_SCHED_LOAD_BALANCE, &top_cpuset.flags);
 
 	err = register_filesystem(&cpuset_fs_type);
 	if (err < 0)
@@ -1412,8 +1677,8 @@ static void common_cpu_mem_hotplug_unplug(void)
  * cpu_online_map on each CPU hotplug (cpuhp) event.
  */
 
-static int cpuset_handle_cpuhp(struct notifier_block *nb,
-				unsigned long phase, void *cpu)
+static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
+				unsigned long phase, void *unused_cpu)
 {
 	if (phase == CPU_DYING || phase == CPU_DYING_FROZEN)
 		return NOTIFY_DONE;
@@ -1803,7 +2068,7 @@ void __cpuset_memory_pressure_bump(void)
  * the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
  * cpuset to top_cpuset.
  */
-static int proc_cpuset_show(struct seq_file *m, void *v)
+static int proc_cpuset_show(struct seq_file *m, void *unused_v)
 {
 	struct pid *pid;
 	struct task_struct *tsk;