path: root/kernel/cpuset.c
Diffstat (limited to 'kernel/cpuset.c')
 -rw-r--r--  kernel/cpuset.c |  204
 1 file changed, 157 insertions(+), 47 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 00e8f2575512..8ab1b4e518b8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -228,13 +228,7 @@ static struct dentry_operations cpuset_dops = {
 
 static struct dentry *cpuset_get_dentry(struct dentry *parent, const char *name)
 {
-        struct qstr qstr;
-        struct dentry *d;
-
-        qstr.name = name;
-        qstr.len = strlen(name);
-        qstr.hash = full_name_hash(name, qstr.len);
-        d = lookup_hash(&qstr, parent);
+        struct dentry *d = lookup_one_len(name, parent, strlen(name));
         if (!IS_ERR(d))
                 d->d_op = &cpuset_dops;
         return d;
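
The hunk above collapses the open-coded qstr setup and lookup_hash() call into lookup_one_len(), which hashes the name and performs the lookup in one step. A minimal sketch of the pattern (my illustration, not part of the patch; lookup_one_len() requires the parent directory's i_sem to be held, which the cpuset callers arrange):

        /* lookup_one_len() computes the hash itself and returns either
         * a valid dentry or an ERR_PTR() value -- never NULL. */
        struct dentry *d = lookup_one_len(name, parent, strlen(name));
        if (IS_ERR(d))
                return PTR_ERR(d);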
@@ -404,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen)
  * to continue to serve a useful existence. Next time it's released,
  * we will get notified again, if it still has 'notify_on_release' set.
  *
- * Note final arg to call_usermodehelper() is 0 - that means
- * don't wait. Since we are holding the global cpuset_sem here,
- * and we are asking another thread (started from keventd) to rmdir a
- * cpuset, we can't wait - or we'd deadlock with the removing thread
- * on cpuset_sem.
+ * The final arg to call_usermodehelper() is 0, which means don't
+ * wait. The separate /sbin/cpuset_release_agent task is forked by
+ * call_usermodehelper(), then control in this thread returns here,
+ * without waiting for the release agent task. We don't bother to
+ * wait because the caller of this routine has no use for the exit
+ * status of the /sbin/cpuset_release_agent task, so no sense holding
+ * our caller up for that.
+ *
+ * The simple act of forking that task might require more memory,
+ * which might need cpuset_sem. So this routine must be called while
+ * cpuset_sem is not held, to avoid a possible deadlock. See also
+ * comments for check_for_release(), below.
  */
 
-static int cpuset_release_agent(char *cpuset_str)
+static void cpuset_release_agent(const char *pathbuf)
 {
         char *argv[3], *envp[3];
         int i;
 
+        if (!pathbuf)
+                return;
+
         i = 0;
         argv[i++] = "/sbin/cpuset_release_agent";
-        argv[i++] = cpuset_str;
+        argv[i++] = (char *)pathbuf;
         argv[i] = NULL;
 
         i = 0;
@@ -427,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str)
         envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
         envp[i] = NULL;
 
-        return call_usermodehelper(argv[0], argv, envp, 0);
+        call_usermodehelper(argv[0], argv, envp, 0);
+        kfree(pathbuf);
 }
 
 /*
  * Either cs->count of using tasks transitioned to zero, or the
  * cs->children list of child cpusets just became empty. If this
  * cs is notify_on_release() and now both the user count is zero and
- * the list of children is empty, send notice to user land.
+ * the list of children is empty, prepare cpuset path in a kmalloc'd
+ * buffer, to be returned via ppathbuf, so that the caller can invoke
+ * cpuset_release_agent() with it later on, once cpuset_sem is dropped.
+ * Call here with cpuset_sem held.
+ *
+ * This check_for_release() routine is responsible for kmalloc'ing
+ * pathbuf. The above cpuset_release_agent() is responsible for
+ * kfree'ing pathbuf. The caller of these routines is responsible
+ * for providing a pathbuf pointer, initialized to NULL, then
+ * calling check_for_release() with cpuset_sem held and the address
+ * of the pathbuf pointer, then dropping cpuset_sem, then calling
+ * cpuset_release_agent() with pathbuf, as set by check_for_release().
  */
 
-static void check_for_release(struct cpuset *cs)
+static void check_for_release(struct cpuset *cs, char **ppathbuf)
 {
         if (notify_on_release(cs) && atomic_read(&cs->count) == 0 &&
             list_empty(&cs->children)) {
@@ -447,10 +463,9 @@ static void check_for_release(struct cpuset *cs)
                 if (!buf)
                         return;
                 if (cpuset_path(cs, buf, PAGE_SIZE) < 0)
-                        goto out;
-                cpuset_release_agent(buf);
-out:
-                kfree(buf);
+                        kfree(buf);
+                else
+                        *ppathbuf = buf;
         }
 }
 
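Taken together, these comments define a small hand-off protocol: the caller owns a NULL-initialized pathbuf pointer, check_for_release() may point it at a kmalloc'd cpuset path while cpuset_sem is held, and cpuset_release_agent() consumes and kfrees it once the lock is dropped. Every call site in this patch follows the same shape; schematically (sketch distilled from the cpuset_exit() hunk below):

        char *pathbuf = NULL;

        down(&cpuset_sem);
        if (atomic_dec_and_test(&cs->count))
                check_for_release(cs, &pathbuf);  /* may kmalloc and set pathbuf */
        up(&cpuset_sem);                /* drop the lock first ... */
        cpuset_release_agent(pathbuf);  /* ... then fork the agent; no-op if
                                         * pathbuf is NULL, kfrees it otherwise */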
@@ -601,10 +616,75 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
         return 0;
 }
 
+/*
+ * For a given cpuset cur, partition the system as follows
+ * a. All cpus in the parent cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * b. All cpus in the current cpuset's cpus_allowed that are not part of any
+ *    exclusive child cpusets
+ * Build these two partitions by calling partition_sched_domains
+ *
+ * Call with cpuset_sem held. May nest a call to the
+ * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ */
+
+/*
+ * Hack to avoid 2.6.13 partial node dynamic sched domain bug.
+ * Disable letting 'cpu_exclusive' cpusets define dynamic sched
+ * domains, until the sched domain can handle partial nodes.
+ * Remove this #if hackery when sched domains fixed.
+ */
+#if 0
+static void update_cpu_domains(struct cpuset *cur)
+{
+        struct cpuset *c, *par = cur->parent;
+        cpumask_t pspan, cspan;
+
+        if (par == NULL || cpus_empty(cur->cpus_allowed))
+                return;
+
+        /*
+         * Get all cpus from parent's cpus_allowed not part of exclusive
+         * children
+         */
+        pspan = par->cpus_allowed;
+        list_for_each_entry(c, &par->children, sibling) {
+                if (is_cpu_exclusive(c))
+                        cpus_andnot(pspan, pspan, c->cpus_allowed);
+        }
+        if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+                cpus_or(pspan, pspan, cur->cpus_allowed);
+                if (cpus_equal(pspan, cur->cpus_allowed))
+                        return;
+                cspan = CPU_MASK_NONE;
+        } else {
+                if (cpus_empty(pspan))
+                        return;
+                cspan = cur->cpus_allowed;
+                /*
+                 * Get all cpus from current cpuset's cpus_allowed not part
+                 * of exclusive children
+                 */
+                list_for_each_entry(c, &cur->children, sibling) {
+                        if (is_cpu_exclusive(c))
+                                cpus_andnot(cspan, cspan, c->cpus_allowed);
+                }
+        }
+
+        lock_cpu_hotplug();
+        partition_sched_domains(&pspan, &cspan);
+        unlock_cpu_hotplug();
+}
+#else
+static void update_cpu_domains(struct cpuset *cur)
+{
+}
+#endif
+
 static int update_cpumask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
-        int retval;
+        int retval, cpus_unchanged;
 
         trialcs = *cs;
         retval = cpulist_parse(buf, trialcs.cpus_allowed);
@@ -614,9 +694,13 @@ static int update_cpumask(struct cpuset *cs, char *buf)
         if (cpus_empty(trialcs.cpus_allowed))
                 return -ENOSPC;
         retval = validate_change(cs, &trialcs);
-        if (retval == 0)
-                cs->cpus_allowed = trialcs.cpus_allowed;
-        return retval;
+        if (retval < 0)
+                return retval;
+        cpus_unchanged = cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed);
+        cs->cpus_allowed = trialcs.cpus_allowed;
+        if (is_cpu_exclusive(cs) && !cpus_unchanged)
+                update_cpu_domains(cs);
+        return 0;
 }
 
 static int update_nodemask(struct cpuset *cs, char *buf)
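
To make the partition rule in the update_cpu_domains() comment above concrete, consider a hypothetical CPU layout (illustrative only, not taken from the patch): the parent cpuset allows cpus 0-7, one exclusive sibling owns cpus 0-1, and cur is cpu_exclusive with cpus 4-5 and no exclusive children of its own. With the #if 0 hack lifted, the masks would work out as:

        /*
         * pspan: start from par->cpus_allowed (0-7), then remove every
         * exclusive child's cpus -- the sibling's 0-1 and cur's own 4-5:
         *        pspan = {2,3,6,7}
         *
         * cur is cpu_exclusive and not removed, so:
         *        cspan = cur->cpus_allowed = {4,5}
         *
         * partition_sched_domains(&pspan, &cspan) then rebuilds one
         * sched domain over cpus {2,3,6,7} and another over {4,5}.
         */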
@@ -652,7 +736,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 {
         int turning_on;
         struct cpuset trialcs;
-        int err;
+        int err, cpu_exclusive_changed;
 
         turning_on = (simple_strtoul(buf, NULL, 10) != 0);
 
@@ -663,23 +747,28 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
         clear_bit(bit, &trialcs.flags);
 
         err = validate_change(cs, &trialcs);
-        if (err == 0) {
-                if (turning_on)
-                        set_bit(bit, &cs->flags);
-                else
-                        clear_bit(bit, &cs->flags);
-        }
-        return err;
+        if (err < 0)
+                return err;
+        cpu_exclusive_changed =
+                (is_cpu_exclusive(cs) != is_cpu_exclusive(&trialcs));
+        if (turning_on)
+                set_bit(bit, &cs->flags);
+        else
+                clear_bit(bit, &cs->flags);
+
+        if (cpu_exclusive_changed)
+                update_cpu_domains(cs);
+        return 0;
 }
 
-static int attach_task(struct cpuset *cs, char *buf)
+static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 {
         pid_t pid;
         struct task_struct *tsk;
         struct cpuset *oldcs;
         cpumask_t cpus;
 
-        if (sscanf(buf, "%d", &pid) != 1)
+        if (sscanf(pidbuf, "%d", &pid) != 1)
                 return -EIO;
         if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
                 return -ENOSPC;
@@ -722,7 +811,7 @@ static int attach_task(struct cpuset *cs, char *buf)
 
         put_task_struct(tsk);
         if (atomic_dec_and_test(&oldcs->count))
-                check_for_release(oldcs);
+                check_for_release(oldcs, ppathbuf);
         return 0;
 }
 
@@ -746,6 +835,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
         struct cftype *cft = __d_cft(file->f_dentry);
         cpuset_filetype_t type = cft->private;
         char *buffer;
+        char *pathbuf = NULL;
         int retval = 0;
 
         /* Crude upper limit on largest legitimate cpulist user might write. */
@@ -786,7 +876,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
                 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
                 break;
         case FILE_TASKLIST:
-                retval = attach_task(cs, buffer);
+                retval = attach_task(cs, buffer, &pathbuf);
                 break;
         default:
                 retval = -EINVAL;
@@ -797,6 +887,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
         retval = nbytes;
 out2:
         up(&cpuset_sem);
+        cpuset_release_agent(pathbuf);
 out1:
         kfree(buffer);
         return retval;
@@ -1302,6 +1393,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
         struct cpuset *cs = dentry->d_fsdata;
         struct dentry *d;
         struct cpuset *parent;
+        char *pathbuf = NULL;
 
         /* the vfs holds both inode->i_sem already */
 
@@ -1315,18 +1407,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
                 up(&cpuset_sem);
                 return -EBUSY;
         }
-        spin_lock(&cs->dentry->d_lock);
         parent = cs->parent;
         set_bit(CS_REMOVED, &cs->flags);
+        if (is_cpu_exclusive(cs))
+                update_cpu_domains(cs);
         list_del(&cs->sibling);  /* delete my sibling from parent->children */
         if (list_empty(&parent->children))
-                check_for_release(parent);
+                check_for_release(parent, &pathbuf);
+        spin_lock(&cs->dentry->d_lock);
         d = dget(cs->dentry);
         cs->dentry = NULL;
         spin_unlock(&d->d_lock);
         cpuset_d_remove_dir(d);
         dput(d);
         up(&cpuset_sem);
+        cpuset_release_agent(pathbuf);
         return 0;
 }
 
@@ -1383,10 +1478,10 @@ void __init cpuset_init_smp(void)
 
 /**
  * cpuset_fork - attach newly forked task to its parents cpuset.
- * @p: pointer to task_struct of forking parent process.
+ * @tsk: pointer to task_struct of forking parent process.
  *
  * Description: By default, on fork, a task inherits its
- * parents cpuset. The pointer to the shared cpuset is
+ * parent's cpuset. The pointer to the shared cpuset is
  * automatically copied in fork.c by dup_task_struct().
  * This cpuset_fork() routine need only increment the usage
  * counter in that cpuset.
@@ -1414,7 +1509,6 @@ void cpuset_fork(struct task_struct *tsk)
  * by the cpuset_sem semaphore. If you don't hold cpuset_sem,
  * then a zero cpuset use count is a license to any other task to
  * nuke the cpuset immediately.
- *
  **/
 
 void cpuset_exit(struct task_struct *tsk)
@@ -1427,10 +1521,13 @@ void cpuset_exit(struct task_struct *tsk)
         task_unlock(tsk);
 
         if (notify_on_release(cs)) {
+                char *pathbuf = NULL;
+
                 down(&cpuset_sem);
                 if (atomic_dec_and_test(&cs->count))
-                        check_for_release(cs);
+                        check_for_release(cs, &pathbuf);
                 up(&cpuset_sem);
+                cpuset_release_agent(pathbuf);
         } else {
                 atomic_dec(&cs->count);
         }
@@ -1464,7 +1561,9 @@ void cpuset_init_current_mems_allowed(void)
         current->mems_allowed = NODE_MASK_ALL;
 }
 
-/*
+/**
+ * cpuset_update_current_mems_allowed - update mems parameters to new values
+ *
  * If the current tasks cpusets mems_allowed changed behind our backs,
  * update current->mems_allowed and mems_generation to the new value.
  * Do not call this routine if in_interrupt().
@@ -1483,13 +1582,20 @@ void cpuset_update_current_mems_allowed(void)
         }
 }
 
+/**
+ * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
+ * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
+ */
 void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
 {
         bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
                         MAX_NUMNODES);
 }
 
-/*
+/**
+ * cpuset_zonelist_valid_mems_allowed - check zonelist vs. current mems_allowed
+ * @zl: the zonelist to be checked
+ *
  * Are any of the nodes on zonelist zl allowed in current->mems_allowed?
  */
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
@@ -1505,8 +1611,12 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
         return 0;
 }
 
-/*
- * Is 'current' valid, and is zone z allowed in current->mems_allowed?
+/**
+ * cpuset_zone_allowed - is zone z allowed in current->mems_allowed
+ * @z: zone in question
+ *
+ * Is zone z allowed in current->mems_allowed, or is
+ * the CPU in interrupt context? (zone is always allowed in this case)
  */
 int cpuset_zone_allowed(struct zone *z)
 {