diff options
author | Paul Jackson <pj@sgi.com> | 2006-01-08 04:01:59 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-01-08 23:13:44 -0500 |
commit | 4225399a66b315d4d1fb1cb61b75dda201c832e3 (patch) | |
tree | c8bd976bc6590c5fe859c6129abb93072d99cfa8 /kernel | |
parent | 202f72d5d1b5c2c084f63ef996c736d208b447b5 (diff) |
[PATCH] cpuset: rebind vma mempolicies fix
Fix more of a longstanding bug in cpuset/mempolicy interaction.
NUMA mempolicies (mm/mempolicy.c) are constrained by the current task's cpuset
to just the Memory Nodes allowed by that cpuset. The kernel maintains
internal state for each mempolicy, tracking what nodes are used for the
MPOL_INTERLEAVE, MPOL_BIND or MPOL_PREFERRED policies.
When a task's cpuset memory placement changes, whether because the cpuset
changed, or because the task was attached to a different cpuset, then the
task's mempolicies have to be rebound to the new cpuset placement, so as to
preserve the cpuset-relative numbering of the nodes in that policy.
An earlier fix handled such mempolicy rebinding for mempolicies attached to a
task.
This fix rebinds mempolicies attached to vma's (address ranges in a task's
address space). Due to the need to hold the task->mm->mmap_sem semaphore while
updating vma's, the rebinding of vma mempolicies has to be done when the
cpuset memory placement is changed, at which time mmap_sem can be safely
acquired. The task's mempolicy is rebound later, when the task next attempts
to allocate memory and notices that its task->cpuset_mems_generation is
out-of-date with its cpuset's mems_generation.
Because walking the tasklist to find all tasks attached to a changing cpuset
requires holding tasklist_lock, a spinlock, one cannot update the vma's of the
affected tasks while doing the tasklist scan. In general, one cannot acquire
a semaphore (which can sleep) while already holding a spinlock (such as
tasklist_lock). So a list of mm references has to be built up during the
tasklist scan, then the tasklist lock dropped, then for each mm, its mmap_sem
acquired, and the vma's in that mm rebound.
Once the tasklist lock is dropped, affected tasks may fork new tasks, before
their mm's are rebound. A kernel global 'cpuset_being_rebound' is set to
point to the cpuset being rebound (there can only be one; cpuset modifications
are done under a global 'manage_sem' semaphore), and the mpol_copy code that
is used to copy a task's mempolicies during fork catches such forking tasks,
and ensures their children are also rebound.
When a task is moved to a different cpuset, it is easier, as there is only one
task involved. Its mm->vma's are scanned, using the same
mpol_rebind_policy() as used above.
It may happen that both the mpol_copy hook and the update done via the
tasklist scan update the same mm twice. This is ok, as the mempolicies of
each vma in an mm keep track of what mems_allowed they are relative to, and
safely no-op a second request to rebind to the same nodes.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 90 |
1 files changed, 90 insertions, 0 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26ee..19f87565be17 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf) | |||
812 | } | 812 | } |
813 | 813 | ||
814 | /* | 814 | /* |
815 | * Handle user request to change the 'mems' memory placement | ||
816 | * of a cpuset. Needs to validate the request, update the | ||
817 | * cpusets mems_allowed and mems_generation, and for each | ||
818 | * task in the cpuset, rebind any vma mempolicies. | ||
819 | * | ||
815 | * Call with manage_sem held. May take callback_sem during call. | 820 | * Call with manage_sem held. May take callback_sem during call. |
821 | * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, | ||
822 | * lock each such tasks mm->mmap_sem, scan its vma's and rebind | ||
823 | * their mempolicies to the cpusets new mems_allowed. | ||
816 | */ | 824 | */ |
817 | 825 | ||
818 | static int update_nodemask(struct cpuset *cs, char *buf) | 826 | static int update_nodemask(struct cpuset *cs, char *buf) |
819 | { | 827 | { |
820 | struct cpuset trialcs; | 828 | struct cpuset trialcs; |
829 | struct task_struct *g, *p; | ||
830 | struct mm_struct **mmarray; | ||
831 | int i, n, ntasks; | ||
832 | int fudge; | ||
821 | int retval; | 833 | int retval; |
822 | 834 | ||
823 | trialcs = *cs; | 835 | trialcs = *cs; |
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
839 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 851 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
840 | up(&callback_sem); | 852 | up(&callback_sem); |
841 | 853 | ||
854 | set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ | ||
855 | |||
856 | fudge = 10; /* spare mmarray[] slots */ | ||
857 | fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ | ||
858 | retval = -ENOMEM; | ||
859 | |||
860 | /* | ||
861 | * Allocate mmarray[] to hold mm reference for each task | ||
862 | * in cpuset cs. Can't kmalloc GFP_KERNEL while holding | ||
863 | * tasklist_lock. We could use GFP_ATOMIC, but with a | ||
864 | * few more lines of code, we can retry until we get a big | ||
865 | * enough mmarray[] w/o using GFP_ATOMIC. | ||
866 | */ | ||
867 | while (1) { | ||
868 | ntasks = atomic_read(&cs->count); /* guess */ | ||
869 | ntasks += fudge; | ||
870 | mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); | ||
871 | if (!mmarray) | ||
872 | goto done; | ||
873 | write_lock_irq(&tasklist_lock); /* block fork */ | ||
874 | if (atomic_read(&cs->count) <= ntasks) | ||
875 | break; /* got enough */ | ||
876 | write_unlock_irq(&tasklist_lock); /* try again */ | ||
877 | kfree(mmarray); | ||
878 | } | ||
879 | |||
880 | n = 0; | ||
881 | |||
882 | /* Load up mmarray[] with mm reference for each task in cpuset. */ | ||
883 | do_each_thread(g, p) { | ||
884 | struct mm_struct *mm; | ||
885 | |||
886 | if (n >= ntasks) { | ||
887 | printk(KERN_WARNING | ||
888 | "Cpuset mempolicy rebind incomplete.\n"); | ||
889 | continue; | ||
890 | } | ||
891 | if (p->cpuset != cs) | ||
892 | continue; | ||
893 | mm = get_task_mm(p); | ||
894 | if (!mm) | ||
895 | continue; | ||
896 | mmarray[n++] = mm; | ||
897 | } while_each_thread(g, p); | ||
898 | write_unlock_irq(&tasklist_lock); | ||
899 | |||
900 | /* | ||
901 | * Now that we've dropped the tasklist spinlock, we can | ||
902 | * rebind the vma mempolicies of each mm in mmarray[] to their | ||
903 | * new cpuset, and release that mm. The mpol_rebind_mm() | ||
904 | * call takes mmap_sem, which we couldn't take while holding | ||
905 | * tasklist_lock. Forks can happen again now - the mpol_copy() | ||
906 | * cpuset_being_rebound check will catch such forks, and rebind | ||
907 | * their vma mempolicies too. Because we still hold the global | ||
908 | * cpuset manage_sem, we know that no other rebind effort will | ||
909 | * be contending for the global variable cpuset_being_rebound. | ||
910 | * It's ok if we rebind the same mm twice; mpol_rebind_mm() | ||
911 | * is idempotent. | ||
912 | */ | ||
913 | for (i = 0; i < n; i++) { | ||
914 | struct mm_struct *mm = mmarray[i]; | ||
915 | |||
916 | mpol_rebind_mm(mm, &cs->mems_allowed); | ||
917 | mmput(mm); | ||
918 | } | ||
919 | |||
920 | /* We're done rebinding vma's to this cpusets new mems_allowed. */ | ||
921 | kfree(mmarray); | ||
922 | set_cpuset_being_rebound(NULL); | ||
923 | retval = 0; | ||
842 | done: | 924 | done: |
843 | return retval; | 925 | return retval; |
844 | } | 926 | } |
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1011 | struct cpuset *oldcs; | 1093 | struct cpuset *oldcs; |
1012 | cpumask_t cpus; | 1094 | cpumask_t cpus; |
1013 | nodemask_t from, to; | 1095 | nodemask_t from, to; |
1096 | struct mm_struct *mm; | ||
1014 | 1097 | ||
1015 | if (sscanf(pidbuf, "%d", &pid) != 1) | 1098 | if (sscanf(pidbuf, "%d", &pid) != 1) |
1016 | return -EIO; | 1099 | return -EIO; |
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) | |||
1060 | to = cs->mems_allowed; | 1143 | to = cs->mems_allowed; |
1061 | 1144 | ||
1062 | up(&callback_sem); | 1145 | up(&callback_sem); |
1146 | |||
1147 | mm = get_task_mm(tsk); | ||
1148 | if (mm) { | ||
1149 | mpol_rebind_mm(mm, &to); | ||
1150 | mmput(mm); | ||
1151 | } | ||
1152 | |||
1063 | if (is_memory_migrate(cs)) | 1153 | if (is_memory_migrate(cs)) |
1064 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); | 1154 | do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); |
1065 | put_task_struct(tsk); | 1155 | put_task_struct(tsk); |