Diffstat (limited to 'kernel')
-rw-r--r--	kernel/cpuset.c	| 90 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 90 insertions(+), 0 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6004719f26ee..19f87565be17 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -812,12 +812,24 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpuset's mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies.
+ *
  * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such task's mm->mmap_sem, scan its vmas and rebind
+ * their mempolicies to the cpuset's new mems_allowed.
  */
 
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
         struct cpuset trialcs;
+        struct task_struct *g, *p;
+        struct mm_struct **mmarray;
+        int i, n, ntasks;
+        int fudge;
         int retval;
 
         trialcs = *cs;
@@ -839,6 +851,76 @@ static int update_nodemask(struct cpuset *cs, char *buf)
         cs->mems_generation = atomic_read(&cpuset_mems_generation);
         up(&callback_sem);
 
+        set_cpuset_being_rebound(cs);   /* causes mpol_copy() rebind */
+
+        fudge = 10;                             /* spare mmarray[] slots */
+        fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
+        retval = -ENOMEM;
+
+        /*
+         * Allocate mmarray[] to hold mm reference for each task
+         * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+         * tasklist_lock.  We could use GFP_ATOMIC, but with a
+         * few more lines of code, we can retry until we get a big
+         * enough mmarray[] w/o using GFP_ATOMIC.
+         */
+        while (1) {
+                ntasks = atomic_read(&cs->count);       /* guess */
+                ntasks += fudge;
+                mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+                if (!mmarray)
+                        goto done;
+                write_lock_irq(&tasklist_lock);         /* block fork */
+                if (atomic_read(&cs->count) <= ntasks)
+                        break;                          /* got enough */
+                write_unlock_irq(&tasklist_lock);       /* try again */
+                kfree(mmarray);
+        }
+
+        n = 0;
+
+        /* Load up mmarray[] with mm reference for each task in cpuset. */
+        do_each_thread(g, p) {
+                struct mm_struct *mm;
+
+                if (n >= ntasks) {
+                        printk(KERN_WARNING
+                                "Cpuset mempolicy rebind incomplete.\n");
+                        continue;
+                }
+                if (p->cpuset != cs)
+                        continue;
+                mm = get_task_mm(p);
+                if (!mm)
+                        continue;
+                mmarray[n++] = mm;
+        } while_each_thread(g, p);
+        write_unlock_irq(&tasklist_lock);
+
+        /*
+         * Now that we've dropped the tasklist spinlock, we can
+         * rebind the vma mempolicies of each mm in mmarray[] to their
+         * new cpuset, and release that mm.  The mpol_rebind_mm()
+         * call takes mmap_sem, which we couldn't take while holding
+         * tasklist_lock.  Forks can happen again now - the mpol_copy()
+         * cpuset_being_rebound check will catch such forks, and rebind
+         * their vma mempolicies too.  Because we still hold the global
+         * cpuset manage_sem, we know that no other rebind effort will
+         * be contending for the global variable cpuset_being_rebound.
+         * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+         * is idempotent.
+         */
+        for (i = 0; i < n; i++) {
+                struct mm_struct *mm = mmarray[i];
+
+                mpol_rebind_mm(mm, &cs->mems_allowed);
+                mmput(mm);
+        }
+
+        /* We're done rebinding vmas to this cpuset's new mems_allowed. */
+        kfree(mmarray);
+        set_cpuset_being_rebound(NULL);
+        retval = 0;
 done:
         return retval;
 }
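
The retry loop in this hunk is the interesting part: mmarray[] must be sized while tasklist_lock blocks fork, but kmalloc(GFP_KERNEL) may sleep and so cannot run under that spinlock, hence "allocate unlocked, lock, verify the guess, else unlock and retry". Below is a minimal user-space sketch of that same pattern, assuming a pthread mutex and a plain counter as stand-ins for tasklist_lock and cs->count; the names alloc_member_slots and nmembers are invented for illustration, not kernel APIs.

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static int nmembers;                    /* stand-in for cs->count */

/* Returns a big-enough array with list_lock held, or NULL on ENOMEM. */
static void **alloc_member_slots(int *nslots)
{
        void **array;
        int guess;

        for (;;) {
                /* racy read is fine: it is only a guess, re-checked below */
                guess = nmembers + 10;  /* unlocked guess plus fudge */
                array = malloc(guess * sizeof(*array));
                if (!array)
                        return NULL;
                pthread_mutex_lock(&list_lock); /* blocks new members */
                if (nmembers <= guess) {
                        *nslots = guess;        /* guess held up */
                        return array;           /* lock still held */
                }
                pthread_mutex_unlock(&list_lock);
                free(array);            /* grew too fast - try again */
        }
}

As in the kernel loop, the unlocked read is harmless because it only seeds the guess; correctness comes from re-checking the count after the lock is taken, and the fudge slots make a second pass unlikely.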
@@ -1011,6 +1093,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         struct cpuset *oldcs;
         cpumask_t cpus;
         nodemask_t from, to;
+        struct mm_struct *mm;
 
         if (sscanf(pidbuf, "%d", &pid) != 1)
                 return -EIO;
@@ -1060,6 +1143,13 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         to = cs->mems_allowed;
 
         up(&callback_sem);
+
+        mm = get_task_mm(tsk);
+        if (mm) {
+                mpol_rebind_mm(mm, &to);
+                mmput(mm);
+        }
+
         if (is_memory_migrate(cs))
                 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
         put_task_struct(tsk);
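
The get_task_mm()/mmput() pairing in this hunk is plain reference counting: a task may have no mm at all (kernel threads), and an mm could otherwise vanish while mpol_rebind_mm() is using it, so it is pinned first and released after. A rough user-space sketch of that get/put discipline follows; struct handle and its helpers are invented stand-ins, not the real mm_struct machinery.

#include <stdatomic.h>
#include <stdlib.h>

struct handle {                         /* stand-in for struct mm_struct */
        atomic_int users;
        /* ... object state ... */
};

/* Like get_task_mm(): pin the object, or pass through NULL. */
static struct handle *handle_get(struct handle *h)
{
        if (h)
                atomic_fetch_add(&h->users, 1);
        return h;
}

/* Like mmput(): drop the pin; the last put frees the object. */
static void handle_put(struct handle *h)
{
        if (atomic_fetch_sub(&h->users, 1) == 1)
                free(h);
}

/* Usage, mirroring the attach_task() hunk above. */
static void rebind_one(struct handle *h)
{
        h = handle_get(h);      /* NULL here is like a task with no mm */
        if (h) {
                /* ... operate on the pinned object ... */
                handle_put(h);
        }
}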