about summary refs log tree commit diff stats
path: root/kernel/cpuset.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/cpuset.c')
-rw-r--r-- kernel/cpuset.c 582
1 files changed, 483 insertions, 99 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
39#include <linux/namei.h> 39#include <linux/namei.h>
40#include <linux/pagemap.h> 40#include <linux/pagemap.h>
41#include <linux/proc_fs.h> 41#include <linux/proc_fs.h>
42#include <linux/rcupdate.h>
42#include <linux/sched.h> 43#include <linux/sched.h>
43#include <linux/seq_file.h> 44#include <linux/seq_file.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
@@ -54,7 +55,23 @@
54#include <asm/atomic.h> 55#include <asm/atomic.h>
55#include <asm/semaphore.h> 56#include <asm/semaphore.h>
56 57
57#define CPUSET_SUPER_MAGIC 0x27e0eb 58#define CPUSET_SUPER_MAGIC 0x27e0eb
59
60/*
61 * Tracks how many cpusets are currently defined in system.
62 * When there is only one cpuset (the root cpuset) we can
63 * short circuit some hooks.
64 */
65int number_of_cpusets __read_mostly;
66
67/* See "Frequency meter" comments, below. */
68
69struct fmeter {
70 int cnt; /* unprocessed events count */
71 int val; /* most recent output value */
72 time_t time; /* clock (secs) when val computed */
73 spinlock_t lock; /* guards read or write of above */
74};
58 75
59struct cpuset { 76struct cpuset {
60 unsigned long flags; /* "unsigned long" so bitops work */ 77 unsigned long flags; /* "unsigned long" so bitops work */
@@ -80,13 +97,16 @@ struct cpuset {
80 * Copy of global cpuset_mems_generation as of the most 97 * Copy of global cpuset_mems_generation as of the most
81 * recent time this cpuset changed its mems_allowed. 98 * recent time this cpuset changed its mems_allowed.
82 */ 99 */
83 int mems_generation; 100 int mems_generation;
101
102 struct fmeter fmeter; /* memory_pressure filter */
84}; 103};
85 104
86/* bits in struct cpuset flags field */ 105/* bits in struct cpuset flags field */
87typedef enum { 106typedef enum {
88 CS_CPU_EXCLUSIVE, 107 CS_CPU_EXCLUSIVE,
89 CS_MEM_EXCLUSIVE, 108 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE,
90 CS_REMOVED, 110 CS_REMOVED,
91 CS_NOTIFY_ON_RELEASE 111 CS_NOTIFY_ON_RELEASE
92} cpuset_flagbits_t; 112} cpuset_flagbits_t;
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs)
112 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 132 return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
113} 133}
114 134
135static inline int is_memory_migrate(const struct cpuset *cs)
136{
137 return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
138}
139
115/* 140/*
116 * Increment this atomic integer every time any cpuset changes its 141 * Increment this atomic integer every time any cpuset changes its
117 * mems_allowed value. Users of cpusets can track this generation 142 * mems_allowed value. Users of cpusets can track this generation
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = {
137 .count = ATOMIC_INIT(0), 162 .count = ATOMIC_INIT(0),
138 .sibling = LIST_HEAD_INIT(top_cpuset.sibling), 163 .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
139 .children = LIST_HEAD_INIT(top_cpuset.children), 164 .children = LIST_HEAD_INIT(top_cpuset.children),
140 .parent = NULL,
141 .dentry = NULL,
142 .mems_generation = 0,
143}; 165};
144 166
145static struct vfsmount *cpuset_mount; 167static struct vfsmount *cpuset_mount;
146static struct super_block *cpuset_sb = NULL; 168static struct super_block *cpuset_sb;
147 169
148/* 170/*
149 * We have two global cpuset semaphores below. They can nest. 171 * We have two global cpuset semaphores below. They can nest.
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL;
227 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock 249 * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
228 * (task->alloc_lock) already in the task_struct routinely used for 250 * (task->alloc_lock) already in the task_struct routinely used for
229 * such matters. 251 * such matters.
252 *
253 * P.S. One more locking exception. RCU is used to guard the
254 * update of a tasks cpuset pointer by attach_task() and the
255 * access of task->cpuset->mems_generation via that pointer in
256 * the routine cpuset_update_task_memory_state().
230 */ 257 */
231 258
232static DECLARE_MUTEX(manage_sem); 259static DECLARE_MUTEX(manage_sem);
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
304 spin_lock(&dcache_lock); 331 spin_lock(&dcache_lock);
305 node = dentry->d_subdirs.next; 332 node = dentry->d_subdirs.next;
306 while (node != &dentry->d_subdirs) { 333 while (node != &dentry->d_subdirs) {
307 struct dentry *d = list_entry(node, struct dentry, d_child); 334 struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
308 list_del_init(node); 335 list_del_init(node);
309 if (d->d_inode) { 336 if (d->d_inode) {
310 d = dget_locked(d); 337 d = dget_locked(d);
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
316 } 343 }
317 node = dentry->d_subdirs.next; 344 node = dentry->d_subdirs.next;
318 } 345 }
319 list_del_init(&dentry->d_child); 346 list_del_init(&dentry->d_u.d_child);
320 spin_unlock(&dcache_lock); 347 spin_unlock(&dcache_lock);
321 remove_dir(dentry); 348 remove_dir(dentry);
322} 349}
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
570 BUG_ON(!nodes_intersects(*pmask, node_online_map)); 597 BUG_ON(!nodes_intersects(*pmask, node_online_map));
571} 598}
572 599
573/* 600/**
574 * Refresh current tasks mems_allowed and mems_generation from current 601 * cpuset_update_task_memory_state - update task memory placement
575 * tasks cpuset. 602 *
603 * If the current task's cpuset's mems_allowed changed behind our
604 * backs, update current->mems_allowed, mems_generation and task NUMA
605 * mempolicy to the new value.
576 * 606 *
577 * Call without callback_sem or task_lock() held. May be called with 607 * Task mempolicy is updated by rebinding it relative to the
578 * or without manage_sem held. Will acquire task_lock() and might 608 * current->cpuset if a task has its memory placement changed.
579 * acquire callback_sem during call. 609 * Do not call this routine if in_interrupt().
580 * 610 *
581 * The task_lock() is required to dereference current->cpuset safely. 611 * Call without callback_sem or task_lock() held. May be called
582 * Without it, we could pick up the pointer value of current->cpuset 612 * with or without manage_sem held. Doesn't need task_lock to guard
583 * in one instruction, and then attach_task could give us a different 613 * against another task changing a non-NULL cpuset pointer to NULL,
584 * cpuset, and then the cpuset we had could be removed and freed, 614 * as that is only done by a task on itself, and if the current task
585 * and then on our next instruction, we could dereference a no longer 615 * is here, it is not simultaneously in the exit code NULL'ing its
586 * valid cpuset pointer to get its mems_generation field. 616 * cpuset pointer. This routine also might acquire callback_sem and
617 * current->mm->mmap_sem during call.
618 *
619 * Reading current->cpuset->mems_generation doesn't need task_lock
620 * to guard the current->cpuset dereference, because it is guarded
621 * from concurrent freeing of current->cpuset by attach_task(),
622 * using RCU.
623 *
624 * The rcu_dereference() is technically probably not needed,
625 * as I don't actually mind if I see a new cpuset pointer but
626 * an old value of mems_generation. However this really only
627 * matters on alpha systems using cpusets heavily. If I dropped
628 * that rcu_dereference(), it would save them a memory barrier.
629 * For all other arch's, rcu_dereference is a no-op anyway, and for
630 * alpha systems not using cpusets, another planned optimization,
631 * avoiding the rcu critical section for tasks in the root cpuset
632 * which is statically allocated, so can't vanish, will make this
633 * irrelevant. Better to use RCU as intended, than to engage in
634 * some cute trick to save a memory barrier that is impossible to
635 * test, for alpha systems using cpusets heavily, which might not
636 * even exist.
587 * 637 *
588 * This routine is needed to update the per-task mems_allowed data, 638 * This routine is needed to update the per-task mems_allowed data,
589 * within the tasks context, when it is trying to allocate memory 639 * within the tasks context, when it is trying to allocate memory
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
591 * task has been modifying its cpuset. 641 * task has been modifying its cpuset.
592 */ 642 */
593 643
594static void refresh_mems(void) 644void cpuset_update_task_memory_state()
595{ 645{
596 int my_cpusets_mem_gen; 646 int my_cpusets_mem_gen;
647 struct task_struct *tsk = current;
648 struct cpuset *cs;
597 649
598 task_lock(current); 650 if (tsk->cpuset == &top_cpuset) {
599 my_cpusets_mem_gen = current->cpuset->mems_generation; 651 /* Don't need rcu for top_cpuset. It's never freed. */
600 task_unlock(current); 652 my_cpusets_mem_gen = top_cpuset.mems_generation;
601 653 } else {
602 if (current->cpuset_mems_generation != my_cpusets_mem_gen) { 654 rcu_read_lock();
603 struct cpuset *cs; 655 cs = rcu_dereference(tsk->cpuset);
604 nodemask_t oldmem = current->mems_allowed; 656 my_cpusets_mem_gen = cs->mems_generation;
657 rcu_read_unlock();
658 }
605 659
660 if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
606 down(&callback_sem); 661 down(&callback_sem);
607 task_lock(current); 662 task_lock(tsk);
608 cs = current->cpuset; 663 cs = tsk->cpuset; /* Maybe changed when task not locked */
609 guarantee_online_mems(cs, &current->mems_allowed); 664 guarantee_online_mems(cs, &tsk->mems_allowed);
610 current->cpuset_mems_generation = cs->mems_generation; 665 tsk->cpuset_mems_generation = cs->mems_generation;
611 task_unlock(current); 666 task_unlock(tsk);
612 up(&callback_sem); 667 up(&callback_sem);
613 if (!nodes_equal(oldmem, current->mems_allowed)) 668 mpol_rebind_task(tsk, &tsk->mems_allowed);
614 numa_policy_rebind(&oldmem, &current->mems_allowed);
615 } 669 }
616} 670}
617 671
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf)
766} 820}
767 821
768/* 822/*
823 * Handle user request to change the 'mems' memory placement
824 * of a cpuset. Needs to validate the request, update the
825 * cpusets mems_allowed and mems_generation, and for each
826 * task in the cpuset, rebind any vma mempolicies and if
827 * the cpuset is marked 'memory_migrate', migrate the tasks
828 * pages to the new memory.
829 *
769 * Call with manage_sem held. May take callback_sem during call. 830 * Call with manage_sem held. May take callback_sem during call.
831 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
832 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
833 * their mempolicies to the cpusets new mems_allowed.
770 */ 834 */
771 835
772static int update_nodemask(struct cpuset *cs, char *buf) 836static int update_nodemask(struct cpuset *cs, char *buf)
773{ 837{
774 struct cpuset trialcs; 838 struct cpuset trialcs;
839 nodemask_t oldmem;
840 struct task_struct *g, *p;
841 struct mm_struct **mmarray;
842 int i, n, ntasks;
843 int migrate;
844 int fudge;
775 int retval; 845 int retval;
776 846
777 trialcs = *cs; 847 trialcs = *cs;
778 retval = nodelist_parse(buf, trialcs.mems_allowed); 848 retval = nodelist_parse(buf, trialcs.mems_allowed);
779 if (retval < 0) 849 if (retval < 0)
780 return retval; 850 goto done;
781 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); 851 nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
782 if (nodes_empty(trialcs.mems_allowed)) 852 oldmem = cs->mems_allowed;
783 return -ENOSPC; 853 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
854 retval = 0; /* Too easy - nothing to do */
855 goto done;
856 }
857 if (nodes_empty(trialcs.mems_allowed)) {
858 retval = -ENOSPC;
859 goto done;
860 }
784 retval = validate_change(cs, &trialcs); 861 retval = validate_change(cs, &trialcs);
785 if (retval == 0) { 862 if (retval < 0)
786 down(&callback_sem); 863 goto done;
787 cs->mems_allowed = trialcs.mems_allowed; 864
788 atomic_inc(&cpuset_mems_generation); 865 down(&callback_sem);
789 cs->mems_generation = atomic_read(&cpuset_mems_generation); 866 cs->mems_allowed = trialcs.mems_allowed;
790 up(&callback_sem); 867 atomic_inc(&cpuset_mems_generation);
868 cs->mems_generation = atomic_read(&cpuset_mems_generation);
869 up(&callback_sem);
870
871 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
872
873 fudge = 10; /* spare mmarray[] slots */
874 fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */
875 retval = -ENOMEM;
876
877 /*
878 * Allocate mmarray[] to hold mm reference for each task
879 * in cpuset cs. Can't kmalloc GFP_KERNEL while holding
880 * tasklist_lock. We could use GFP_ATOMIC, but with a
881 * few more lines of code, we can retry until we get a big
882 * enough mmarray[] w/o using GFP_ATOMIC.
883 */
884 while (1) {
885 ntasks = atomic_read(&cs->count); /* guess */
886 ntasks += fudge;
887 mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
888 if (!mmarray)
889 goto done;
890 write_lock_irq(&tasklist_lock); /* block fork */
891 if (atomic_read(&cs->count) <= ntasks)
892 break; /* got enough */
893 write_unlock_irq(&tasklist_lock); /* try again */
894 kfree(mmarray);
791 } 895 }
896
897 n = 0;
898
899 /* Load up mmarray[] with mm reference for each task in cpuset. */
900 do_each_thread(g, p) {
901 struct mm_struct *mm;
902
903 if (n >= ntasks) {
904 printk(KERN_WARNING
905 "Cpuset mempolicy rebind incomplete.\n");
906 continue;
907 }
908 if (p->cpuset != cs)
909 continue;
910 mm = get_task_mm(p);
911 if (!mm)
912 continue;
913 mmarray[n++] = mm;
914 } while_each_thread(g, p);
915 write_unlock_irq(&tasklist_lock);
916
917 /*
918 * Now that we've dropped the tasklist spinlock, we can
919 * rebind the vma mempolicies of each mm in mmarray[] to their
920 * new cpuset, and release that mm. The mpol_rebind_mm()
921 * call takes mmap_sem, which we couldn't take while holding
922 * tasklist_lock. Forks can happen again now - the mpol_copy()
923 * cpuset_being_rebound check will catch such forks, and rebind
924 * their vma mempolicies too. Because we still hold the global
925 * cpuset manage_sem, we know that no other rebind effort will
926 * be contending for the global variable cpuset_being_rebound.
927 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
928 * is idempotent. Also migrate pages in each mm to new nodes.
929 */
930 migrate = is_memory_migrate(cs);
931 for (i = 0; i < n; i++) {
932 struct mm_struct *mm = mmarray[i];
933
934 mpol_rebind_mm(mm, &cs->mems_allowed);
935 if (migrate) {
936 do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
937 MPOL_MF_MOVE_ALL);
938 }
939 mmput(mm);
940 }
941
942 /* We're done rebinding vma's to this cpusets new mems_allowed. */
943 kfree(mmarray);
944 set_cpuset_being_rebound(NULL);
945 retval = 0;
946done:
792 return retval; 947 return retval;
793} 948}
794 949
795/* 950/*
951 * Call with manage_sem held.
952 */
953
954static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
955{
956 if (simple_strtoul(buf, NULL, 10) != 0)
957 cpuset_memory_pressure_enabled = 1;
958 else
959 cpuset_memory_pressure_enabled = 0;
960 return 0;
961}
962
963/*
796 * update_flag - read a 0 or a 1 in a file and update associated flag 964 * update_flag - read a 0 or a 1 in a file and update associated flag
797 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 965 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
798 * CS_NOTIFY_ON_RELEASE) 966 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
799 * cs: the cpuset to update 967 * cs: the cpuset to update
800 * buf: the buffer where we read the 0 or 1 968 * buf: the buffer where we read the 0 or 1
801 * 969 *
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
834} 1002}
835 1003
836/* 1004/*
1005 * Frequency meter - How fast is some event occurring?
1006 *
1007 * These routines manage a digitally filtered, constant time based,
1008 * event frequency meter. There are four routines:
1009 * fmeter_init() - initialize a frequency meter.
1010 * fmeter_markevent() - called each time the event happens.
1011 * fmeter_getrate() - returns the recent rate of such events.
1012 * fmeter_update() - internal routine used to update fmeter.
1013 *
1014 * A common data structure is passed to each of these routines,
1015 * which is used to keep track of the state required to manage the
1016 * frequency meter and its digital filter.
1017 *
1018 * The filter works on the number of events marked per unit time.
1019 * The filter is single-pole low-pass recursive (IIR). The time unit
1020 * is 1 second. Arithmetic is done using 32-bit integers scaled to
1021 * simulate 3 decimal digits of precision (multiplied by 1000).
1022 *
1023 * With an FM_COEF of 933, and a time base of 1 second, the filter
1024 * has a half-life of 10 seconds, meaning that if the events quit
1025 * happening, then the rate returned from the fmeter_getrate()
1026 * will be cut in half each 10 seconds, until it converges to zero.
1027 *
1028 * It is not worth doing a real infinitely recursive filter. If more
1029 * than FM_MAXTICKS ticks have elapsed since the last filter event,
1030 * just compute FM_MAXTICKS ticks worth, by which point the level
1031 * will be stable.
1032 *
1033 * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
1034 * arithmetic overflow in the fmeter_update() routine.
1035 *
1036 * Given the simple 32 bit integer arithmetic used, this meter works
1037 * best for reporting rates between one per millisecond (msec) and
1038 * one per 32 (approx) seconds. At constant rates faster than one
1039 * per msec it maxes out at values just under 1,000,000. At constant
1040 * rates between one per msec, and one per second it will stabilize
1041 * to a value N*1000, where N is the rate of events per second.
1042 * At constant rates between one per second and one per 32 seconds,
1043 * it will be choppy, moving up on the seconds that have an event,
1044 * and then decaying until the next event. At rates slower than
1045 * about one in 32 seconds, it decays all the way back to zero between
1046 * each event.
1047 */
1048
1049#define FM_COEF 933 /* coefficient for half-life of 10 secs */
1050#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
1051#define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
1052#define FM_SCALE 1000 /* faux fixed point scale */
1053
1054/* Initialize a frequency meter */
1055static void fmeter_init(struct fmeter *fmp)
1056{
1057 fmp->cnt = 0;
1058 fmp->val = 0;
1059 fmp->time = 0;
1060 spin_lock_init(&fmp->lock);
1061}
1062
1063/* Internal meter update - process cnt events and update value */
1064static void fmeter_update(struct fmeter *fmp)
1065{
1066 time_t now = get_seconds();
1067 time_t ticks = now - fmp->time;
1068
1069 if (ticks == 0)
1070 return;
1071
1072 ticks = min(FM_MAXTICKS, ticks);
1073 while (ticks-- > 0)
1074 fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
1075 fmp->time = now;
1076
1077 fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
1078 fmp->cnt = 0;
1079}
1080
1081/* Process any previous ticks, then bump cnt by one (times scale). */
1082static void fmeter_markevent(struct fmeter *fmp)
1083{
1084 spin_lock(&fmp->lock);
1085 fmeter_update(fmp);
1086 fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
1087 spin_unlock(&fmp->lock);
1088}
1089
1090/* Process any previous ticks, then return current value. */
1091static int fmeter_getrate(struct fmeter *fmp)
1092{
1093 int val;
1094
1095 spin_lock(&fmp->lock);
1096 fmeter_update(fmp);
1097 val = fmp->val;
1098 spin_unlock(&fmp->lock);
1099 return val;
1100}
1101
1102/*
837 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly 1103 * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
838 * writing the path of the old cpuset in 'ppathbuf' if it needs to be 1104 * writing the path of the old cpuset in 'ppathbuf' if it needs to be
839 * notified on release. 1105 * notified on release.
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
848 struct task_struct *tsk; 1114 struct task_struct *tsk;
849 struct cpuset *oldcs; 1115 struct cpuset *oldcs;
850 cpumask_t cpus; 1116 cpumask_t cpus;
1117 nodemask_t from, to;
1118 struct mm_struct *mm;
851 1119
852 if (sscanf(pidbuf, "%d", &pid) != 1) 1120 if (sscanf(pidbuf, "%d", &pid) != 1)
853 return -EIO; 1121 return -EIO;
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
887 return -ESRCH; 1155 return -ESRCH;
888 } 1156 }
889 atomic_inc(&cs->count); 1157 atomic_inc(&cs->count);
890 tsk->cpuset = cs; 1158 rcu_assign_pointer(tsk->cpuset, cs);
891 task_unlock(tsk); 1159 task_unlock(tsk);
892 1160
893 guarantee_online_cpus(cs, &cpus); 1161 guarantee_online_cpus(cs, &cpus);
894 set_cpus_allowed(tsk, cpus); 1162 set_cpus_allowed(tsk, cpus);
895 1163
1164 from = oldcs->mems_allowed;
1165 to = cs->mems_allowed;
1166
896 up(&callback_sem); 1167 up(&callback_sem);
1168
1169 mm = get_task_mm(tsk);
1170 if (mm) {
1171 mpol_rebind_mm(mm, &to);
1172 mmput(mm);
1173 }
1174
1175 if (is_memory_migrate(cs))
1176 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
897 put_task_struct(tsk); 1177 put_task_struct(tsk);
1178 synchronize_rcu();
898 if (atomic_dec_and_test(&oldcs->count)) 1179 if (atomic_dec_and_test(&oldcs->count))
899 check_for_release(oldcs, ppathbuf); 1180 check_for_release(oldcs, ppathbuf);
900 return 0; 1181 return 0;
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
905typedef enum { 1186typedef enum {
906 FILE_ROOT, 1187 FILE_ROOT,
907 FILE_DIR, 1188 FILE_DIR,
1189 FILE_MEMORY_MIGRATE,
908 FILE_CPULIST, 1190 FILE_CPULIST,
909 FILE_MEMLIST, 1191 FILE_MEMLIST,
910 FILE_CPU_EXCLUSIVE, 1192 FILE_CPU_EXCLUSIVE,
911 FILE_MEM_EXCLUSIVE, 1193 FILE_MEM_EXCLUSIVE,
912 FILE_NOTIFY_ON_RELEASE, 1194 FILE_NOTIFY_ON_RELEASE,
1195 FILE_MEMORY_PRESSURE_ENABLED,
1196 FILE_MEMORY_PRESSURE,
913 FILE_TASKLIST, 1197 FILE_TASKLIST,
914} cpuset_filetype_t; 1198} cpuset_filetype_t;
915 1199
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
960 case FILE_NOTIFY_ON_RELEASE: 1244 case FILE_NOTIFY_ON_RELEASE:
961 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); 1245 retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
962 break; 1246 break;
1247 case FILE_MEMORY_MIGRATE:
1248 retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
1249 break;
1250 case FILE_MEMORY_PRESSURE_ENABLED:
1251 retval = update_memory_pressure_enabled(cs, buffer);
1252 break;
1253 case FILE_MEMORY_PRESSURE:
1254 retval = -EACCES;
1255 break;
963 case FILE_TASKLIST: 1256 case FILE_TASKLIST:
964 retval = attach_task(cs, buffer, &pathbuf); 1257 retval = attach_task(cs, buffer, &pathbuf);
965 break; 1258 break;
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1060 case FILE_NOTIFY_ON_RELEASE: 1353 case FILE_NOTIFY_ON_RELEASE:
1061 *s++ = notify_on_release(cs) ? '1' : '0'; 1354 *s++ = notify_on_release(cs) ? '1' : '0';
1062 break; 1355 break;
1356 case FILE_MEMORY_MIGRATE:
1357 *s++ = is_memory_migrate(cs) ? '1' : '0';
1358 break;
1359 case FILE_MEMORY_PRESSURE_ENABLED:
1360 *s++ = cpuset_memory_pressure_enabled ? '1' : '0';
1361 break;
1362 case FILE_MEMORY_PRESSURE:
1363 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1364 break;
1063 default: 1365 default:
1064 retval = -EINVAL; 1366 retval = -EINVAL;
1065 goto out; 1367 goto out;
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
1178 1480
1179/* 1481/*
1180 * cpuset_create_dir - create a directory for an object. 1482 * cpuset_create_dir - create a directory for an object.
1181 * cs: the cpuset we create the directory for. 1483 * cs: the cpuset we create the directory for.
1182 * It must have a valid ->parent field 1484 * It must have a valid ->parent field
1183 * And we are going to fill its ->dentry field. 1485 * And we are going to fill its ->dentry field.
1184 * name: The name to give to the cpuset directory. Will be copied. 1486 * name: The name to give to the cpuset directory. Will be copied.
@@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1211 struct dentry *dentry; 1513 struct dentry *dentry;
1212 int error; 1514 int error;
1213 1515
1214 down(&dir->d_inode->i_sem); 1516 mutex_lock(&dir->d_inode->i_mutex);
1215 dentry = cpuset_get_dentry(dir, cft->name); 1517 dentry = cpuset_get_dentry(dir, cft->name);
1216 if (!IS_ERR(dentry)) { 1518 if (!IS_ERR(dentry)) {
1217 error = cpuset_create_file(dentry, 0644 | S_IFREG); 1519 error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
1220 dput(dentry); 1522 dput(dentry);
1221 } else 1523 } else
1222 error = PTR_ERR(dentry); 1524 error = PTR_ERR(dentry);
1223 up(&dir->d_inode->i_sem); 1525 mutex_unlock(&dir->d_inode->i_mutex);
1224 return error; 1526 return error;
1225} 1527}
1226 1528
@@ -1252,7 +1554,7 @@ struct ctr_struct {
1252 * when reading out p->cpuset, as we don't really care if it changes 1554 * when reading out p->cpuset, as we don't really care if it changes
1253 * on the next cycle, and we are not going to try to dereference it. 1555 * on the next cycle, and we are not going to try to dereference it.
1254 */ 1556 */
1255static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) 1557static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
1256{ 1558{
1257 int n = 0; 1559 int n = 0;
1258 struct task_struct *g, *p; 1560 struct task_struct *g, *p;
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = {
1408 .private = FILE_NOTIFY_ON_RELEASE, 1710 .private = FILE_NOTIFY_ON_RELEASE,
1409}; 1711};
1410 1712
1713static struct cftype cft_memory_migrate = {
1714 .name = "memory_migrate",
1715 .private = FILE_MEMORY_MIGRATE,
1716};
1717
1718static struct cftype cft_memory_pressure_enabled = {
1719 .name = "memory_pressure_enabled",
1720 .private = FILE_MEMORY_PRESSURE_ENABLED,
1721};
1722
1723static struct cftype cft_memory_pressure = {
1724 .name = "memory_pressure",
1725 .private = FILE_MEMORY_PRESSURE,
1726};
1727
1411static int cpuset_populate_dir(struct dentry *cs_dentry) 1728static int cpuset_populate_dir(struct dentry *cs_dentry)
1412{ 1729{
1413 int err; 1730 int err;
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1422 return err; 1739 return err;
1423 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0) 1740 if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
1424 return err; 1741 return err;
1742 if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
1743 return err;
1744 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1745 return err;
1425 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1746 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1426 return err; 1747 return err;
1427 return 0; 1748 return 0;
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1446 return -ENOMEM; 1767 return -ENOMEM;
1447 1768
1448 down(&manage_sem); 1769 down(&manage_sem);
1449 refresh_mems(); 1770 cpuset_update_task_memory_state();
1450 cs->flags = 0; 1771 cs->flags = 0;
1451 if (notify_on_release(parent)) 1772 if (notify_on_release(parent))
1452 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1773 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1457 INIT_LIST_HEAD(&cs->children); 1778 INIT_LIST_HEAD(&cs->children);
1458 atomic_inc(&cpuset_mems_generation); 1779 atomic_inc(&cpuset_mems_generation);
1459 cs->mems_generation = atomic_read(&cpuset_mems_generation); 1780 cs->mems_generation = atomic_read(&cpuset_mems_generation);
1781 fmeter_init(&cs->fmeter);
1460 1782
1461 cs->parent = parent; 1783 cs->parent = parent;
1462 1784
1463 down(&callback_sem); 1785 down(&callback_sem);
1464 list_add(&cs->sibling, &cs->parent->children); 1786 list_add(&cs->sibling, &cs->parent->children);
1787 number_of_cpusets++;
1465 up(&callback_sem); 1788 up(&callback_sem);
1466 1789
1467 err = cpuset_create_dir(cs, name, mode); 1790 err = cpuset_create_dir(cs, name, mode);
@@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1470 1793
1471 /* 1794 /*
1472 * Release manage_sem before cpuset_populate_dir() because it 1795 * Release manage_sem before cpuset_populate_dir() because it
1473 * will down() this new directory's i_sem and if we race with 1796 * will down() this new directory's i_mutex and if we race with
1474 * another mkdir, we might deadlock. 1797 * another mkdir, we might deadlock.
1475 */ 1798 */
1476 up(&manage_sem); 1799 up(&manage_sem);
@@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1489{ 1812{
1490 struct cpuset *c_parent = dentry->d_parent->d_fsdata; 1813 struct cpuset *c_parent = dentry->d_parent->d_fsdata;
1491 1814
1492 /* the vfs holds inode->i_sem already */ 1815 /* the vfs holds inode->i_mutex already */
1493 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1816 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1494} 1817}
1495 1818
@@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1500 struct cpuset *parent; 1823 struct cpuset *parent;
1501 char *pathbuf = NULL; 1824 char *pathbuf = NULL;
1502 1825
1503 /* the vfs holds both inode->i_sem already */ 1826 /* the vfs holds both inode->i_mutex already */
1504 1827
1505 down(&manage_sem); 1828 down(&manage_sem);
1506 refresh_mems(); 1829 cpuset_update_task_memory_state();
1507 if (atomic_read(&cs->count) > 0) { 1830 if (atomic_read(&cs->count) > 0) {
1508 up(&manage_sem); 1831 up(&manage_sem);
1509 return -EBUSY; 1832 return -EBUSY;
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1524 spin_unlock(&d->d_lock); 1847 spin_unlock(&d->d_lock);
1525 cpuset_d_remove_dir(d); 1848 cpuset_d_remove_dir(d);
1526 dput(d); 1849 dput(d);
1850 number_of_cpusets--;
1527 up(&callback_sem); 1851 up(&callback_sem);
1528 if (list_empty(&parent->children)) 1852 if (list_empty(&parent->children))
1529 check_for_release(parent, &pathbuf); 1853 check_for_release(parent, &pathbuf);
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1532 return 0; 1856 return 0;
1533} 1857}
1534 1858
1859/*
1860 * cpuset_init_early - just enough so that the calls to
1861 * cpuset_update_task_memory_state() in early init code
1862 * are harmless.
1863 */
1864
1865int __init cpuset_init_early(void)
1866{
1867 struct task_struct *tsk = current;
1868
1869 tsk->cpuset = &top_cpuset;
1870 tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
1871 return 0;
1872}
1873
1535/** 1874/**
1536 * cpuset_init - initialize cpusets at system boot 1875 * cpuset_init - initialize cpusets at system boot
1537 * 1876 *
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void)
1546 top_cpuset.cpus_allowed = CPU_MASK_ALL; 1885 top_cpuset.cpus_allowed = CPU_MASK_ALL;
1547 top_cpuset.mems_allowed = NODE_MASK_ALL; 1886 top_cpuset.mems_allowed = NODE_MASK_ALL;
1548 1887
1888 fmeter_init(&top_cpuset.fmeter);
1549 atomic_inc(&cpuset_mems_generation); 1889 atomic_inc(&cpuset_mems_generation);
1550 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); 1890 top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
1551 1891
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void)
1566 root->d_inode->i_nlink++; 1906 root->d_inode->i_nlink++;
1567 top_cpuset.dentry = root; 1907 top_cpuset.dentry = root;
1568 root->d_inode->i_op = &cpuset_dir_inode_operations; 1908 root->d_inode->i_op = &cpuset_dir_inode_operations;
1909 number_of_cpusets = 1;
1569 err = cpuset_populate_dir(root); 1910 err = cpuset_populate_dir(root);
1911 /* memory_pressure_enabled is in root cpuset only */
1912 if (err == 0)
1913 err = cpuset_add_file(root, &cft_memory_pressure_enabled);
1570out: 1914out:
1571 return err; 1915 return err;
1572} 1916}
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child)
1632 * 1976 *
1633 * We don't need to task_lock() this reference to tsk->cpuset, 1977 * We don't need to task_lock() this reference to tsk->cpuset,
1634 * because tsk is already marked PF_EXITING, so attach_task() won't 1978 * because tsk is already marked PF_EXITING, so attach_task() won't
1635 * mess with it. 1979 * mess with it, or task is a failed fork, never visible to attach_task.
1636 **/ 1980 **/
1637 1981
1638void cpuset_exit(struct task_struct *tsk) 1982void cpuset_exit(struct task_struct *tsk)
1639{ 1983{
1640 struct cpuset *cs; 1984 struct cpuset *cs;
1641 1985
1642 BUG_ON(!(tsk->flags & PF_EXITING));
1643
1644 cs = tsk->cpuset; 1986 cs = tsk->cpuset;
1645 tsk->cpuset = NULL; 1987 tsk->cpuset = NULL;
1646 1988
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk)
1667 * tasks cpuset. 2009 * tasks cpuset.
1668 **/ 2010 **/
1669 2011
1670cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk) 2012cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
1671{ 2013{
1672 cpumask_t mask; 2014 cpumask_t mask;
1673 2015
1674 down(&callback_sem); 2016 down(&callback_sem);
1675 task_lock((struct task_struct *)tsk); 2017 task_lock(tsk);
1676 guarantee_online_cpus(tsk->cpuset, &mask); 2018 guarantee_online_cpus(tsk->cpuset, &mask);
1677 task_unlock((struct task_struct *)tsk); 2019 task_unlock(tsk);
1678 up(&callback_sem); 2020 up(&callback_sem);
1679 2021
1680 return mask; 2022 return mask;
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void)
1686} 2028}
1687 2029
1688/** 2030/**
1689 * cpuset_update_current_mems_allowed - update mems parameters to new values 2031 * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
1690 * 2032 * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
1691 * If the current tasks cpusets mems_allowed changed behind our backs,
1692 * update current->mems_allowed and mems_generation to the new value.
1693 * Do not call this routine if in_interrupt().
1694 * 2033 *
1695 * Call without callback_sem or task_lock() held. May be called 2034 * Description: Returns the nodemask_t mems_allowed of the cpuset
1696 * with or without manage_sem held. Unless exiting, it will acquire 2035 * attached to the specified @tsk. Guaranteed to return some non-empty
1697 * task_lock(). Also might acquire callback_sem during call to 2036 * subset of node_online_map, even if this means going outside the
1698 * refresh_mems(). 2037 * tasks cpuset.
1699 */ 2038 **/
1700 2039
1701void cpuset_update_current_mems_allowed(void) 2040nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
1702{ 2041{
1703 struct cpuset *cs; 2042 nodemask_t mask;
1704 int need_to_refresh = 0;
1705 2043
1706 task_lock(current); 2044 down(&callback_sem);
1707 cs = current->cpuset; 2045 task_lock(tsk);
1708 if (!cs) 2046 guarantee_online_mems(tsk->cpuset, &mask);
1709 goto done; 2047 task_unlock(tsk);
1710 if (current->cpuset_mems_generation != cs->mems_generation) 2048 up(&callback_sem);
1711 need_to_refresh = 1;
1712done:
1713 task_unlock(current);
1714 if (need_to_refresh)
1715 refresh_mems();
1716}
1717 2049
1718/** 2050 return mask;
1719 * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
1720 * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
1721 */
1722void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
1723{
1724 bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
1725 MAX_NUMNODES);
1726} 2051}
1727 2052
1728/** 2053/**
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
1795 * GFP_USER - only nodes in current tasks mems allowed ok. 2120 * GFP_USER - only nodes in current tasks mems allowed ok.
1796 **/ 2121 **/
1797 2122
1798int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) 2123int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1799{ 2124{
1800 int node; /* node that zone z is on */ 2125 int node; /* node that zone z is on */
1801 const struct cpuset *cs; /* current cpuset ancestors */ 2126 const struct cpuset *cs; /* current cpuset ancestors */
@@ -1825,6 +2150,33 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1825} 2150}
1826 2151
1827/** 2152/**
2153 * cpuset_lock - lock out any changes to cpuset structures
2154 *
2155 * The out of memory (oom) code needs to lock down cpusets
2156 * from being changed while it scans the tasklist looking for a
2157 * task in an overlapping cpuset. Expose callback_sem via this
2158 * cpuset_lock() routine, so the oom code can lock it, before
2159 * locking the task list. The tasklist_lock is a spinlock, so
2160 * must be taken inside callback_sem.
2161 */
2162
2163void cpuset_lock(void)
2164{
2165 down(&callback_sem);
2166}
2167
2168/**
2169 * cpuset_unlock - release lock on cpuset changes
2170 *
2171 * Undo the lock taken in a previous cpuset_lock() call.
2172 */
2173
2174void cpuset_unlock(void)
2175{
2176 up(&callback_sem);
2177}
2178
2179/**
1828 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2180 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
1829 * @p: pointer to task_struct of some other task. 2181 * @p: pointer to task_struct of some other task.
1830 * 2182 *
@@ -1833,7 +2185,7 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
1833 * determine if task @p's memory usage might impact the memory 2185 * determine if task @p's memory usage might impact the memory
1834 * available to the current task. 2186 * available to the current task.
1835 * 2187 *
1836 * Acquires callback_sem - not suitable for calling from a fast path. 2188 * Call while holding callback_sem.
1837 **/ 2189 **/
1838 2190
1839int cpuset_excl_nodes_overlap(const struct task_struct *p) 2191int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1841,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1841 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 2193 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
1842 int overlap = 0; /* do cpusets overlap? */ 2194 int overlap = 0; /* do cpusets overlap? */
1843 2195
1844 down(&callback_sem);
1845
1846 task_lock(current); 2196 task_lock(current);
1847 if (current->flags & PF_EXITING) { 2197 if (current->flags & PF_EXITING) {
1848 task_unlock(current); 2198 task_unlock(current);
@@ -1861,12 +2211,46 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
1861 2211
1862 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); 2212 overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
1863done: 2213done:
1864 up(&callback_sem);
1865
1866 return overlap; 2214 return overlap;
1867} 2215}
1868 2216
1869/* 2217/*
2218 * Collection of memory_pressure is suppressed unless
2219 * this flag is enabled by writing "1" to the special
2220 * cpuset file 'memory_pressure_enabled' in the root cpuset.
2221 */
2222
2223int cpuset_memory_pressure_enabled __read_mostly;
2224
2225/**
2226 * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
2227 *
2228 * Keep a running average of the rate of synchronous (direct)
2229 * page reclaim efforts initiated by tasks in each cpuset.
2230 *
2231 * This represents the rate at which some task in the cpuset
2232 * ran low on memory on all nodes it was allowed to use, and
2233 * had to enter the kernels page reclaim code in an effort to
2234 * create more free memory by tossing clean pages or swapping
2235 * or writing dirty pages.
2236 *
2237 * Display to user space in the per-cpuset read-only file
2238 * "memory_pressure". Value displayed is an integer
2239 * representing the recent rate of entry into the synchronous
2240 * (direct) page reclaim by any task attached to the cpuset.
2241 **/
2242
2243void __cpuset_memory_pressure_bump(void)
2244{
2245 struct cpuset *cs;
2246
2247 task_lock(current);
2248 cs = current->cpuset;
2249 fmeter_markevent(&cs->fmeter);
2250 task_unlock(current);
2251}
2252
2253/*
1870 * proc_cpuset_show() 2254 * proc_cpuset_show()
1871 * - Print tasks cpuset path into seq_file. 2255 * - Print tasks cpuset path into seq_file.
1872 * - Used for /proc/<pid>/cpuset. 2256 * - Used for /proc/<pid>/cpuset.