Diffstat (limited to 'kernel/cpuset.c')

 -rw-r--r--   kernel/cpuset.c | 582
 1 files changed, 483 insertions, 99 deletions

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7430640f9816..fe2f71f92ae0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
 #include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -54,7 +55,23 @@
 #include <asm/atomic.h>
 #include <asm/semaphore.h>
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
+
+/*
+ * Tracks how many cpusets are currently defined in system.
+ * When there is only one cpuset (the root cpuset) we can
+ * short circuit some hooks.
+ */
+int number_of_cpusets __read_mostly;
+
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
 
 struct cpuset {
 	unsigned long flags;		/* "unsigned long" so bitops work */
@@ -80,13 +97,16 @@ struct cpuset {
 	 * Copy of global cpuset_mems_generation as of the most
 	 * recent time this cpuset changed its mems_allowed.
 	 */
 	int mems_generation;
+
+	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* bits in struct cpuset flags field */
 typedef enum {
 	CS_CPU_EXCLUSIVE,
 	CS_MEM_EXCLUSIVE,
+	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
 	CS_NOTIFY_ON_RELEASE
 } cpuset_flagbits_t;
@@ -112,6 +132,11 @@ static inline int notify_on_release(const struct cpuset *cs)
 	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
+static inline int is_memory_migrate(const struct cpuset *cs)
+{
+	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
 /*
  * Increment this atomic integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
@@ -137,13 +162,10 @@ static struct cpuset top_cpuset = {
 	.count = ATOMIC_INIT(0),
 	.sibling = LIST_HEAD_INIT(top_cpuset.sibling),
 	.children = LIST_HEAD_INIT(top_cpuset.children),
-	.parent = NULL,
-	.dentry = NULL,
-	.mems_generation = 0,
 };
 
 static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
 
 /*
  * We have two global cpuset semaphores below.  They can nest.
@@ -227,6 +249,11 @@ static struct super_block *cpuset_sb = NULL;
  * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
  * (task->alloc_lock) already in the task_struct routinely used for
  * such matters.
+ *
+ * P.S.  One more locking exception.  RCU is used to guard the
+ * update of a tasks cpuset pointer by attach_task() and the
+ * access of task->cpuset->mems_generation via that pointer in
+ * the routine cpuset_update_task_memory_state().
  */
 
 static DECLARE_MUTEX(manage_sem);
@@ -304,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 	spin_lock(&dcache_lock);
 	node = dentry->d_subdirs.next;
 	while (node != &dentry->d_subdirs) {
-		struct dentry *d = list_entry(node, struct dentry, d_child);
+		struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
 		list_del_init(node);
 		if (d->d_inode) {
 			d = dget_locked(d);
@@ -316,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
 		}
 		node = dentry->d_subdirs.next;
 	}
-	list_del_init(&dentry->d_child);
+	list_del_init(&dentry->d_u.d_child);
 	spin_unlock(&dcache_lock);
 	remove_dir(dentry);
 }
@@ -570,20 +597,43 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
 	BUG_ON(!nodes_intersects(*pmask, node_online_map));
 }
 
-/*
- * Refresh current tasks mems_allowed and mems_generation from current
- * tasks cpuset.
+/**
+ * cpuset_update_task_memory_state - update task memory placement
+ *
+ * If the current tasks cpusets mems_allowed changed behind our
+ * backs, update current->mems_allowed, mems_generation and task NUMA
+ * mempolicy to the new value.
  *
- * Call without callback_sem or task_lock() held.  May be called with
- * or without manage_sem held.  Will acquire task_lock() and might
- * acquire callback_sem during call.
+ * Task mempolicy is updated by rebinding it relative to the
+ * current->cpuset if a task has its memory placement changed.
+ * Do not call this routine if in_interrupt().
  *
- * The task_lock() is required to dereference current->cpuset safely.
- * Without it, we could pick up the pointer value of current->cpuset
- * in one instruction, and then attach_task could give us a different
- * cpuset, and then the cpuset we had could be removed and freed,
- * and then on our next instruction, we could dereference a no longer
- * valid cpuset pointer to get its mems_generation field.
+ * Call without callback_sem or task_lock() held.  May be called
+ * with or without manage_sem held.  Doesn't need task_lock to guard
+ * against another task changing a non-NULL cpuset pointer to NULL,
+ * as that is only done by a task on itself, and if the current task
+ * is here, it is not simultaneously in the exit code NULL'ing its
+ * cpuset pointer.  This routine also might acquire callback_sem and
+ * current->mm->mmap_sem during call.
+ *
+ * Reading current->cpuset->mems_generation doesn't need task_lock
+ * to guard the current->cpuset derefence, because it is guarded
+ * from concurrent freeing of current->cpuset by attach_task(),
+ * using RCU.
+ *
+ * The rcu_dereference() is technically probably not needed,
+ * as I don't actually mind if I see a new cpuset pointer but
+ * an old value of mems_generation.  However this really only
+ * matters on alpha systems using cpusets heavily.  If I dropped
+ * that rcu_dereference(), it would save them a memory barrier.
+ * For all other arch's, rcu_dereference is a no-op anyway, and for
+ * alpha systems not using cpusets, another planned optimization,
+ * avoiding the rcu critical section for tasks in the root cpuset
+ * which is statically allocated, so can't vanish, will make this
+ * irrelevant.  Better to use RCU as intended, than to engage in
+ * some cute trick to save a memory barrier that is impossible to
+ * test, for alpha systems using cpusets heavily, which might not
+ * even exist.
  *
  * This routine is needed to update the per-task mems_allowed data,
  * within the tasks context, when it is trying to allocate memory
@@ -591,27 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
  * task has been modifying its cpuset.
  */
 
-static void refresh_mems(void)
+void cpuset_update_task_memory_state()
 {
 	int my_cpusets_mem_gen;
+	struct task_struct *tsk = current;
+	struct cpuset *cs;
 
-	task_lock(current);
-	my_cpusets_mem_gen = current->cpuset->mems_generation;
-	task_unlock(current);
-
-	if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
-		struct cpuset *cs;
-		nodemask_t oldmem = current->mems_allowed;
+	if (tsk->cpuset == &top_cpuset) {
+		/* Don't need rcu for top_cpuset.  It's never freed. */
+		my_cpusets_mem_gen = top_cpuset.mems_generation;
+	} else {
+		rcu_read_lock();
+		cs = rcu_dereference(tsk->cpuset);
+		my_cpusets_mem_gen = cs->mems_generation;
+		rcu_read_unlock();
+	}
 
+	if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
 		down(&callback_sem);
-		task_lock(current);
-		cs = current->cpuset;
-		guarantee_online_mems(cs, &current->mems_allowed);
-		current->cpuset_mems_generation = cs->mems_generation;
-		task_unlock(current);
+		task_lock(tsk);
+		cs = tsk->cpuset;	/* Maybe changed when task not locked */
+		guarantee_online_mems(cs, &tsk->mems_allowed);
+		tsk->cpuset_mems_generation = cs->mems_generation;
+		task_unlock(tsk);
 		up(&callback_sem);
-		if (!nodes_equal(oldmem, current->mems_allowed))
-			numa_policy_rebind(&oldmem, &current->mems_allowed);
+		mpol_rebind_task(tsk, &tsk->mems_allowed);
 	}
 }
 
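The hunk above turns refresh_mems() into cpuset_update_task_memory_state(): each task caches the cpuset generation it last acted on (tsk->cpuset_mems_generation) and only takes callback_sem and task_lock() when the cpuset's mems_generation has moved on. A minimal userspace sketch of that generation-check idea (plain C, no RCU or kernel locking; every name here is invented for illustration):

#include <stdio.h>

/* Toy stand-ins for the kernel structures (illustration only). */
struct toy_cpuset { int mems_generation; };
struct toy_task { struct toy_cpuset *cpuset; int cached_generation; };

/* Cheap check on every allocation; expensive refresh only on change. */
static void toy_update_memory_state(struct toy_task *tsk)
{
	int gen = tsk->cpuset->mems_generation;	/* cheap read */

	if (gen != tsk->cached_generation) {
		/* kernel: down(&callback_sem); task_lock(); copy mems_allowed */
		printf("refreshing placement for generation %d\n", gen);
		tsk->cached_generation = gen;
	}
}

int main(void)
{
	struct toy_cpuset cs = { .mems_generation = 1 };
	struct toy_task tsk = { .cpuset = &cs, .cached_generation = 0 };

	toy_update_memory_state(&tsk);	/* refreshes: 0 != 1 */
	toy_update_memory_state(&tsk);	/* no-op: generations match */
	cs.mems_generation++;		/* someone changed mems_allowed */
	toy_update_memory_state(&tsk);	/* refreshes again */
	return 0;
}

The cheap comparison runs on every allocation path; the locked refresh only runs after someone has actually changed the cpuset's memory placement.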
@@ -766,36 +820,150 @@ static int update_cpumask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Handle user request to change the 'mems' memory placement
+ * of a cpuset.  Needs to validate the request, update the
+ * cpusets mems_allowed and mems_generation, and for each
+ * task in the cpuset, rebind any vma mempolicies and if
+ * the cpuset is marked 'memory_migrate', migrate the tasks
+ * pages to the new memory.
+ *
  * Call with manage_sem held.  May take callback_sem during call.
+ * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
+ * lock each such tasks mm->mmap_sem, scan its vma's and rebind
+ * their mempolicies to the cpusets new mems_allowed.
  */
 
 static int update_nodemask(struct cpuset *cs, char *buf)
 {
 	struct cpuset trialcs;
+	nodemask_t oldmem;
+	struct task_struct *g, *p;
+	struct mm_struct **mmarray;
+	int i, n, ntasks;
+	int migrate;
+	int fudge;
 	int retval;
 
 	trialcs = *cs;
 	retval = nodelist_parse(buf, trialcs.mems_allowed);
 	if (retval < 0)
-		return retval;
+		goto done;
 	nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map);
-	if (nodes_empty(trialcs.mems_allowed))
-		return -ENOSPC;
+	oldmem = cs->mems_allowed;
+	if (nodes_equal(oldmem, trialcs.mems_allowed)) {
+		retval = 0;		/* Too easy - nothing to do */
+		goto done;
+	}
+	if (nodes_empty(trialcs.mems_allowed)) {
+		retval = -ENOSPC;
+		goto done;
+	}
 	retval = validate_change(cs, &trialcs);
-	if (retval == 0) {
-		down(&callback_sem);
-		cs->mems_allowed = trialcs.mems_allowed;
-		atomic_inc(&cpuset_mems_generation);
-		cs->mems_generation = atomic_read(&cpuset_mems_generation);
-		up(&callback_sem);
+	if (retval < 0)
+		goto done;
+
+	down(&callback_sem);
+	cs->mems_allowed = trialcs.mems_allowed;
+	atomic_inc(&cpuset_mems_generation);
+	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	up(&callback_sem);
+
+	set_cpuset_being_rebound(cs);	/* causes mpol_copy() rebind */
+
+	fudge = 10;				/* spare mmarray[] slots */
+	fudge += cpus_weight(cs->cpus_allowed);	/* imagine one fork-bomb/cpu */
+	retval = -ENOMEM;
+
+	/*
+	 * Allocate mmarray[] to hold mm reference for each task
+	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
+	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
+	 * few more lines of code, we can retry until we get a big
+	 * enough mmarray[] w/o using GFP_ATOMIC.
+	 */
+	while (1) {
+		ntasks = atomic_read(&cs->count);	/* guess */
+		ntasks += fudge;
+		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
+		if (!mmarray)
+			goto done;
+		write_lock_irq(&tasklist_lock);		/* block fork */
+		if (atomic_read(&cs->count) <= ntasks)
+			break;				/* got enough */
+		write_unlock_irq(&tasklist_lock);	/* try again */
+		kfree(mmarray);
 	}
+
+	n = 0;
+
+	/* Load up mmarray[] with mm reference for each task in cpuset. */
+	do_each_thread(g, p) {
+		struct mm_struct *mm;
+
+		if (n >= ntasks) {
+			printk(KERN_WARNING
+				"Cpuset mempolicy rebind incomplete.\n");
+			continue;
+		}
+		if (p->cpuset != cs)
+			continue;
+		mm = get_task_mm(p);
+		if (!mm)
+			continue;
+		mmarray[n++] = mm;
+	} while_each_thread(g, p);
+	write_unlock_irq(&tasklist_lock);
+
+	/*
+	 * Now that we've dropped the tasklist spinlock, we can
+	 * rebind the vma mempolicies of each mm in mmarray[] to their
+	 * new cpuset, and release that mm.  The mpol_rebind_mm()
+	 * call takes mmap_sem, which we couldn't take while holding
+	 * tasklist_lock.  Forks can happen again now - the mpol_copy()
+	 * cpuset_being_rebound check will catch such forks, and rebind
+	 * their vma mempolicies too.  Because we still hold the global
+	 * cpuset manage_sem, we know that no other rebind effort will
+	 * be contending for the global variable cpuset_being_rebound.
+	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
+	 * is idempotent.  Also migrate pages in each mm to new nodes.
+	 */
+	migrate = is_memory_migrate(cs);
+	for (i = 0; i < n; i++) {
+		struct mm_struct *mm = mmarray[i];
+
+		mpol_rebind_mm(mm, &cs->mems_allowed);
+		if (migrate) {
+			do_migrate_pages(mm, &oldmem, &cs->mems_allowed,
+							MPOL_MF_MOVE_ALL);
+		}
+		mmput(mm);
+	}
+
+	/* We're done rebinding vma's to this cpusets new mems_allowed. */
+	kfree(mmarray);
+	set_cpuset_being_rebound(NULL);
+	retval = 0;
+done:
 	return retval;
 }
 
 /*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		cpuset_memory_pressure_enabled = 1;
+	else
+		cpuset_memory_pressure_enabled = 0;
+	return 0;
+}
+
+/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
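The new update_nodemask() needs an mmarray[] big enough for every task in the cpuset, but it cannot kmalloc(GFP_KERNEL) while write-holding tasklist_lock, so it guesses a size, allocates outside the lock, re-checks the count under the lock, and retries if the guess was too small. The same idiom, reduced to a hedged userspace sketch with a pthread mutex standing in for tasklist_lock (all names invented; build with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Stand-ins: 'count' can grow while we are not holding 'lock'. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int count = 12;			/* e.g. tasks in the cpuset */

/* Guess a size, allocate without the lock, verify under the lock, retry. */
static int *snapshot_under_lock(int *n_out)
{
	int fudge = 10;
	int *arr;
	int n;

	for (;;) {
		n = count + fudge;		/* guess; racy read is fine */
		arr = malloc(n * sizeof(*arr));	/* no lock held here */
		if (!arr)
			return NULL;
		pthread_mutex_lock(&lock);	/* now 'count' is frozen */
		if (count <= n)
			break;			/* guess was big enough */
		pthread_mutex_unlock(&lock);	/* too small: drop, retry */
		free(arr);
	}
	/* ... copy the protected data into arr[0..count-1] here ... */
	*n_out = count;
	pthread_mutex_unlock(&lock);
	return arr;
}

int main(void)
{
	int n;
	int *arr = snapshot_under_lock(&n);

	if (arr) {
		printf("room for %d entries reserved safely\n", n);
		free(arr);
	}
	return 0;
}

The few extra lines of retry logic are what let the patch avoid GFP_ATOMIC allocations, exactly as the comment in the hunk explains.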
@@ -834,6 +1002,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 }
 
 /*
+ * Frequency meter - How fast is some event occuring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *    fmeter_init() - initialize a frequency meter.
+ *    fmeter_markevent() - called each time the event happens.
+ *    fmeter_getrate() - returns the recent rate of such events.
+ *    fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter.  If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds.  At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000.  At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event.  At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
+#define FM_SCALE 1000		/* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+	fmp->cnt = 0;
+	fmp->val = 0;
+	fmp->time = 0;
+	spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t ticks = now - fmp->time;
+
+	if (ticks == 0)
+		return;
+
+	ticks = min(FM_MAXTICKS, ticks);
+	while (ticks-- > 0)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+	spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int val;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	val = fmp->val;
+	spin_unlock(&fmp->lock);
+	return val;
+}
+
+/*
  * Attack task specified by pid in 'pidbuf' to cpuset 'cs', possibly
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
  * notified on release.
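The filter arithmetic in this hunk is easy to check outside the kernel. A throwaway userspace program, using only FM_COEF and FM_SCALE as quoted from the patch, reproduces the two properties the comment claims: a roughly ten-second half-life once events stop, and a steady-state reading of about N*1000 for N events per second:

#include <stdio.h>

#define FM_COEF  933	/* same constants as the patch above */
#define FM_SCALE 1000

int main(void)
{
	int val, cnt, i;

	/* Decay only: start at 1000000 and apply ten idle 1-second ticks. */
	val = 1000000;
	for (i = 0; i < 10; i++)
		val = (FM_COEF * val) / FM_SCALE;
	printf("after 10 idle seconds: %d (about half of 1000000)\n", val);

	/* Steady state: one event (cnt = FM_SCALE) marked every second. */
	val = 0;
	for (i = 0; i < 100; i++) {
		cnt = FM_SCALE;			/* one event this second */
		val = (FM_COEF * val) / FM_SCALE;
		val += ((FM_SCALE - FM_COEF) * cnt) / FM_SCALE;
	}
	printf("steady reading for 1 event/sec: %d (about 1000)\n", val);
	return 0;
}

0.933 raised to the tenth power is almost exactly one half, which is where the "half-life of 10 seconds" in the comment comes from.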
@@ -848,6 +1114,8 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 	struct task_struct *tsk;
 	struct cpuset *oldcs;
 	cpumask_t cpus;
+	nodemask_t from, to;
+	struct mm_struct *mm;
 
 	if (sscanf(pidbuf, "%d", &pid) != 1)
 		return -EIO;
@@ -887,14 +1155,27 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 		return -ESRCH;
 	}
 	atomic_inc(&cs->count);
-	tsk->cpuset = cs;
+	rcu_assign_pointer(tsk->cpuset, cs);
 	task_unlock(tsk);
 
 	guarantee_online_cpus(cs, &cpus);
 	set_cpus_allowed(tsk, cpus);
 
+	from = oldcs->mems_allowed;
+	to = cs->mems_allowed;
+
 	up(&callback_sem);
+
+	mm = get_task_mm(tsk);
+	if (mm) {
+		mpol_rebind_mm(mm, &to);
+		mmput(mm);
+	}
+
+	if (is_memory_migrate(cs))
+		do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
 	put_task_struct(tsk);
+	synchronize_rcu();
 	if (atomic_dec_and_test(&oldcs->count))
 		check_for_release(oldcs, ppathbuf);
 	return 0;
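This attach_task() hunk is what makes the new memory_migrate flag useful: when a task is written into a cpuset's 'tasks' file and that cpuset has memory_migrate set, the task's pages are migrated from the old mems_allowed to the new one. A hedged sketch of the corresponding user-space side - the /dev/cpuset mount point and the 'work' cpuset are assumptions, only the 'memory_migrate' and 'tasks' file names come from the patch:

#include <stdio.h>

/* Write a small string to a cpuset control file (illustration only). */
static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fputs(val, f);
	return fclose(f);
}

int main(void)
{
	/* 1. Ask the cpuset to migrate pages when tasks are attached. */
	write_str("/dev/cpuset/work/memory_migrate", "1");
	/* 2. Attaching a task now also moves its pages to work's mems. */
	write_str("/dev/cpuset/work/tasks", "1234");
	return 0;
}

Setting the flag by itself migrates nothing; the migration happens in attach_task() above (and in update_nodemask() when the cpuset's mems change).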
@@ -905,11 +1186,14 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
 typedef enum {
 	FILE_ROOT,
 	FILE_DIR,
+	FILE_MEMORY_MIGRATE,
 	FILE_CPULIST,
 	FILE_MEMLIST,
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_NOTIFY_ON_RELEASE,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -960,6 +1244,15 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_NOTIFY_ON_RELEASE:
 		retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
 		break;
+	case FILE_MEMORY_MIGRATE:
+		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		retval = update_memory_pressure_enabled(cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE:
+		retval = -EACCES;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1060,6 +1353,15 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_NOTIFY_ON_RELEASE:
 		*s++ = notify_on_release(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_MIGRATE:
+		*s++ = is_memory_migrate(cs) ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE:
+		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1178,7 +1480,7 @@ static int cpuset_create_file(struct dentry *dentry, int mode)
 
 /*
  *	cpuset_create_dir - create a directory for an object.
- *	cs: 	the cpuset we create the directory for.
+ *	cs:	the cpuset we create the directory for.
  *		It must have a valid ->parent field
  *		And we are going to fill its ->dentry field.
  *	name:	The name to give to the cpuset directory. Will be copied.
@@ -1211,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
 	struct dentry *dentry;
 	int error;
 
-	down(&dir->d_inode->i_sem);
+	mutex_lock(&dir->d_inode->i_mutex);
 	dentry = cpuset_get_dentry(dir, cft->name);
 	if (!IS_ERR(dentry)) {
 		error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1220,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
 		dput(dentry);
 	} else
 		error = PTR_ERR(dentry);
-	up(&dir->d_inode->i_sem);
+	mutex_unlock(&dir->d_inode->i_mutex);
 	return error;
 }
 
@@ -1252,7 +1554,7 @@ struct ctr_struct {
  * when reading out p->cpuset, as we don't really care if it changes
  * on the next cycle, and we are not going to try to dereference it.
  */
-static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
+static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
 {
 	int n = 0;
 	struct task_struct *g, *p;
@@ -1408,6 +1710,21 @@ static struct cftype cft_notify_on_release = {
 	.private = FILE_NOTIFY_ON_RELEASE,
 };
 
+static struct cftype cft_memory_migrate = {
+	.name = "memory_migrate",
+	.private = FILE_MEMORY_MIGRATE,
+};
+
+static struct cftype cft_memory_pressure_enabled = {
+	.name = "memory_pressure_enabled",
+	.private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static struct cftype cft_memory_pressure = {
+	.name = "memory_pressure",
+	.private = FILE_MEMORY_PRESSURE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1422,6 +1739,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1446,7 +1767,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 		return -ENOMEM;
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
@@ -1457,11 +1778,13 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
 
 	down(&callback_sem);
 	list_add(&cs->sibling, &cs->parent->children);
+	number_of_cpusets++;
 	up(&callback_sem);
 
 	err = cpuset_create_dir(cs, name, mode);
@@ -1470,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 
 	/*
 	 * Release manage_sem before cpuset_populate_dir() because it
-	 * will down() this new directory's i_sem and if we race with
+	 * will down() this new directory's i_mutex and if we race with
 	 * another mkdir, we might deadlock.
 	 */
 	up(&manage_sem);
@@ -1489,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 {
 	struct cpuset *c_parent = dentry->d_parent->d_fsdata;
 
-	/* the vfs holds inode->i_sem already */
+	/* the vfs holds inode->i_mutex already */
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
@@ -1500,10 +1823,10 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cpuset *parent;
 	char *pathbuf = NULL;
 
-	/* the vfs holds both inode->i_sem already */
+	/* the vfs holds both inode->i_mutex already */
 
 	down(&manage_sem);
-	refresh_mems();
+	cpuset_update_task_memory_state();
 	if (atomic_read(&cs->count) > 0) {
 		up(&manage_sem);
 		return -EBUSY;
@@ -1524,6 +1847,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	spin_unlock(&d->d_lock);
 	cpuset_d_remove_dir(d);
 	dput(d);
+	number_of_cpusets--;
 	up(&callback_sem);
 	if (list_empty(&parent->children))
 		check_for_release(parent, &pathbuf);
@@ -1532,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	return 0;
 }
 
+/*
+ * cpuset_init_early - just enough so that the calls to
+ * cpuset_update_task_memory_state() in early init code
+ * are harmless.
+ */
+
+int __init cpuset_init_early(void)
+{
+	struct task_struct *tsk = current;
+
+	tsk->cpuset = &top_cpuset;
+	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	return 0;
+}
+
 /**
  * cpuset_init - initialize cpusets at system boot
  *
@@ -1546,6 +1885,7 @@ int __init cpuset_init(void)
 	top_cpuset.cpus_allowed = CPU_MASK_ALL;
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
+	fmeter_init(&top_cpuset.fmeter);
 	atomic_inc(&cpuset_mems_generation);
 	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
 
@@ -1566,7 +1906,11 @@ int __init cpuset_init(void)
 	root->d_inode->i_nlink++;
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
+	number_of_cpusets = 1;
 	err = cpuset_populate_dir(root);
+	/* memory_pressure_enabled is in root cpuset only */
+	if (err == 0)
+		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
 out:
 	return err;
 }
@@ -1632,15 +1976,13 @@ void cpuset_fork(struct task_struct *child)
  *
  * We don't need to task_lock() this reference to tsk->cpuset,
  * because tsk is already marked PF_EXITING, so attach_task() won't
- * mess with it.
+ * mess with it, or task is a failed fork, never visible to attach_task.
  **/
 
 void cpuset_exit(struct task_struct *tsk)
 {
 	struct cpuset *cs;
 
-	BUG_ON(!(tsk->flags & PF_EXITING));
-
 	cs = tsk->cpuset;
 	tsk->cpuset = NULL;
 
@@ -1667,14 +2009,14 @@ void cpuset_exit(struct task_struct *tsk)
  * tasks cpuset.
  **/
 
-cpumask_t cpuset_cpus_allowed(const struct task_struct *tsk)
+cpumask_t cpuset_cpus_allowed(struct task_struct *tsk)
 {
 	cpumask_t mask;
 
 	down(&callback_sem);
-	task_lock((struct task_struct *)tsk);
+	task_lock(tsk);
 	guarantee_online_cpus(tsk->cpuset, &mask);
-	task_unlock((struct task_struct *)tsk);
+	task_unlock(tsk);
 	up(&callback_sem);
 
 	return mask;
@@ -1686,43 +2028,26 @@ void cpuset_init_current_mems_allowed(void)
 }
 
 /**
- * cpuset_update_current_mems_allowed - update mems parameters to new values
- *
- * If the current tasks cpusets mems_allowed changed behind our backs,
- * update current->mems_allowed and mems_generation to the new value.
- * Do not call this routine if in_interrupt().
+ * cpuset_mems_allowed - return mems_allowed mask from a tasks cpuset.
+ * @tsk: pointer to task_struct from which to obtain cpuset->mems_allowed.
  *
- * Call without callback_sem or task_lock() held.  May be called
- * with or without manage_sem held.  Unless exiting, it will acquire
- * task_lock().  Also might acquire callback_sem during call to
- * refresh_mems().
- */
+ * Description: Returns the nodemask_t mems_allowed of the cpuset
+ * attached to the specified @tsk.  Guaranteed to return some non-empty
+ * subset of node_online_map, even if this means going outside the
+ * tasks cpuset.
+ **/
 
-void cpuset_update_current_mems_allowed(void)
+nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
-	struct cpuset *cs;
-	int need_to_refresh = 0;
+	nodemask_t mask;
 
-	task_lock(current);
-	cs = current->cpuset;
-	if (!cs)
-		goto done;
-	if (current->cpuset_mems_generation != cs->mems_generation)
-		need_to_refresh = 1;
-done:
-	task_unlock(current);
-	if (need_to_refresh)
-		refresh_mems();
-}
+	down(&callback_sem);
+	task_lock(tsk);
+	guarantee_online_mems(tsk->cpuset, &mask);
+	task_unlock(tsk);
+	up(&callback_sem);
 
-/**
- * cpuset_restrict_to_mems_allowed - limit nodes to current mems_allowed
- * @nodes: pointer to a node bitmap that is and-ed with mems_allowed
- */
-void cpuset_restrict_to_mems_allowed(unsigned long *nodes)
-{
-	bitmap_and(nodes, nodes, nodes_addr(current->mems_allowed),
-							MAX_NUMNODES);
+	return mask;
 }
 
 /**
@@ -1795,7 +2120,7 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  * GFP_USER   - only nodes in current tasks mems allowed ok.
  **/
 
-int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
@@ -1825,6 +2150,33 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 }
 
 /**
+ * cpuset_lock - lock out any changes to cpuset structures
+ *
+ * The out of memory (oom) code needs to lock down cpusets
+ * from being changed while it scans the tasklist looking for a
+ * task in an overlapping cpuset.  Expose callback_sem via this
+ * cpuset_lock() routine, so the oom code can lock it, before
+ * locking the task list.  The tasklist_lock is a spinlock, so
+ * must be taken inside callback_sem.
+ */
+
+void cpuset_lock(void)
+{
+	down(&callback_sem);
+}
+
+/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Undo the lock taken in a previous cpuset_lock() call.
+ */
+
+void cpuset_unlock(void)
+{
+	up(&callback_sem);
+}
+
+/**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
  *
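cpuset_lock() and cpuset_unlock() above exist only so the OOM killer can pin cpusets while it walks the task list, and the comment fixes the ordering: the sleeping callback_sem must be taken before the tasklist_lock spinlock. A rough userspace analogue of that ordering, with pthread locks standing in for both (names invented; the real caller lives in the OOM code, not in this file; build with -pthread):

#include <pthread.h>
#include <stdio.h>

/* 'sem' stands in for callback_sem (sleeping), 'spin' for tasklist_lock. */
static pthread_mutex_t sem = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t spin;

static void oom_style_scan(void)
{
	pthread_mutex_lock(&sem);	/* cpuset_lock(): may sleep */
	pthread_spin_lock(&spin);	/* tasklist_lock: spins, so taken inside */
	printf("scanning task list with cpusets pinned\n");
	pthread_spin_unlock(&spin);
	pthread_mutex_unlock(&sem);	/* cpuset_unlock() */
}

int main(void)
{
	pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE);
	oom_style_scan();
	pthread_spin_destroy(&spin);
	return 0;
}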
@@ -1833,7 +2185,7 @@ int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
  * determine if task @p's memory usage might impact the memory
  * available to the current task.
  *
- * Acquires callback_sem - not suitable for calling from a fast path.
+ * Call while holding callback_sem.
  **/
 
 int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -1841,8 +2193,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	const struct cpuset *cs1, *cs2;	/* my and p's cpuset ancestors */
 	int overlap = 0;		/* do cpusets overlap? */
 
-	down(&callback_sem);
-
 	task_lock(current);
 	if (current->flags & PF_EXITING) {
 		task_unlock(current);
@@ -1861,12 +2211,46 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
 
 	overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
 done:
-	up(&callback_sem);
-
 	return overlap;
 }
 
 /*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled __read_mostly;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernels page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure".  Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+	struct cpuset *cs;
+
+	task_lock(current);
+	cs = current->cpuset;
+	fmeter_markevent(&cs->fmeter);
+	task_unlock(current);
+}
+
+/*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
  *  - Used for /proc/<pid>/cpuset.