 include/linux/cgroup.h |   4
 include/linux/cpuset.h |  37
 kernel/cgroup.c        | 175
 kernel/cpuset.c        | 162
 mm/hugetlb.c           |   2
 mm/oom_kill.c          |   2
 mm/page_alloc.c        |   6
 mm/slab.c              |   2
 mm/slub.c              |   3
 mm/vmscan.c            |   5
 10 files changed, 200 insertions, 198 deletions
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 641e56494a92..da0dae0600e6 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -638,8 +638,10 @@ struct cgroup_subsys {
 	struct cgroup_subsys_state *(*css_alloc)(struct cgroup_subsys_state *parent_css);
 	int (*css_online)(struct cgroup_subsys_state *css);
 	void (*css_offline)(struct cgroup_subsys_state *css);
+	void (*css_released)(struct cgroup_subsys_state *css);
 	void (*css_free)(struct cgroup_subsys_state *css);
 	void (*css_reset)(struct cgroup_subsys_state *css);
+	void (*css_e_css_changed)(struct cgroup_subsys_state *css);
 
 	int (*can_attach)(struct cgroup_subsys_state *css,
 			  struct cgroup_taskset *tset);
@@ -934,6 +936,8 @@ void css_task_iter_end(struct css_task_iter *it);
 int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
 int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
 
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup,
+					     struct cgroup_subsys *ss);
 struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry,
 						       struct cgroup_subsys *ss);
 
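Two optional callbacks are added to struct cgroup_subsys here: ->css_released(), invoked once a css's reference count has dropped to zero but before the css is freed, and ->css_e_css_changed(), invoked when the effective csses visible from a css's cgroup may have changed. The cgroup_get_e_css() accessor is exported alongside them. A minimal sketch of how a controller might wire these up follows; the "example" controller name and the callback bodies are invented for illustration and are not part of this patch.

	/*
	 * Illustration only -- a hypothetical controller using the new hooks.
	 * Mandatory callbacks such as ->css_alloc()/->css_free() are omitted
	 * for brevity.
	 */
	#include <linux/cgroup.h>

	static void example_css_released(struct cgroup_subsys_state *css)
	{
		/* runs from the release path, after the refcount hit zero */
	}

	static void example_css_e_css_changed(struct cgroup_subsys_state *css)
	{
		/* re-resolve the effective css this controller depends on */
		struct cgroup_subsys_state *e_css;

		e_css = cgroup_get_e_css(css->cgroup, css->ss);
		/* ... refresh any cached per-css state here ... */
		css_put(e_css);
	}

	struct cgroup_subsys example_cgrp_subsys = {
		.css_released		= example_css_released,
		.css_e_css_changed	= example_css_e_css_changed,
	};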
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2f073db7392e..1b357997cac5 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -48,29 +48,16 @@ extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
 void cpuset_init_current_mems_allowed(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
-extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
+extern int __cpuset_node_allowed(int node, gfp_t gfp_mask);
 
-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 ||
-		__cpuset_node_allowed_softwall(node, gfp_mask);
+	return nr_cpusets() <= 1 || __cpuset_node_allowed(node, gfp_mask);
 }
 
-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
-	return nr_cpusets() <= 1 ||
-		__cpuset_node_allowed_hardwall(node, gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
-{
-	return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask);
-}
-
-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
-{
-	return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
+	return cpuset_node_allowed(zone_to_nid(z), gfp_mask);
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -179,22 +166,12 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
-static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
-{
-	return 1;
-}
-
-static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-	return 1;
-}
-
-static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	return 1;
 }
 
-static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	return 1;
 }
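The four softwall/hardwall helpers collapse into a single node/zone pair; callers that want the stricter hardwall behaviour now request it by OR-ing __GFP_HARDWALL into the gfp mask, which is exactly what the mm/ hunks below do. A sketch of the translation; the helper names are invented and only illustrate the mapping, they are not part of the patch.

	#include <linux/cpuset.h>
	#include <linux/gfp.h>
	#include <linux/mmzone.h>

	/* old: cpuset_zone_allowed_hardwall(z, gfp_mask) */
	static inline int example_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
	{
		return cpuset_zone_allowed(z, gfp_mask | __GFP_HARDWALL);
	}

	/* old: cpuset_zone_allowed_softwall(z, gfp_mask) */
	static inline int example_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
	{
		return cpuset_zone_allowed(z, gfp_mask);
	}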
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 136eceadeed1..bb263d0caab3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -277,6 +277,10 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 	if (!(cgrp->root->subsys_mask & (1 << ss->id)))
 		return NULL;
 
+	/*
+	 * This function is used while updating css associations and thus
+	 * can't test the csses directly.  Use ->child_subsys_mask.
+	 */
 	while (cgroup_parent(cgrp) &&
 	       !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id)))
 		cgrp = cgroup_parent(cgrp);
@@ -284,6 +288,39 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp,
 	return cgroup_css(cgrp, ss);
 }
 
+/**
+ * cgroup_get_e_css - get a cgroup's effective css for the specified subsystem
+ * @cgrp: the cgroup of interest
+ * @ss: the subsystem of interest
+ *
+ * Find and get the effective css of @cgrp for @ss.  The effective css is
+ * defined as the matching css of the nearest ancestor including self which
+ * has @ss enabled.  If @ss is not mounted on the hierarchy @cgrp is on,
+ * the root css is returned, so this function always returns a valid css.
+ * The returned css must be put using css_put().
+ */
+struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgrp,
+					     struct cgroup_subsys *ss)
+{
+	struct cgroup_subsys_state *css;
+
+	rcu_read_lock();
+
+	do {
+		css = cgroup_css(cgrp, ss);
+
+		if (css && css_tryget_online(css))
+			goto out_unlock;
+		cgrp = cgroup_parent(cgrp);
+	} while (cgrp);
+
+	css = init_css_set.subsys[ss->id];
+	css_get(css);
+out_unlock:
+	rcu_read_unlock();
+	return css;
+}
+
 /* convenient tests for these bits */
 static inline bool cgroup_is_dead(const struct cgroup *cgrp)
 {
@@ -1019,31 +1056,30 @@ static void cgroup_put(struct cgroup *cgrp)
 }
 
 /**
- * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * cgroup_calc_child_subsys_mask - calculate child_subsys_mask
  * @cgrp: the target cgroup
+ * @subtree_control: the new subtree_control mask to consider
  *
  * On the default hierarchy, a subsystem may request other subsystems to be
  * enabled together through its ->depends_on mask.  In such cases, more
  * subsystems than specified in "cgroup.subtree_control" may be enabled.
  *
- * This function determines which subsystems need to be enabled given the
- * current @cgrp->subtree_control and records it in
- * @cgrp->child_subsys_mask.  The resulting mask is always a superset of
- * @cgrp->subtree_control and follows the usual hierarchy rules.
+ * This function calculates which subsystems need to be enabled if
+ * @subtree_control is to be applied to @cgrp.  The returned mask is always
+ * a superset of @subtree_control and follows the usual hierarchy rules.
  */
-static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+static unsigned int cgroup_calc_child_subsys_mask(struct cgroup *cgrp,
+						  unsigned int subtree_control)
 {
 	struct cgroup *parent = cgroup_parent(cgrp);
-	unsigned int cur_ss_mask = cgrp->subtree_control;
+	unsigned int cur_ss_mask = subtree_control;
 	struct cgroup_subsys *ss;
 	int ssid;
 
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (!cgroup_on_dfl(cgrp)) {
-		cgrp->child_subsys_mask = cur_ss_mask;
-		return;
-	}
+	if (!cgroup_on_dfl(cgrp))
+		return cur_ss_mask;
 
 	while (true) {
 		unsigned int new_ss_mask = cur_ss_mask;
@@ -1067,7 +1103,20 @@ static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
 		cur_ss_mask = new_ss_mask;
 	}
 
-	cgrp->child_subsys_mask = cur_ss_mask;
+	return cur_ss_mask;
+}
+
+/**
+ * cgroup_refresh_child_subsys_mask - update child_subsys_mask
+ * @cgrp: the target cgroup
+ *
+ * Update @cgrp->child_subsys_mask according to the current
+ * @cgrp->subtree_control using cgroup_calc_child_subsys_mask().
+ */
+static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp)
+{
+	cgrp->child_subsys_mask =
+		cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control);
 }
 
 /**
@@ -2641,7 +2690,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 					    loff_t off)
 {
 	unsigned int enable = 0, disable = 0;
-	unsigned int css_enable, css_disable, old_ctrl, new_ctrl;
+	unsigned int css_enable, css_disable, old_sc, new_sc, old_ss, new_ss;
 	struct cgroup *cgrp, *child;
 	struct cgroup_subsys *ss;
 	char *tok;
@@ -2693,36 +2742,6 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 				ret = -ENOENT;
 				goto out_unlock;
 			}
-
-			/*
-			 * @ss is already enabled through dependency and
-			 * we'll just make it visible.  Skip draining.
-			 */
-			if (cgrp->child_subsys_mask & (1 << ssid))
-				continue;
-
-			/*
-			 * Because css offlining is asynchronous, userland
-			 * might try to re-enable the same controller while
-			 * the previous instance is still around.  In such
-			 * cases, wait till it's gone using offline_waitq.
-			 */
-			cgroup_for_each_live_child(child, cgrp) {
-				DEFINE_WAIT(wait);
-
-				if (!cgroup_css(child, ss))
-					continue;
-
-				cgroup_get(child);
-				prepare_to_wait(&child->offline_waitq, &wait,
-						TASK_UNINTERRUPTIBLE);
-				cgroup_kn_unlock(of->kn);
-				schedule();
-				finish_wait(&child->offline_waitq, &wait);
-				cgroup_put(child);
-
-				return restart_syscall();
-			}
 		} else if (disable & (1 << ssid)) {
 			if (!(cgrp->subtree_control & (1 << ssid))) {
 				disable &= ~(1 << ssid);
@@ -2758,19 +2777,48 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 	 * subsystems than specified may need to be enabled or disabled
 	 * depending on subsystem dependencies.
 	 */
-	cgrp->subtree_control |= enable;
-	cgrp->subtree_control &= ~disable;
+	old_sc = cgrp->subtree_control;
+	old_ss = cgrp->child_subsys_mask;
+	new_sc = (old_sc | enable) & ~disable;
+	new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc);
 
-	old_ctrl = cgrp->child_subsys_mask;
-	cgroup_refresh_child_subsys_mask(cgrp);
-	new_ctrl = cgrp->child_subsys_mask;
-
-	css_enable = ~old_ctrl & new_ctrl;
-	css_disable = old_ctrl & ~new_ctrl;
+	css_enable = ~old_ss & new_ss;
+	css_disable = old_ss & ~new_ss;
 	enable |= css_enable;
 	disable |= css_disable;
 
 	/*
+	 * Because css offlining is asynchronous, userland might try to
+	 * re-enable the same controller while the previous instance is
+	 * still around.  In such cases, wait till it's gone using
+	 * offline_waitq.
+	 */
+	for_each_subsys(ss, ssid) {
+		if (!(css_enable & (1 << ssid)))
+			continue;
+
+		cgroup_for_each_live_child(child, cgrp) {
+			DEFINE_WAIT(wait);
+
+			if (!cgroup_css(child, ss))
+				continue;
+
+			cgroup_get(child);
+			prepare_to_wait(&child->offline_waitq, &wait,
+					TASK_UNINTERRUPTIBLE);
+			cgroup_kn_unlock(of->kn);
+			schedule();
+			finish_wait(&child->offline_waitq, &wait);
+			cgroup_put(child);
+
+			return restart_syscall();
+		}
+	}
+
+	cgrp->subtree_control = new_sc;
+	cgrp->child_subsys_mask = new_ss;
+
+	/*
 	 * Create new csses or make the existing ones visible.  A css is
 	 * created invisible if it's being implicitly enabled through
 	 * dependency.  An invisible css is made visible when the userland
@@ -2825,6 +2873,24 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
 		}
 	}
 
+	/*
+	 * The effective csses of all the descendants (excluding @cgrp) may
+	 * have changed.  Subsystems can optionally subscribe to this event
+	 * by implementing ->css_e_css_changed() which is invoked if any of
+	 * the effective csses seen from the css's cgroup may have changed.
+	 */
+	for_each_subsys(ss, ssid) {
+		struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss);
+		struct cgroup_subsys_state *css;
+
+		if (!ss->css_e_css_changed || !this_css)
+			continue;
+
+		css_for_each_descendant_pre(css, this_css)
+			if (css != this_css)
+				ss->css_e_css_changed(css);
+	}
+
 	kernfs_activate(cgrp->kn);
 	ret = 0;
 out_unlock:
@@ -2832,9 +2898,8 @@ out_unlock:
 	return ret ?: nbytes;
 
 err_undo_css:
-	cgrp->subtree_control &= ~enable;
-	cgrp->subtree_control |= disable;
-	cgroup_refresh_child_subsys_mask(cgrp);
+	cgrp->subtree_control = old_sc;
+	cgrp->child_subsys_mask = old_ss;
 
 	for_each_subsys(ss, ssid) {
 		if (!(enable & (1 << ssid)))
@@ -4370,6 +4435,8 @@ static void css_release_work_fn(struct work_struct *work)
 	if (ss) {
 		/* css release path */
 		cgroup_idr_remove(&ss->css_idr, css->id);
+		if (ss->css_released)
+			ss->css_released(css);
 	} else {
 		/* cgroup release path */
 		cgroup_idr_remove(&cgrp->root->cgroup_idr, cgrp->id);
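cgroup_get_e_css() pins the effective css by walking towards the root until css_tryget_online() succeeds, falling back to the init_css_set entry, so callers always receive a valid, referenced css. A hypothetical caller is sketched below; the use of the memory controller (and CONFIG_MEMCG) is only an example, and the function itself is not part of this patch.

	#include <linux/cgroup.h>
	#include <linux/printk.h>

	/* illustration only -- assumes CONFIG_MEMCG for memory_cgrp_subsys */
	static void example_inspect_effective_memcg(struct cgroup *cgrp)
	{
		struct cgroup_subsys_state *css;

		css = cgroup_get_e_css(cgrp, &memory_cgrp_subsys);
		/* never NULL: at worst this is the root css from init_css_set */
		pr_debug("effective memory css id: %d\n", css->id);
		css_put(css);
	}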
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 723cfc9d0ad7..64b257f6bca2 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -248,34 +248,34 @@ static struct cpuset top_cpuset = {
 		if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
 
 /*
- * There are two global mutexes guarding cpuset structures - cpuset_mutex
- * and callback_mutex.  The latter may nest inside the former.  We also
- * require taking task_lock() when dereferencing a task's cpuset pointer.
- * See "The task_lock() exception", at the end of this comment.
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
+ * callback_lock.  We also require taking task_lock() when dereferencing a
+ * task's cpuset pointer.  See "The task_lock() exception", at the end of this
+ * comment.
  *
- * A task must hold both mutexes to modify cpusets.  If a task holds
+ * A task must hold both locks to modify cpusets.  If a task holds
  * cpuset_mutex, then it blocks others wanting that mutex, ensuring that it
- * is the only task able to also acquire callback_mutex and be able to
+ * is the only task able to also acquire callback_lock and be able to
  * modify cpusets.  It can perform various checks on the cpuset structure
  * first, knowing nothing will change.  It can also allocate memory while
  * just holding cpuset_mutex.  While it is performing these checks, various
- * callback routines can briefly acquire callback_mutex to query cpusets.
- * Once it is ready to make the changes, it takes callback_mutex, blocking
+ * callback routines can briefly acquire callback_lock to query cpusets.
+ * Once it is ready to make the changes, it takes callback_lock, blocking
  * everyone else.
  *
  * Calls to the kernel memory allocator can not be made while holding
- * callback_mutex, as that would risk double tripping on callback_mutex
+ * callback_lock, as that would risk double tripping on callback_lock
  * from one of the callbacks into the cpuset code from within
  * __alloc_pages().
  *
- * If a task is only holding callback_mutex, then it has read-only
+ * If a task is only holding callback_lock, then it has read-only
  * access to cpusets.
  *
  * Now, the task_struct fields mems_allowed and mempolicy may be changed
  * by other task, we use alloc_lock in the task_struct fields to protect
  * them.
  *
- * The cpuset_common_file_read() handlers only hold callback_mutex across
+ * The cpuset_common_file_read() handlers only hold callback_lock across
  * small pieces of code, such as when reading out possibly multi-word
  * cpumasks and nodemasks.
  *
@@ -284,7 +284,7 @@ static struct cpuset top_cpuset = {
  */
 
 static DEFINE_MUTEX(cpuset_mutex);
-static DEFINE_MUTEX(callback_mutex);
+static DEFINE_SPINLOCK(callback_lock);
 
 /*
  * CPU / memory hotplug is handled asynchronously.
@@ -329,7 +329,7 @@ static struct file_system_type cpuset_fs_type = {
 * One way or another, we guarantee to return some non-empty subset
 * of cpu_online_mask.
 *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
 */
 static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 {
@@ -347,7 +347,7 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
 * One way or another, we guarantee to return some non-empty subset
 * of node_states[N_MEMORY].
 *
- * Call with callback_mutex held.
+ * Call with callback_lock or cpuset_mutex held.
 */
 static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 {
@@ -359,7 +359,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
 /*
  * update task's spread flag if cpuset's page/slab spread flag is set
  *
- * Called with callback_mutex/cpuset_mutex held
+ * Call with callback_lock or cpuset_mutex held.
  */
 static void cpuset_update_task_spread_flag(struct cpuset *cs,
 					struct task_struct *tsk)
@@ -886,9 +886,9 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cpumask_copy(cp->effective_cpus, new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
@@ -953,9 +953,9 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		return retval;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->cpus_allowed as a temp variable */
 	update_cpumasks_hier(cs, trialcs->cpus_allowed);
@@ -1142,9 +1142,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 			continue;
 		rcu_read_unlock();
 
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		cp->effective_mems = *new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 
 		WARN_ON(!cgroup_on_dfl(cp->css.cgroup) &&
 			!nodes_equal(cp->mems_allowed, cp->effective_mems));
@@ -1165,7 +1165,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
 * mempolicies and if the cpuset is marked 'memory_migrate',
 * migrate the tasks pages to the new memory.
 *
- * Call with cpuset_mutex held.  May take callback_mutex during call.
+ * Call with cpuset_mutex held.  May take callback_lock during call.
 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
 * their mempolicies to the cpusets new mems_allowed.
@@ -1212,9 +1212,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = trialcs->mems_allowed;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/* use trialcs->mems_allowed as a temp variable */
 	update_nodemasks_hier(cs, &cs->mems_allowed);
@@ -1305,9 +1305,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
 	spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs))
 			|| (is_spread_page(cs) != is_spread_page(trialcs)));
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->flags = trialcs->flags;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed)
 		rebuild_sched_domains_locked();
@@ -1714,7 +1714,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 	count = seq_get_buf(sf, &buf);
 	s = buf;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	switch (type) {
 	case FILE_CPULIST:
@@ -1741,7 +1741,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 		seq_commit(sf, -1);
 	}
 out_unlock:
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	return ret;
 }
 
@@ -1958,12 +1958,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 
 	cpuset_inc();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	if (cgroup_on_dfl(cs->css.cgroup)) {
 		cpumask_copy(cs->effective_cpus, parent->effective_cpus);
 		cs->effective_mems = parent->effective_mems;
 	}
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags))
 		goto out_unlock;
@@ -1990,10 +1990,10 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	}
 	rcu_read_unlock();
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cs->mems_allowed = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return 0;
@@ -2032,7 +2032,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 static void cpuset_bind(struct cgroup_subsys_state *root_css)
 {
 	mutex_lock(&cpuset_mutex);
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 
 	if (cgroup_on_dfl(root_css->cgroup)) {
 		cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask);
@@ -2043,7 +2043,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
 		top_cpuset.mems_allowed = top_cpuset.effective_mems;
 	}
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 	mutex_unlock(&cpuset_mutex);
 }
 
@@ -2128,12 +2128,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
 {
 	bool is_empty;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->cpus_allowed, new_cpus);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->mems_allowed = *new_mems;
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	/*
 	 * Don't call update_tasks_cpumask() if the cpuset becomes empty,
@@ -2170,10 +2170,10 @@ hotplug_update_tasks(struct cpuset *cs,
 	if (nodes_empty(*new_mems))
 		*new_mems = parent_cs(cs)->effective_mems;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irq(&callback_lock);
 	cpumask_copy(cs->effective_cpus, new_cpus);
 	cs->effective_mems = *new_mems;
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irq(&callback_lock);
 
 	if (cpus_updated)
 		update_tasks_cpumask(cs);
@@ -2259,21 +2259,21 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
 
 	/* synchronize cpus_allowed to cpu_active_mask */
 	if (cpus_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			cpumask_copy(top_cpuset.cpus_allowed, &new_cpus);
 		cpumask_copy(top_cpuset.effective_cpus, &new_cpus);
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		/* we don't mess with cpumasks of tasks in top_cpuset */
 	}
 
 	/* synchronize mems_allowed to N_MEMORY */
 	if (mems_updated) {
-		mutex_lock(&callback_mutex);
+		spin_lock_irq(&callback_lock);
 		if (!on_dfl)
 			top_cpuset.mems_allowed = new_mems;
 		top_cpuset.effective_mems = new_mems;
-		mutex_unlock(&callback_mutex);
+		spin_unlock_irq(&callback_lock);
 		update_tasks_nodemask(&top_cpuset);
 	}
 
@@ -2366,11 +2366,13 @@ void __init cpuset_init_smp(void)
 
 void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
 {
-	mutex_lock(&callback_mutex);
+	unsigned long flags;
+
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_cpus(task_cs(tsk), pmask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 }
 
 void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
@@ -2416,12 +2418,13 @@ void cpuset_init_current_mems_allowed(void)
 nodemask_t cpuset_mems_allowed(struct task_struct *tsk)
 {
 	nodemask_t mask;
+	unsigned long flags;
 
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 	rcu_read_lock();
 	guarantee_online_mems(task_cs(tsk), &mask);
 	rcu_read_unlock();
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 
 	return mask;
 }
@@ -2440,7 +2443,7 @@ int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 /*
  * nearest_hardwall_ancestor() - Returns the nearest mem_exclusive or
  * mem_hardwall ancestor to the specified cpuset.  Call holding
- * callback_mutex.  If no ancestor is mem_exclusive or mem_hardwall
+ * callback_lock.  If no ancestor is mem_exclusive or mem_hardwall
  * (an unusual configuration), then returns the root cpuset.
  */
 static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
@@ -2451,7 +2454,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 }
 
 /**
- * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * cpuset_node_allowed - Can we allocate on a memory node?
  * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
@@ -2463,13 +2466,6 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 * flag, yes.
 * Otherwise, no.
 *
- * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
- * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
- * might sleep, and might allow a node from an enclosing cpuset.
- *
- * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
- * cpusets, and never sleeps.
- *
 * The __GFP_THISNODE placement logic is really handled elsewhere,
 * by forcibly using a zonelist starting at a specified node, and by
 * (in get_page_from_freelist()) refusing to consider the zones for
@@ -2482,13 +2478,12 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 * GFP_KERNEL allocations are not so marked, so can escape to the
 * nearest enclosing hardwalled ancestor cpuset.
 *
- * Scanning up parent cpusets requires callback_mutex.  The
+ * Scanning up parent cpusets requires callback_lock.  The
 * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
 * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
 * current tasks mems_allowed came up empty on the first pass over
 * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
- * cpuset are short of memory, might require taking the callback_mutex
- * mutex.
+ * cpuset are short of memory, might require taking the callback_lock.
 *
 * The first call here from mm/page_alloc:get_page_from_freelist()
 * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
@@ -2505,20 +2500,15 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs)
 *	TIF_MEMDIE   - any node ok
 *	GFP_KERNEL   - any node in enclosing hardwalled cpuset ok
 *	GFP_USER     - only nodes in current tasks mems allowed ok.
- *
- * Rule:
- *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
- *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
- *    the code that might scan up ancestor cpusets and sleep.
 */
-int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+int __cpuset_node_allowed(int node, gfp_t gfp_mask)
 {
 	struct cpuset *cs;		/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
+	unsigned long flags;
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	/*
@@ -2534,55 +2524,17 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 		return 1;
 
 	/* Not hardwall and node outside mems_allowed: scan up cpusets */
-	mutex_lock(&callback_mutex);
+	spin_lock_irqsave(&callback_lock, flags);
 
 	rcu_read_lock();
 	cs = nearest_hardwall_ancestor(task_cs(current));
 	allowed = node_isset(node, cs->mems_allowed);
 	rcu_read_unlock();
 
-	mutex_unlock(&callback_mutex);
+	spin_unlock_irqrestore(&callback_lock, flags);
 	return allowed;
 }
 
-/*
- * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
- * @node: is this an allowed node?
- * @gfp_mask: memory allocation flags
- *
- * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
- * set, yes, we can always allocate.  If node is in our task's mems_allowed,
- * yes.  If the task has been OOM killed and has access to memory reserves as
- * specified by the TIF_MEMDIE flag, yes.
- * Otherwise, no.
- *
- * The __GFP_THISNODE placement logic is really handled elsewhere,
- * by forcibly using a zonelist starting at a specified node, and by
- * (in get_page_from_freelist()) refusing to consider the zones for
- * any node on the zonelist except the first.  By the time any such
- * calls get to this routine, we should just shut up and say 'yes'.
- *
- * Unlike the cpuset_node_allowed_softwall() variant, above,
- * this variant requires that the node be in the current task's
- * mems_allowed or that we're in interrupt.  It does not scan up the
- * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
- * It never sleeps.
- */
-int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
-{
-	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
-		return 1;
-	if (node_isset(node, current->mems_allowed))
-		return 1;
-	/*
-	 * Allow tasks that have access to memory reserves because they have
-	 * been OOM killed to get memory anywhere.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)))
-		return 1;
-	return 0;
-}
-
 /**
 * cpuset_mem_spread_node() - On which node to begin search for a file page
 * cpuset_slab_spread_node() - On which node to begin search for a slab page
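callback_lock is now a spinlock nested inside cpuset_mutex, so the mask-publishing sections above disable interrupts, reader paths that may be reached from atomic context use the irqsave variants, and __cpuset_node_allowed() correspondingly drops its might_sleep_if() annotation. A sketch of the two acquisition patterns follows; it is only valid inside kernel/cpuset.c, where callback_lock, cpuset_mutex and top_cpuset are defined, and the function names are invented.

	/* illustration only -- mirrors the locking rules described above */
	static void example_publish_mems(struct cpuset *cs, const nodemask_t *new_mems)
	{
		/* writers: cpuset_mutex in process context, callback_lock nested */
		mutex_lock(&cpuset_mutex);
		spin_lock_irq(&callback_lock);
		cs->mems_allowed = *new_mems;
		spin_unlock_irq(&callback_lock);
		mutex_unlock(&cpuset_mutex);
	}

	static int example_read_top_mems(int node)
	{
		unsigned long flags;
		int allowed;

		/* readers reachable from any context: save/restore irq state */
		spin_lock_irqsave(&callback_lock, flags);
		allowed = node_isset(node, top_cpuset.mems_allowed);
		spin_unlock_irqrestore(&callback_lock, flags);
		return allowed;
	}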
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 30cd96879152..919b86a2164d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -582,7 +582,7 @@ retry_cpuset:
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						MAX_NR_ZONES - 1, nodemask) {
-		if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask(h))) {
+		if (cpuset_zone_allowed(zone, htlb_alloc_mask(h))) {
 			page = dequeue_huge_page_node(h, zone_to_nid(zone));
 			if (page) {
 				if (avoid_reserve)
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 3b014d326151..864bba992735 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -233,7 +233,7 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 	/* Check this allocation failure is caused by cpuset's wall function */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 			high_zoneidx, nodemask)
-		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+		if (!cpuset_zone_allowed(zone, gfp_mask))
 			cpuset_limited = true;
 
 	if (cpuset_limited) {
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a7198c065999..df542feaac3b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1990,7 +1990,7 @@ zonelist_scan:
 
 	/*
 	 * Scan zonelist, looking for a zone with enough free.
-	 * See also __cpuset_node_allowed_softwall() comment in kernel/cpuset.c.
+	 * See also __cpuset_node_allowed() comment in kernel/cpuset.c.
 	 */
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
 						high_zoneidx, nodemask) {
@@ -2001,7 +2001,7 @@ zonelist_scan:
 			continue;
 		if (cpusets_enabled() &&
 			(alloc_flags & ALLOC_CPUSET) &&
-			!cpuset_zone_allowed_softwall(zone, gfp_mask))
+			!cpuset_zone_allowed(zone, gfp_mask))
 				continue;
 		/*
 		 * Distribute pages in proportion to the individual
@@ -2529,7 +2529,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
 			alloc_flags |= ALLOC_HARDER;
 		/*
 		 * Ignore cpuset mems for GFP_ATOMIC rather than fail, see the
-		 * comment for __cpuset_node_allowed_softwall().
+		 * comment for __cpuset_node_allowed().
 		 */
 		alloc_flags &= ~ALLOC_CPUSET;
 	} else if (unlikely(rt_task(current)) && !in_interrupt())
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3015,7 +3015,7 @@ retry:
 	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 		nid = zone_to_nid(zone);
 
-		if (cpuset_zone_allowed_hardwall(zone, flags) &&
+		if (cpuset_zone_allowed(zone, flags | __GFP_HARDWALL) &&
 			get_node(cache, nid) &&
 			get_node(cache, nid)->free_objects) {
 				obj = ____cache_alloc_node(cache,
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1665,7 +1665,8 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
 
 			n = get_node(s, zone_to_nid(zone));
 
-			if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
+			if (n && cpuset_zone_allowed(zone,
+						     flags | __GFP_HARDWALL) &&
 					n->nr_partial > s->min_partial) {
 				object = get_partial_node(s, n, c, flags);
 				if (object) {
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4636d9e822c1..a384339bf718 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2405,7 +2405,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		 * to global LRU.
 		 */
 		if (global_reclaim(sc)) {
-			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+			if (!cpuset_zone_allowed(zone,
+						 GFP_KERNEL | __GFP_HARDWALL))
 				continue;
 
 			lru_pages += zone_reclaimable_pages(zone);
@@ -3388,7 +3389,7 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx)
 	if (!populated_zone(zone))
 		return;
 
-	if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+	if (!cpuset_zone_allowed(zone, GFP_KERNEL | __GFP_HARDWALL))
 		return;
 	pgdat = zone->zone_pgdat;
 	if (pgdat->kswapd_max_order < order) {