Diffstat (limited to 'kernel/cpuset.c')

 -rw-r--r--  kernel/cpuset.c  |  152
 1 file changed, 119 insertions(+), 33 deletions(-)

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c86ee051b734..18aea1bd1284 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
  *  Processor and Memory placement constraints for sets of tasks.
  *
  *  Copyright (C) 2003 BULL SA.
- *  Copyright (C) 2004 Silicon Graphics, Inc.
+ *  Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  *  Portions derived from Patrick Mochel's sysfs code.
  *  sysfs is Copyright (c) 2001-3 Patrick Mochel
- *  Portions Copyright (c) 2004 Silicon Graphics, Inc.
  *
- *  2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ *  2003-10-10 Written by Simon Derr.
  *  2003-10-22 Updates by Stephen Hemminger.
- *  2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ *  2004 May-July Rework by Paul Jackson.
  *
  *  This file is subject to the terms and conditions of the GNU General Public
  *  License.  See the file COPYING in the main directory of the Linux
@@ -108,37 +107,49 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
+	CS_NOTIFY_ON_RELEASE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_mem_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_removed(const struct cpuset *cs)
 {
-	return !!test_bit(CS_REMOVED, &cs->flags);
+	return test_bit(CS_REMOVED, &cs->flags);
 }
 
 static inline int notify_on_release(const struct cpuset *cs)
 {
-	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
 /*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer everytime any cpuset changes its
  * mems_allowed value.  Users of cpusets can track this generation
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
  * on every visit to __alloc_pages(), to efficiently check whether
  * its current->cpuset->mems_allowed has changed, requiring an update
  * of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
  */
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
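Note: the generation pattern the comment above describes reduces to a cheap unlocked integer compare, with the mutexes taken only when the generations differ. A simplified sketch of that pattern (paraphrasing cpuset_update_task_memory_state(); not the file's verbatim code):

/*
 * Simplified sketch, for illustration only: compare the task's cached
 * generation against its cpuset's, and reload mems_allowed only when
 * the cpuset has changed generation.
 */
void cpuset_update_task_memory_state(void)
{
	struct task_struct *tsk = current;
	int my_gen = tsk->cpuset->mems_generation;	/* unlocked read */

	if (my_gen == tsk->cpuset_mems_generation)
		return;					/* fast path */

	mutex_lock(&callback_mutex);			/* slow path: reload */
	task_lock(tsk);
	guarantee_online_mems(tsk->cpuset, &tsk->mems_allowed);
	tsk->cpuset_mems_generation = tsk->cpuset->mems_generation;
	task_unlock(tsk);
	mutex_unlock(&callback_mutex);
}

Since every writer of cpuset_mems_generation holds manage_mutex, the plain int suffices; the unlocked read on the fast path can at worst see a stale value and take the slow path once more.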
@@ -657,6 +671,14 @@ void cpuset_update_task_memory_state(void)
 		cs = tsk->cpuset;	/* Maybe changed when task not locked */
 		guarantee_online_mems(cs, &tsk->mems_allowed);
 		tsk->cpuset_mems_generation = cs->mems_generation;
+		if (is_spread_page(cs))
+			tsk->flags |= PF_SPREAD_PAGE;
+		else
+			tsk->flags &= ~PF_SPREAD_PAGE;
+		if (is_spread_slab(cs))
+			tsk->flags |= PF_SPREAD_SLAB;
+		else
+			tsk->flags &= ~PF_SPREAD_SLAB;
 		task_unlock(tsk);
 		mutex_unlock(&callback_mutex);
 		mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -858,8 +880,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs.mems_allowed;
-	atomic_inc(&cpuset_mems_generation);
-	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
 	set_cpuset_being_rebound(cs);	/* causes mpol_copy() rebind */
@@ -957,7 +978,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
 /*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
- *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
+ *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
+ *				CS_SPREAD_PAGE, CS_SPREAD_SLAB)
  * cs:	the cpuset to update
  * buf:	the buffer where we read the 0 or 1
  *
@@ -1188,6 +1210,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -1247,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
+	case FILE_SPREAD_PAGE:
+		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
+	case FILE_SPREAD_SLAB:
+		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1356,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_SPREAD_PAGE:
+		*s++ = is_spread_page(cs) ? '1' : '0';
+		break;
+	case FILE_SPREAD_SLAB:
+		*s++ = is_spread_slab(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1719,6 +1757,16 @@ static struct cftype cft_memory_pressure = {
 	.private = FILE_MEMORY_PRESSURE,
 };
 
+static struct cftype cft_spread_page = {
+	.name = "memory_spread_page",
+	.private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+	.name = "memory_spread_slab",
+	.private = FILE_SPREAD_SLAB,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1737,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
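With the two new control files registered, userspace can toggle spreading per cpuset by writing "0" or "1" to them. A hypothetical helper (the /dev/cpuset mount point and the function name are illustrative, not part of this patch):

/* Hypothetical userspace helper: write "1" to a cpuset control file,
 * e.g. /dev/cpuset/my_set/memory_spread_page. */
#include <fcntl.h>
#include <unistd.h>

static int cpuset_enable(const char *file)
{
	int fd = open(file, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, "1", 1);
	close(fd);
	return (n == 1) ? 0 : -1;
}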
@@ -1765,13 +1817,16 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	if (is_spread_page(parent))
+		set_bit(CS_SPREAD_PAGE, &cs->flags);
+	if (is_spread_slab(parent))
+		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	atomic_set(&cs->count, 0);
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
-	atomic_inc(&cpuset_mems_generation);
-	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1861,7 +1916,7 @@ int __init cpuset_init_early(void)
 	struct task_struct *tsk = current;
 
 	tsk->cpuset = &top_cpuset;
-	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	tsk->cpuset->mems_generation = cpuset_mems_generation++;
 	return 0;
 }
 
@@ -1880,8 +1935,7 @@ int __init cpuset_init(void)
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 	fmeter_init(&top_cpuset.fmeter);
-	atomic_inc(&cpuset_mems_generation);
-	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+	top_cpuset.mems_generation = cpuset_mems_generation++;
 
 	init_task.cpuset = &top_cpuset;
 
@@ -1972,7 +2026,7 @@ void cpuset_fork(struct task_struct *child)
  * because tsk is already marked PF_EXITING, so attach_task() won't
  * mess with it, or task is a failed fork, never visible to attach_task.
  *
- * Hack:
+ * the_top_cpuset_hack:
  *
  *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
  *
@@ -2011,7 +2065,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
+	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
@@ -2151,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-	int allowed = 1;		/* is allocation in zone z allowed? */
+	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt())
 		return 1;
@@ -2204,6 +2258,44 @@ void cpuset_unlock(void)
 }
 
 /**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online.  So it
+ * should not be possible for the following code to return an
+ * offline node.  But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start.  The zonelist passed to
+ * __alloc_pages() will include all nodes.  If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+	int node;
+
+	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	if (node == MAX_NUMNODES)
+		node = first_node(current->mems_allowed);
+	current->cpuset_mem_spread_rotor = node;
+	return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
  * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
  * @p: pointer to task_struct of some other task.
  *
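For context on the consumer side: a hedged sketch of how a page-cache-style allocation could use the new helper. The wrapper name alloc_spread_page() is invented for illustration; alloc_pages_node(), alloc_pages(), and the PF_SPREAD_PAGE flag set by this series are real interfaces.

/* Illustrative caller (not part of this patch): pick the starting
 * node for a page cache page when the task is marked PF_SPREAD_PAGE,
 * otherwise fall back to the usual local-node allocation. */
static struct page *alloc_spread_page(gfp_t gfp_mask)
{
	if (current->flags & PF_SPREAD_PAGE)
		return alloc_pages_node(cpuset_mem_spread_node(), gfp_mask, 0);
	return alloc_pages(gfp_mask, 0);
}

Because cpuset_mem_spread_node() only suggests where the search starts, a caller like this needs no fallback logic of its own; the zonelist and slab allocator handle offline or exhausted nodes.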
@@ -2284,12 +2376,12 @@ void __cpuset_memory_pressure_bump(void)
 *  - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *    doesn't really matter if tsk->cpuset changes after we read it,
 *    and we take manage_mutex, keeping attach_task() from changing it
- *    anyway.
+ *    anyway.  No need to check that tsk->cpuset != NULL, thanks to
+ *    the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
+ *    cpuset to top_cpuset.
  */
-
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
-	struct cpuset *cs;
 	struct task_struct *tsk;
 	char *buf;
 	int retval = 0;
@@ -2300,13 +2392,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 
 	tsk = m->private;
 	mutex_lock(&manage_mutex);
-	cs = tsk->cpuset;
-	if (!cs) {
-		retval = -EINVAL;
-		goto out;
-	}
-
-	retval = cpuset_path(cs, buf, PAGE_SIZE);
+	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
 		goto out;
 	seq_puts(m, buf);