Diffstat (limited to 'kernel/cpuset.c')
 kernel/cpuset.c | 152 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 119 insertions(+), 33 deletions(-)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c86ee051b734..18aea1bd1284 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
  * Processor and Memory placement constraints for sets of tasks.
  *
  * Copyright (C) 2003 BULL SA.
- * Copyright (C) 2004 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  * Portions derived from Patrick Mochel's sysfs code.
  * sysfs is Copyright (c) 2001-3 Patrick Mochel
- * Portions Copyright (c) 2004 Silicon Graphics, Inc.
  *
- * 2003-10-10 Written by Simon Derr <simon.derr@bull.net>
+ * 2003-10-10 Written by Simon Derr.
  * 2003-10-22 Updates by Stephen Hemminger.
- * 2004 May-July Rework by Paul Jackson <pj@sgi.com>
+ * 2004 May-July Rework by Paul Jackson.
  *
  * This file is subject to the terms and conditions of the GNU General Public
  * License. See the file COPYING in the main directory of the Linux
@@ -108,37 +107,49 @@ typedef enum {
 	CS_MEM_EXCLUSIVE,
 	CS_MEMORY_MIGRATE,
 	CS_REMOVED,
-	CS_NOTIFY_ON_RELEASE
+	CS_NOTIFY_ON_RELEASE,
+	CS_SPREAD_PAGE,
+	CS_SPREAD_SLAB,
 } cpuset_flagbits_t;
 
 /* convenient tests for these bits */
 static inline int is_cpu_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_CPU_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_mem_exclusive(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
+	return test_bit(CS_MEM_EXCLUSIVE, &cs->flags);
 }
 
 static inline int is_removed(const struct cpuset *cs)
 {
-	return !!test_bit(CS_REMOVED, &cs->flags);
+	return test_bit(CS_REMOVED, &cs->flags);
 }
 
 static inline int notify_on_release(const struct cpuset *cs)
 {
-	return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	return test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
 }
 
 static inline int is_memory_migrate(const struct cpuset *cs)
 {
-	return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+	return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
+}
+
+static inline int is_spread_page(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_PAGE, &cs->flags);
+}
+
+static inline int is_spread_slab(const struct cpuset *cs)
+{
+	return test_bit(CS_SPREAD_SLAB, &cs->flags);
 }
 
 /*
- * Increment this atomic integer everytime any cpuset changes its
+ * Increment this integer everytime any cpuset changes its
  * mems_allowed value. Users of cpusets can track this generation
  * number, and avoid having to lock and reload mems_allowed unless
  * the cpuset they're using changes generation.
@@ -152,8 +163,11 @@ static inline int is_memory_migrate(const struct cpuset *cs)
 * on every visit to __alloc_pages(), to efficiently check whether
 * its current->cpuset->mems_allowed has changed, requiring an update
 * of its current->mems_allowed.
+ *
+ * Since cpuset_mems_generation is guarded by manage_mutex,
+ * there is no need to mark it atomic.
 */
-static atomic_t cpuset_mems_generation = ATOMIC_INIT(1);
+static int cpuset_mems_generation;
 
 static struct cpuset top_cpuset = {
 	.flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
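What the plain counter buys: every writer bumps cpuset_mems_generation under manage_mutex, so an ordinary int suffices, while readers get a cheap lockless comparison before taking any lock. A minimal sketch of that reader fast path, assuming kernel context; the helper name is illustrative, and in the actual code this check lives inside cpuset_update_task_memory_state() itself:

	/* Illustrative helper, not part of the patch: refresh the task's
	 * mems_allowed only when its cached generation is stale. */
	static inline void cpuset_refresh_if_stale(struct task_struct *tsk)
	{
		/* lockless read; worst case we take the slow path once extra */
		if (tsk->cpuset_mems_generation != tsk->cpuset->mems_generation)
			cpuset_update_task_memory_state();
	}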
@@ -657,6 +671,14 @@ void cpuset_update_task_memory_state(void)
 	cs = tsk->cpuset;	/* Maybe changed when task not locked */
 	guarantee_online_mems(cs, &tsk->mems_allowed);
 	tsk->cpuset_mems_generation = cs->mems_generation;
+	if (is_spread_page(cs))
+		tsk->flags |= PF_SPREAD_PAGE;
+	else
+		tsk->flags &= ~PF_SPREAD_PAGE;
+	if (is_spread_slab(cs))
+		tsk->flags |= PF_SPREAD_SLAB;
+	else
+		tsk->flags &= ~PF_SPREAD_SLAB;
 	task_unlock(tsk);
 	mutex_unlock(&callback_mutex);
 	mpol_rebind_task(tsk, &tsk->mems_allowed);
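With the two PF_* bits kept in sync with the cpuset, allocators can test current->flags without touching the cpuset itself. A hedged sketch of the kind of call site this enables; the actual hook-up into page cache and slab allocation is done in companion patches, and this function name is hypothetical:

	/* Hypothetical call site: choose the starting node for a page
	 * cache page.  alloc_pages_node()/alloc_pages() are the stock
	 * allocators; cpuset_mem_spread_node() is added later in this
	 * patch. */
	static struct page *pagecache_alloc_sketch(gfp_t gfp_mask)
	{
		if (current->flags & PF_SPREAD_PAGE)
			return alloc_pages_node(cpuset_mem_spread_node(),
						gfp_mask, 0);
		return alloc_pages(gfp_mask, 0);
	}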
@@ -858,8 +880,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
858 880
859 mutex_lock(&callback_mutex); 881 mutex_lock(&callback_mutex);
860 cs->mems_allowed = trialcs.mems_allowed; 882 cs->mems_allowed = trialcs.mems_allowed;
861 atomic_inc(&cpuset_mems_generation); 883 cs->mems_generation = cpuset_mems_generation++;
862 cs->mems_generation = atomic_read(&cpuset_mems_generation);
863 mutex_unlock(&callback_mutex); 884 mutex_unlock(&callback_mutex);
864 885
865 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ 886 set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */
@@ -957,7 +978,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
957/* 978/*
958 * update_flag - read a 0 or a 1 in a file and update associated flag 979 * update_flag - read a 0 or a 1 in a file and update associated flag
959 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 980 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
960 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) 981 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
982 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
961 * cs: the cpuset to update 983 * cs: the cpuset to update
962 * buf: the buffer where we read the 0 or 1 984 * buf: the buffer where we read the 0 or 1
963 * 985 *
@@ -1188,6 +1210,8 @@ typedef enum {
 	FILE_NOTIFY_ON_RELEASE,
 	FILE_MEMORY_PRESSURE_ENABLED,
 	FILE_MEMORY_PRESSURE,
+	FILE_SPREAD_PAGE,
+	FILE_SPREAD_SLAB,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -1247,6 +1271,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_PRESSURE:
 		retval = -EACCES;
 		break;
+	case FILE_SPREAD_PAGE:
+		retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
+	case FILE_SPREAD_SLAB:
+		retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
+		cs->mems_generation = cpuset_mems_generation++;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1356,6 +1388,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_PRESSURE:
 		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
 		break;
+	case FILE_SPREAD_PAGE:
+		*s++ = is_spread_page(cs) ? '1' : '0';
+		break;
+	case FILE_SPREAD_SLAB:
+		*s++ = is_spread_slab(cs) ? '1' : '0';
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
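Together, the write and read handlers give the two new files the usual cpuset control-file protocol: write an ASCII '0' or '1', read back a single digit. A small userspace sketch, assuming cpusets mounted at /dev/cpuset and an already-created cpuset whose path here ('/dev/cpuset/example') is hypothetical:

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char c;
		int fd;

		/* Path is illustrative; create the cpuset first. */
		fd = open("/dev/cpuset/example/memory_spread_page", O_WRONLY);
		if (fd < 0) {
			perror("open");
			return 1;
		}
		write(fd, "1", 1);	/* enable page cache spreading */
		close(fd);

		fd = open("/dev/cpuset/example/memory_spread_page", O_RDONLY);
		read(fd, &c, 1);	/* reads back '1' */
		close(fd);
		printf("memory_spread_page = %c\n", c);
		return 0;
	}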
@@ -1719,6 +1757,16 @@ static struct cftype cft_memory_pressure = {
 	.private = FILE_MEMORY_PRESSURE,
 };
 
+static struct cftype cft_spread_page = {
+	.name = "memory_spread_page",
+	.private = FILE_SPREAD_PAGE,
+};
+
+static struct cftype cft_spread_slab = {
+	.name = "memory_spread_slab",
+	.private = FILE_SPREAD_SLAB,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1737,6 +1785,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
+		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1765,13 +1817,16 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	cs->flags = 0;
 	if (notify_on_release(parent))
 		set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
+	if (is_spread_page(parent))
+		set_bit(CS_SPREAD_PAGE, &cs->flags);
+	if (is_spread_slab(parent))
+		set_bit(CS_SPREAD_SLAB, &cs->flags);
 	cs->cpus_allowed = CPU_MASK_NONE;
 	cs->mems_allowed = NODE_MASK_NONE;
 	atomic_set(&cs->count, 0);
 	INIT_LIST_HEAD(&cs->sibling);
 	INIT_LIST_HEAD(&cs->children);
-	atomic_inc(&cpuset_mems_generation);
-	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	cs->mems_generation = cpuset_mems_generation++;
 	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
@@ -1861,7 +1916,7 @@ int __init cpuset_init_early(void)
 	struct task_struct *tsk = current;
 
 	tsk->cpuset = &top_cpuset;
-	tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation);
+	tsk->cpuset->mems_generation = cpuset_mems_generation++;
 	return 0;
 }
 
@@ -1880,8 +1935,7 @@ int __init cpuset_init(void)
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
 	fmeter_init(&top_cpuset.fmeter);
-	atomic_inc(&cpuset_mems_generation);
-	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
+	top_cpuset.mems_generation = cpuset_mems_generation++;
 
 	init_task.cpuset = &top_cpuset;
 
@@ -1972,7 +2026,7 @@ void cpuset_fork(struct task_struct *child)
 * because tsk is already marked PF_EXITING, so attach_task() won't
 * mess with it, or task is a failed fork, never visible to attach_task.
 *
- * Hack:
+ * the_top_cpuset_hack:
 *
 *    Set the exiting tasks cpuset to the root cpuset (top_cpuset).
 *
@@ -2011,7 +2065,7 @@ void cpuset_exit(struct task_struct *tsk)
 	struct cpuset *cs;
 
 	cs = tsk->cpuset;
-	tsk->cpuset = &top_cpuset;	/* Hack - see comment above */
+	tsk->cpuset = &top_cpuset;	/* the_top_cpuset_hack - see above */
 
 	if (notify_on_release(cs)) {
 		char *pathbuf = NULL;
@@ -2151,7 +2205,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
 {
 	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
-	int allowed = 1;		/* is allocation in zone z allowed? */
+	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt())
 		return 1;
@@ -2204,6 +2258,44 @@ void cpuset_unlock(void)
 }
 
 /**
+ * cpuset_mem_spread_node() - On which node to begin search for a page
+ *
+ * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
+ * tasks in a cpuset with is_spread_page or is_spread_slab set),
+ * and if the memory allocation used cpuset_mem_spread_node()
+ * to determine on which node to start looking, as it will for
+ * certain page cache or slab cache pages such as used for file
+ * system buffers and inode caches, then instead of starting on the
+ * local node to look for a free page, rather spread the starting
+ * node around the tasks mems_allowed nodes.
+ *
+ * We don't have to worry about the returned node being offline
+ * because "it can't happen", and even if it did, it would be ok.
+ *
+ * The routines calling guarantee_online_mems() are careful to
+ * only set nodes in task->mems_allowed that are online. So it
+ * should not be possible for the following code to return an
+ * offline node. But if it did, that would be ok, as this routine
+ * is not returning the node where the allocation must be, only
+ * the node where the search should start. The zonelist passed to
+ * __alloc_pages() will include all nodes. If the slab allocator
+ * is passed an offline node, it will fall back to the local node.
+ * See kmem_cache_alloc_node().
+ */
+
+int cpuset_mem_spread_node(void)
+{
+	int node;
+
+	node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
+	if (node == MAX_NUMNODES)
+		node = first_node(current->mems_allowed);
+	current->cpuset_mem_spread_rotor = node;
+	return node;
+}
+EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
+
+/**
 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
 * @p: pointer to task_struct of some other task.
 *
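The per-task rotor walks the allowed nodes round-robin, wrapping from the highest set node back to the lowest. A userspace model of the same arithmetic, with a plain bitmask standing in for nodemask_t and all names invented for the simulation: with nodes {0, 1, 3} allowed and the rotor starting at 0, successive calls return 1, 3, 0, 1, 3, 0.

	#include <stdio.h>

	#define MAX_NUMNODES 8		/* small, for the simulation */

	/* Model of next_node(): first set bit strictly after n,
	 * else MAX_NUMNODES. */
	static int next_node_sim(int n, unsigned mask)
	{
		for (n++; n < MAX_NUMNODES; n++)
			if (mask & (1u << n))
				return n;
		return MAX_NUMNODES;
	}

	/* Model of first_node(): lowest set bit. */
	static int first_node_sim(unsigned mask)
	{
		return next_node_sim(-1, mask);
	}

	int main(void)
	{
		unsigned mems_allowed = 0x0b;	/* nodes {0, 1, 3} */
		int rotor = 0, i;

		for (i = 0; i < 6; i++) {
			int node = next_node_sim(rotor, mems_allowed);
			if (node == MAX_NUMNODES)
				node = first_node_sim(mems_allowed);
			rotor = node;
			printf("%d ", node);	/* prints: 1 3 0 1 3 0 */
		}
		printf("\n");
		return 0;
	}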
@@ -2284,12 +2376,12 @@ void __cpuset_memory_pressure_bump(void)
 * - No need to task_lock(tsk) on this tsk->cpuset reference, as it
 *   doesn't really matter if tsk->cpuset changes after we read it,
 *   and we take manage_mutex, keeping attach_task() from changing it
- *   anyway.
+ *   anyway. No need to check that tsk->cpuset != NULL, thanks to
+ *   the_top_cpuset_hack in cpuset_exit(), which sets an exiting tasks
+ *   cpuset to top_cpuset.
 */
-
 static int proc_cpuset_show(struct seq_file *m, void *v)
 {
-	struct cpuset *cs;
 	struct task_struct *tsk;
 	char *buf;
 	int retval = 0;
@@ -2300,13 +2392,7 @@ static int proc_cpuset_show(struct seq_file *m, void *v)
 
 	tsk = m->private;
 	mutex_lock(&manage_mutex);
-	cs = tsk->cpuset;
-	if (!cs) {
-		retval = -EINVAL;
-		goto out;
-	}
-
-	retval = cpuset_path(cs, buf, PAGE_SIZE);
+	retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
 	if (retval < 0)
 		goto out;
 	seq_puts(m, buf);