aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorPaul Jackson <pj@sgi.com>2006-03-24 06:16:03 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-24 10:33:22 -0500
commit825a46af5ac171f9f41f794a0a00165588ba1589 (patch)
treeb690fe9d809d7b047f0393097fc79892e1217d98 /kernel
parent8a39cc60bfa5a72f32d975729a354daca124f6de (diff)
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative memory allocation policy that can be applied to certain kinds of memory allocations, such as the page cache (file system buffers) and some slab caches (such as inode caches). The policy is called "memory spreading." If enabled, it spreads out these kinds of memory allocations over all the nodes allowed to a task, instead of preferring to place them on the node where the task is executing. All other kinds of allocations, including anonymous pages for a task's stack and data regions, are not affected by this policy choice, and continue to be allocated preferring the node local to execution, as modified by the NUMA mempolicy. There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in-kernel data structures. They are called 'memory_spread_page' and 'memory_spread_slab'. If the per-cpuset boolean flag file 'memory_spread_page' is set, then the kernel will spread the file system buffers (page cache) evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the kernel will spread some file-system-related slab caches, such as for inodes and dentries, evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. The implementation is simple. Setting the cpuset flags 'memory_spread_page' or 'memory_spread_slab' turns on the per-process flags PF_SPREAD_PAGE or PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or subsequently joins that cpuset. 
In subsequent patches, the page allocation calls for the affected page cache and slab caches are modified to perform an inline check for these flags, and if set, a call to a new routine cpuset_mem_spread_node() returns the node to prefer for the allocation. The cpuset_mem_spread_node() routine is also simple. It uses the value of a per-task rotor cpuset_mem_spread_rotor to select the next node in the current task's mems_allowed to prefer for the allocation. This policy can provide substantial improvements for jobs that need to place thread-local data on the corresponding node, but that need to access large file system data sets that need to be spread across the several nodes in the job's cpuset in order to fit. Without this patch, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the job's cpuset can become very uneven. A couple of Copyright year ranges are updated as well. And a couple of email addresses that can be found in the MAINTAINERS file are removed. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/cpuset.c104
1 files changed, 98 insertions, 6 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 44d13c246e5c..38f18b33de6c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -4,15 +4,14 @@
4 * Processor and Memory placement constraints for sets of tasks. 4 * Processor and Memory placement constraints for sets of tasks.
5 * 5 *
6 * Copyright (C) 2003 BULL SA. 6 * Copyright (C) 2003 BULL SA.
7 * Copyright (C) 2004 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 * Portions derived from Patrick Mochel's sysfs code. 9 * Portions derived from Patrick Mochel's sysfs code.
10 * sysfs is Copyright (c) 2001-3 Patrick Mochel 10 * sysfs is Copyright (c) 2001-3 Patrick Mochel
11 * Portions Copyright (c) 2004 Silicon Graphics, Inc.
12 * 11 *
13 * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> 12 * 2003-10-10 Written by Simon Derr.
14 * 2003-10-22 Updates by Stephen Hemminger. 13 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson <pj@sgi.com> 14 * 2004 May-July Rework by Paul Jackson.
16 * 15 *
17 * This file is subject to the terms and conditions of the GNU General Public 16 * This file is subject to the terms and conditions of the GNU General Public
18 * License. See the file COPYING in the main directory of the Linux 17 * License. See the file COPYING in the main directory of the Linux
@@ -108,7 +107,9 @@ typedef enum {
108 CS_MEM_EXCLUSIVE, 107 CS_MEM_EXCLUSIVE,
109 CS_MEMORY_MIGRATE, 108 CS_MEMORY_MIGRATE,
110 CS_REMOVED, 109 CS_REMOVED,
111 CS_NOTIFY_ON_RELEASE 110 CS_NOTIFY_ON_RELEASE,
111 CS_SPREAD_PAGE,
112 CS_SPREAD_SLAB,
112} cpuset_flagbits_t; 113} cpuset_flagbits_t;
113 114
114/* convenient tests for these bits */ 115/* convenient tests for these bits */
@@ -137,6 +138,16 @@ static inline int is_memory_migrate(const struct cpuset *cs)
137 return test_bit(CS_MEMORY_MIGRATE, &cs->flags); 138 return test_bit(CS_MEMORY_MIGRATE, &cs->flags);
138} 139}
139 140
141static inline int is_spread_page(const struct cpuset *cs)
142{
143 return test_bit(CS_SPREAD_PAGE, &cs->flags);
144}
145
146static inline int is_spread_slab(const struct cpuset *cs)
147{
148 return test_bit(CS_SPREAD_SLAB, &cs->flags);
149}
150
140/* 151/*
141 * Increment this atomic integer everytime any cpuset changes its 152 * Increment this atomic integer everytime any cpuset changes its
142 * mems_allowed value. Users of cpusets can track this generation 153 * mems_allowed value. Users of cpusets can track this generation
@@ -657,6 +668,14 @@ void cpuset_update_task_memory_state(void)
657 cs = tsk->cpuset; /* Maybe changed when task not locked */ 668 cs = tsk->cpuset; /* Maybe changed when task not locked */
658 guarantee_online_mems(cs, &tsk->mems_allowed); 669 guarantee_online_mems(cs, &tsk->mems_allowed);
659 tsk->cpuset_mems_generation = cs->mems_generation; 670 tsk->cpuset_mems_generation = cs->mems_generation;
671 if (is_spread_page(cs))
672 tsk->flags |= PF_SPREAD_PAGE;
673 else
674 tsk->flags &= ~PF_SPREAD_PAGE;
675 if (is_spread_slab(cs))
676 tsk->flags |= PF_SPREAD_SLAB;
677 else
678 tsk->flags &= ~PF_SPREAD_SLAB;
660 task_unlock(tsk); 679 task_unlock(tsk);
661 mutex_unlock(&callback_mutex); 680 mutex_unlock(&callback_mutex);
662 mpol_rebind_task(tsk, &tsk->mems_allowed); 681 mpol_rebind_task(tsk, &tsk->mems_allowed);
@@ -956,7 +975,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
956/* 975/*
957 * update_flag - read a 0 or a 1 in a file and update associated flag 976 * update_flag - read a 0 or a 1 in a file and update associated flag
958 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, 977 * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
959 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) 978 * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE,
979 * CS_SPREAD_PAGE, CS_SPREAD_SLAB)
960 * cs: the cpuset to update 980 * cs: the cpuset to update
961 * buf: the buffer where we read the 0 or 1 981 * buf: the buffer where we read the 0 or 1
962 * 982 *
@@ -1187,6 +1207,8 @@ typedef enum {
1187 FILE_NOTIFY_ON_RELEASE, 1207 FILE_NOTIFY_ON_RELEASE,
1188 FILE_MEMORY_PRESSURE_ENABLED, 1208 FILE_MEMORY_PRESSURE_ENABLED,
1189 FILE_MEMORY_PRESSURE, 1209 FILE_MEMORY_PRESSURE,
1210 FILE_SPREAD_PAGE,
1211 FILE_SPREAD_SLAB,
1190 FILE_TASKLIST, 1212 FILE_TASKLIST,
1191} cpuset_filetype_t; 1213} cpuset_filetype_t;
1192 1214
@@ -1246,6 +1268,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
1246 case FILE_MEMORY_PRESSURE: 1268 case FILE_MEMORY_PRESSURE:
1247 retval = -EACCES; 1269 retval = -EACCES;
1248 break; 1270 break;
1271 case FILE_SPREAD_PAGE:
1272 retval = update_flag(CS_SPREAD_PAGE, cs, buffer);
1273 cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
1274 break;
1275 case FILE_SPREAD_SLAB:
1276 retval = update_flag(CS_SPREAD_SLAB, cs, buffer);
1277 cs->mems_generation = atomic_inc_return(&cpuset_mems_generation);
1278 break;
1249 case FILE_TASKLIST: 1279 case FILE_TASKLIST:
1250 retval = attach_task(cs, buffer, &pathbuf); 1280 retval = attach_task(cs, buffer, &pathbuf);
1251 break; 1281 break;
@@ -1355,6 +1385,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
1355 case FILE_MEMORY_PRESSURE: 1385 case FILE_MEMORY_PRESSURE:
1356 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); 1386 s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
1357 break; 1387 break;
1388 case FILE_SPREAD_PAGE:
1389 *s++ = is_spread_page(cs) ? '1' : '0';
1390 break;
1391 case FILE_SPREAD_SLAB:
1392 *s++ = is_spread_slab(cs) ? '1' : '0';
1393 break;
1358 default: 1394 default:
1359 retval = -EINVAL; 1395 retval = -EINVAL;
1360 goto out; 1396 goto out;
@@ -1718,6 +1754,16 @@ static struct cftype cft_memory_pressure = {
1718 .private = FILE_MEMORY_PRESSURE, 1754 .private = FILE_MEMORY_PRESSURE,
1719}; 1755};
1720 1756
1757static struct cftype cft_spread_page = {
1758 .name = "memory_spread_page",
1759 .private = FILE_SPREAD_PAGE,
1760};
1761
1762static struct cftype cft_spread_slab = {
1763 .name = "memory_spread_slab",
1764 .private = FILE_SPREAD_SLAB,
1765};
1766
1721static int cpuset_populate_dir(struct dentry *cs_dentry) 1767static int cpuset_populate_dir(struct dentry *cs_dentry)
1722{ 1768{
1723 int err; 1769 int err;
@@ -1736,6 +1782,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
1736 return err; 1782 return err;
1737 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) 1783 if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
1738 return err; 1784 return err;
1785 if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0)
1786 return err;
1787 if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0)
1788 return err;
1739 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) 1789 if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
1740 return err; 1790 return err;
1741 return 0; 1791 return 0;
@@ -1764,6 +1814,10 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
1764 cs->flags = 0; 1814 cs->flags = 0;
1765 if (notify_on_release(parent)) 1815 if (notify_on_release(parent))
1766 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); 1816 set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
1817 if (is_spread_page(parent))
1818 set_bit(CS_SPREAD_PAGE, &cs->flags);
1819 if (is_spread_slab(parent))
1820 set_bit(CS_SPREAD_SLAB, &cs->flags);
1767 cs->cpus_allowed = CPU_MASK_NONE; 1821 cs->cpus_allowed = CPU_MASK_NONE;
1768 cs->mems_allowed = NODE_MASK_NONE; 1822 cs->mems_allowed = NODE_MASK_NONE;
1769 atomic_set(&cs->count, 0); 1823 atomic_set(&cs->count, 0);
@@ -2201,6 +2255,44 @@ void cpuset_unlock(void)
2201} 2255}
2202 2256
2203/** 2257/**
2258 * cpuset_mem_spread_node() - On which node to begin search for a page
2259 *
2260 * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for
2261 * tasks in a cpuset with is_spread_page or is_spread_slab set),
2262 * and if the memory allocation used cpuset_mem_spread_node()
2263 * to determine on which node to start looking, as it will for
2264 * certain page cache or slab cache pages such as used for file
2265 * system buffers and inode caches, then instead of starting on the
2266 * local node to look for a free page, rather spread the starting
2267 * node around the task's mems_allowed nodes.
2268 *
2269 * We don't have to worry about the returned node being offline
2270 * because "it can't happen", and even if it did, it would be ok.
2271 *
2272 * The routines calling guarantee_online_mems() are careful to
2273 * only set nodes in task->mems_allowed that are online. So it
2274 * should not be possible for the following code to return an
2275 * offline node. But if it did, that would be ok, as this routine
2276 * is not returning the node where the allocation must be, only
2277 * the node where the search should start. The zonelist passed to
2278 * __alloc_pages() will include all nodes. If the slab allocator
2279 * is passed an offline node, it will fall back to the local node.
2280 * See kmem_cache_alloc_node().
2281 */
2282
2283int cpuset_mem_spread_node(void)
2284{
2285 int node;
2286
2287 node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed);
2288 if (node == MAX_NUMNODES)
2289 node = first_node(current->mems_allowed);
2290 current->cpuset_mem_spread_rotor = node;
2291 return node;
2292}
2293EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2294
2295/**
2204 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? 2296 * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
2205 * @p: pointer to task_struct of some other task. 2297 * @p: pointer to task_struct of some other task.
2206 * 2298 *