aboutsummaryrefslogtreecommitdiffstats
path: root/include/linux
diff options
context:
space:
mode:
authorPaul Jackson <pj@sgi.com>2006-03-24 06:16:03 -0500
committerLinus Torvalds <torvalds@g5.osdl.org>2006-03-24 10:33:22 -0500
commit825a46af5ac171f9f41f794a0a00165588ba1589 (patch)
treeb690fe9d809d7b047f0393097fc79892e1217d98 /include/linux
parent8a39cc60bfa5a72f32d975729a354daca124f6de (diff)
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative memory allocation policy that can be applied to certain kinds of memory allocations, such as the page cache (file system buffers) and some slab caches (such as inode caches). The policy is called "memory spreading." If enabled, it spreads out these kinds of memory allocations over all the nodes allowed to a task, instead of preferring to place them on the node where the task is executing. All other kinds of allocations, including anonymous pages for a tasks stack and data regions, are not affected by this policy choice, and continue to be allocated preferring the node local to execution, as modified by the NUMA mempolicy. There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in kernel data structures. They are called 'memory_spread_page' and 'memory_spread_slab'. If the per-cpuset boolean flag file 'memory_spread_page' is set, then the kernel will spread the file system buffers (page cache) evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the kernel will spread some file system related slab caches, such as for inodes and dentries evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. The implementation is simple. Setting the cpuset flags 'memory_spread_page' or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or subsequently joins that cpuset. In subsequent patches, the page allocation calls for the affected page cache and slab caches are modified to perform an inline check for these flags, and if set, a call to a new routine cpuset_mem_spread_node() returns the node to prefer for the allocation. The cpuset_mem_spread_node() routine is also simple. It uses the value of a per-task rotor cpuset_mem_spread_rotor to select the next node in the current tasks mems_allowed to prefer for the allocation. This policy can provide substantial improvements for jobs that need to place thread local data on the corresponding node, but that need to access large file system data sets that need to be spread across the several nodes in the jobs cpuset in order to fit. Without this patch, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the jobs cpuset can become very uneven. A couple of Copyright year ranges are updated as well. And a couple of email addresses that can be found in the MAINTAINERS file are removed. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include/linux')
-rw-r--r--include/linux/cpuset.h29
-rw-r--r--include/linux/sched.h3
2 files changed, 31 insertions, 1 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3bc606927116..9354722a9217 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -4,7 +4,7 @@
4 * cpuset interface 4 * cpuset interface
5 * 5 *
6 * Copyright (C) 2003 BULL SA 6 * Copyright (C) 2003 BULL SA
7 * Copyright (C) 2004 Silicon Graphics, Inc. 7 * Copyright (C) 2004-2006 Silicon Graphics, Inc.
8 * 8 *
9 */ 9 */
10 10
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
51extern void cpuset_lock(void); 51extern void cpuset_lock(void);
52extern void cpuset_unlock(void); 52extern void cpuset_unlock(void);
53 53
54extern int cpuset_mem_spread_node(void);
55
56static inline int cpuset_do_page_mem_spread(void)
57{
58 return current->flags & PF_SPREAD_PAGE;
59}
60
61static inline int cpuset_do_slab_mem_spread(void)
62{
63 return current->flags & PF_SPREAD_SLAB;
64}
65
54#else /* !CONFIG_CPUSETS */ 66#else /* !CONFIG_CPUSETS */
55 67
56static inline int cpuset_init_early(void) { return 0; } 68static inline int cpuset_init_early(void) { return 0; }
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
99static inline void cpuset_lock(void) {} 111static inline void cpuset_lock(void) {}
100static inline void cpuset_unlock(void) {} 112static inline void cpuset_unlock(void) {}
101 113
114static inline int cpuset_mem_spread_node(void)
115{
116 return 0;
117}
118
119static inline int cpuset_do_page_mem_spread(void)
120{
121 return 0;
122}
123
124static inline int cpuset_do_slab_mem_spread(void)
125{
126 return 0;
127}
128
102#endif /* !CONFIG_CPUSETS */ 129#endif /* !CONFIG_CPUSETS */
103 130
104#endif /* _LINUX_CPUSET_H */ 131#endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a91d5b369..b0e37cfa09f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -869,6 +869,7 @@ struct task_struct {
869 struct cpuset *cpuset; 869 struct cpuset *cpuset;
870 nodemask_t mems_allowed; 870 nodemask_t mems_allowed;
871 int cpuset_mems_generation; 871 int cpuset_mems_generation;
872 int cpuset_mem_spread_rotor;
872#endif 873#endif
873 atomic_t fs_excl; /* holding fs exclusive resources */ 874 atomic_t fs_excl; /* holding fs exclusive resources */
874 struct rcu_head rcu; 875 struct rcu_head rcu;
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t)
929#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ 930#define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */
930#define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ 931#define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */
931#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ 932#define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */
933#define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */
934#define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */
932 935
933/* 936/*
934 * Only the _current_ task can read/write to tsk->flags, but other 937 * Only the _current_ task can read/write to tsk->flags, but other