author     Paul Jackson <pj@sgi.com>               2006-03-24 06:16:03 -0500
committer  Linus Torvalds <torvalds@g5.osdl.org>   2006-03-24 10:33:22 -0500
commit     825a46af5ac171f9f41f794a0a00165588ba1589 (patch)
tree       b690fe9d809d7b047f0393097fc79892e1217d98 /include
parent     8a39cc60bfa5a72f32d975729a354daca124f6de (diff)
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a task's stack
and data regions, are not affected by this policy choice; they continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in-kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as those for
inodes and dentries, evenly over all the nodes that the faulting task is
allowed to use, instead of preferring to put those pages on the node where
the task is running.
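For illustration only, here is a minimal userspace sketch of enabling both
flags. It assumes the cpuset pseudo filesystem is mounted at /dev/cpuset and
that a cpuset named 'myset' already exists; the mount point, the cpuset name
and the write_flag() helper are all assumptions, not part of this patch.

	#include <fcntl.h>
	#include <unistd.h>

	/* Write "1" into a cpuset boolean flag file; returns 0 on success. */
	static int write_flag(const char *path)
	{
		int fd = open(path, O_WRONLY);

		if (fd < 0)
			return -1;
		if (write(fd, "1", 1) != 1) {
			close(fd);
			return -1;
		}
		return close(fd);
	}

	int main(void)
	{
		if (write_flag("/dev/cpuset/myset/memory_spread_page"))
			return 1;
		if (write_flag("/dev/cpuset/myset/memory_spread_slab"))
			return 1;
		return 0;
	}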
The implementation is simple. Setting the cpuset flag 'memory_spread_page'
or 'memory_spread_slab' turns on the per-process flag PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or that
subsequently joins it. In subsequent patches, the page allocation calls for
the affected page cache and slab caches are modified to perform an inline
check for these flags, and if a flag is set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
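As a rough sketch of that call-site pattern (not code from this patch set:
alloc_pagecache_page() is a made-up name, and the real call sites are only
converted in the follow-on patches), an allocation path might look like:

	#include <linux/gfp.h>
	#include <linux/cpuset.h>

	/* Hypothetical allocation site using the new inline helpers. */
	static struct page *alloc_pagecache_page(gfp_t gfp_mask)
	{
		if (cpuset_do_page_mem_spread()) {
			/* Spreading enabled: ask the cpuset rotor which node to prefer. */
			int nid = cpuset_mem_spread_node();

			return alloc_pages_node(nid, gfp_mask, 0);
		}
		/* Default behaviour: prefer the node local to execution. */
		return alloc_pages(gfp_mask, 0);
	}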
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor, cpuset_mem_spread_rotor, to select the next node in the
current task's mems_allowed to prefer for the allocation.
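That rotor logic amounts to something like the following (a sketch only; the
actual routine is added to kernel/cpuset.c by a follow-on patch, so details
such as wrap-around handling may differ):

	#include <linux/sched.h>
	#include <linux/nodemask.h>

	int cpuset_mem_spread_node(void)
	{
		int node;

		/* Advance the per-task rotor to the next allowed node ... */
		node = next_node(current->cpuset_mem_spread_rotor,
				 current->mems_allowed);
		/* ... wrapping back to the first allowed node at the end. */
		if (node == MAX_NUMNODES)
			node = first_node(current->mems_allowed);
		current->cpuset_mem_spread_rotor = node;
		return node;
	}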
This policy can provide substantial improvements for jobs that need to place
thread-local data on the corresponding node, but that also access large file
system data sets which must be spread across the several nodes in the job's
cpuset in order to fit. Without this patch, especially for jobs in which a
single thread reads in the data set, the memory allocation across the nodes
in the job's cpuset can become very uneven.
A couple of Copyright year ranges are updated as well, and a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'include')
 include/linux/cpuset.h | 29
 include/linux/sched.h  |  3
 2 files changed, 31 insertions(+), 1 deletion(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 3bc606927116..9354722a9217 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -4,7 +4,7 @@
  * cpuset interface
  *
  * Copyright (C) 2003 BULL SA
- * Copyright (C) 2004 Silicon Graphics, Inc.
+ * Copyright (C) 2004-2006 Silicon Graphics, Inc.
  *
  */
 
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 extern void cpuset_lock(void);
 extern void cpuset_unlock(void);
 
+extern int cpuset_mem_spread_node(void);
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_PAGE;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return current->flags & PF_SPREAD_SLAB;
+}
+
 #else /* !CONFIG_CPUSETS */
 
 static inline int cpuset_init_early(void) { return 0; }
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task,
 static inline void cpuset_lock(void) {}
 static inline void cpuset_unlock(void) {}
 
+static inline int cpuset_mem_spread_node(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_page_mem_spread(void)
+{
+	return 0;
+}
+
+static inline int cpuset_do_slab_mem_spread(void)
+{
+	return 0;
+}
+
 #endif /* !CONFIG_CPUSETS */
 
 #endif /* _LINUX_CPUSET_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index e60a91d5b369..b0e37cfa09f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -869,6 +869,7 @@ struct task_struct {
 	struct cpuset *cpuset;
 	nodemask_t mems_allowed;
 	int cpuset_mems_generation;
+	int cpuset_mem_spread_rotor;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
 	struct rcu_head rcu;
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t)
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x01000000	/* Allowed to write to swap */
+#define PF_SPREAD_PAGE	0x04000000	/* Spread page cache over cpuset */
+#define PF_SPREAD_SLAB	0x08000000	/* Spread some slab caches over cpuset */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other