diff options
-rw-r--r-- | Documentation/cpusets.txt | 76 | ||||
-rw-r--r-- | include/linux/cpuset.h | 29 | ||||
-rw-r--r-- | include/linux/sched.h | 3 | ||||
-rw-r--r-- | kernel/cpuset.c | 104 |
4 files changed, 203 insertions, 9 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 30c41459953c..159e2a0c3e80 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt | |||
@@ -18,7 +18,8 @@ CONTENTS: | |||
18 | 1.4 What are exclusive cpusets ? | 18 | 1.4 What are exclusive cpusets ? |
19 | 1.5 What does notify_on_release do ? | 19 | 1.5 What does notify_on_release do ? |
20 | 1.6 What is memory_pressure ? | 20 | 1.6 What is memory_pressure ? |
21 | 1.7 How do I use cpusets ? | 21 | 1.7 What is memory spread ? |
22 | 1.8 How do I use cpusets ? | ||
22 | 2. Usage Examples and Syntax | 23 | 2. Usage Examples and Syntax |
23 | 2.1 Basic Usage | 24 | 2.1 Basic Usage |
24 | 2.2 Adding/removing cpus | 25 | 2.2 Adding/removing cpus |
@@ -317,7 +318,78 @@ the tasks in the cpuset, in units of reclaims attempted per second, | |||
317 | times 1000. | 318 | times 1000. |
318 | 319 | ||
319 | 320 | ||
320 | 1.7 How do I use cpusets ? | 321 | 1.7 What is memory spread ? |
322 | --------------------------- | ||
323 | There are two boolean flag files per cpuset that control where the | ||
324 | kernel allocates pages for the file system buffers and related in | ||
325 | kernel data structures. They are called 'memory_spread_page' and | ||
326 | 'memory_spread_slab'. | ||
327 | |||
328 | If the per-cpuset boolean flag file 'memory_spread_page' is set, then | ||
329 | the kernel will spread the file system buffers (page cache) evenly | ||
330 | over all the nodes that the faulting task is allowed to use, instead | ||
331 | of preferring to put those pages on the node where the task is running. | ||
332 | |||
333 | If the per-cpuset boolean flag file 'memory_spread_slab' is set, | ||
334 | then the kernel will spread some file system related slab caches, | ||
335 | such as for inodes and dentries evenly over all the nodes that the | ||
336 | faulting task is allowed to use, instead of preferring to put those | ||
337 | pages on the node where the task is running. | ||
338 | |||
339 | The setting of these flags does not affect anonymous data segment or | ||
340 | stack segment pages of a task. | ||
341 | |||
342 | By default, both kinds of memory spreading are off, and memory | ||
343 | pages are allocated on the node local to where the task is running, | ||
344 | except perhaps as modified by the task's NUMA mempolicy or cpuset | ||
345 | configuration, so long as sufficient free memory pages are available. | ||
346 | |||
347 | When new cpusets are created, they inherit the memory spread settings | ||
348 | of their parent. | ||
349 | |||
350 | Setting memory spreading causes allocations for the affected page | ||
351 | or slab caches to ignore the tasks NUMA mempolicy and be spread | ||
352 | instead. Tasks using mbind() or set_mempolicy() calls to set NUMA | ||
353 | mempolicies will not notice any change in these calls as a result of | ||
354 | their containing tasks memory spread settings. If memory spreading | ||
355 | is turned off, then the currently specified NUMA mempolicy once again | ||
356 | applies to memory page allocations. | ||
357 | |||
358 | Both 'memory_spread_page' and 'memory_spread_slab' are boolean flag | ||
359 | files. By default they contain "0", meaning that the feature is off | ||
360 | for that cpuset. If a "1" is written to that file, then that turns | ||
361 | the named feature on. | ||
362 | |||
363 | The implementation is simple. | ||
364 | |||
365 | Setting the flag 'memory_spread_page' turns on a per-process flag | ||
366 | PF_SPREAD_PAGE for each task that is in that cpuset or subsequently | ||
367 | joins that cpuset. The page allocation calls for the page cache | ||
368 | are modified to perform an inline check for this PF_SPREAD_PAGE task | ||
369 | flag, and if set, a call to a new routine cpuset_mem_spread_node() | ||
370 | returns the node to prefer for the allocation. | ||
371 | |||
372 | Similarly, setting 'memory_spread_slab' turns on the flag | ||
373 | PF_SPREAD_SLAB, and appropriately marked slab caches will allocate | ||
374 | pages from the node returned by cpuset_mem_spread_node(). | ||
375 | |||
376 | The cpuset_mem_spread_node() routine is also simple. It uses the | ||
377 | value of a per-task rotor cpuset_mem_spread_rotor to select the next | ||
378 | node in the current tasks mems_allowed to prefer for the allocation. | ||
379 | |||
380 | This memory placement policy is also known (in other contexts) as | ||
381 | round-robin or interleave. | ||
382 | |||
383 | This policy can provide substantial improvements for jobs that need | ||
384 | to place thread local data on the corresponding node, but that need | ||
385 | to access large file system data sets that need to be spread across | ||
386 | the several nodes in the jobs cpuset in order to fit. Without this | ||
387 | policy, especially for jobs that might have one thread reading in the | ||
388 | data set, the memory allocation across the nodes in the jobs cpuset | ||
389 | can become very uneven. | ||
390 | |||
391 | |||
392 | 1.8 How do I use cpusets ? | ||
321 | -------------------------- | 393 | -------------------------- |
322 | 394 | ||
323 | In order to minimize the impact of cpusets on critical kernel | 395 | In order to minimize the impact of cpusets on critical kernel |
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 3bc606927116..9354722a9217 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h | |||
@@ -4,7 +4,7 @@ | |||
4 | * cpuset interface | 4 | * cpuset interface |
5 | * | 5 | * |
6 | * Copyright (C) 2003 BULL SA | 6 | * Copyright (C) 2003 BULL SA |
7 | * Copyright (C) 2004 Silicon Graphics, Inc. | 7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. |
8 | * | 8 | * |
9 | */ | 9 | */ |
10 | 10 | ||
@@ -51,6 +51,18 @@ extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer); | |||
51 | extern void cpuset_lock(void); | 51 | extern void cpuset_lock(void); |
52 | extern void cpuset_unlock(void); | 52 | extern void cpuset_unlock(void); |
53 | 53 | ||
54 | extern int cpuset_mem_spread_node(void); | ||
55 | |||
56 | static inline int cpuset_do_page_mem_spread(void) | ||
57 | { | ||
58 | return current->flags & PF_SPREAD_PAGE; | ||
59 | } | ||
60 | |||
61 | static inline int cpuset_do_slab_mem_spread(void) | ||
62 | { | ||
63 | return current->flags & PF_SPREAD_SLAB; | ||
64 | } | ||
65 | |||
54 | #else /* !CONFIG_CPUSETS */ | 66 | #else /* !CONFIG_CPUSETS */ |
55 | 67 | ||
56 | static inline int cpuset_init_early(void) { return 0; } | 68 | static inline int cpuset_init_early(void) { return 0; } |
@@ -99,6 +111,21 @@ static inline char *cpuset_task_status_allowed(struct task_struct *task, | |||
99 | static inline void cpuset_lock(void) {} | 111 | static inline void cpuset_lock(void) {} |
100 | static inline void cpuset_unlock(void) {} | 112 | static inline void cpuset_unlock(void) {} |
101 | 113 | ||
114 | static inline int cpuset_mem_spread_node(void) | ||
115 | { | ||
116 | return 0; | ||
117 | } | ||
118 | |||
119 | static inline int cpuset_do_page_mem_spread(void) | ||
120 | { | ||
121 | return 0; | ||
122 | } | ||
123 | |||
124 | static inline int cpuset_do_slab_mem_spread(void) | ||
125 | { | ||
126 | return 0; | ||
127 | } | ||
128 | |||
102 | #endif /* !CONFIG_CPUSETS */ | 129 | #endif /* !CONFIG_CPUSETS */ |
103 | 130 | ||
104 | #endif /* _LINUX_CPUSET_H */ | 131 | #endif /* _LINUX_CPUSET_H */ |
diff --git a/include/linux/sched.h b/include/linux/sched.h index e60a91d5b369..b0e37cfa09f5 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -869,6 +869,7 @@ struct task_struct { | |||
869 | struct cpuset *cpuset; | 869 | struct cpuset *cpuset; |
870 | nodemask_t mems_allowed; | 870 | nodemask_t mems_allowed; |
871 | int cpuset_mems_generation; | 871 | int cpuset_mems_generation; |
872 | int cpuset_mem_spread_rotor; | ||
872 | #endif | 873 | #endif |
873 | atomic_t fs_excl; /* holding fs exclusive resources */ | 874 | atomic_t fs_excl; /* holding fs exclusive resources */ |
874 | struct rcu_head rcu; | 875 | struct rcu_head rcu; |
@@ -929,6 +930,8 @@ static inline void put_task_struct(struct task_struct *t) | |||
929 | #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ | 930 | #define PF_BORROWED_MM 0x00400000 /* I am a kthread doing use_mm */ |
930 | #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ | 931 | #define PF_RANDOMIZE 0x00800000 /* randomize virtual address space */ |
931 | #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ | 932 | #define PF_SWAPWRITE 0x01000000 /* Allowed to write to swap */ |
933 | #define PF_SPREAD_PAGE 0x04000000 /* Spread page cache over cpuset */ | ||
934 | #define PF_SPREAD_SLAB 0x08000000 /* Spread some slab caches over cpuset */ | ||
932 | 935 | ||
933 | /* | 936 | /* |
934 | * Only the _current_ task can read/write to tsk->flags, but other | 937 | * Only the _current_ task can read/write to tsk->flags, but other |
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 44d13c246e5c..38f18b33de6c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -4,15 +4,14 @@ | |||
4 | * Processor and Memory placement constraints for sets of tasks. | 4 | * Processor and Memory placement constraints for sets of tasks. |
5 | * | 5 | * |
6 | * Copyright (C) 2003 BULL SA. | 6 | * Copyright (C) 2003 BULL SA. |
7 | * Copyright (C) 2004 Silicon Graphics, Inc. | 7 | * Copyright (C) 2004-2006 Silicon Graphics, Inc. |
8 | * | 8 | * |
9 | * Portions derived from Patrick Mochel's sysfs code. | 9 | * Portions derived from Patrick Mochel's sysfs code. |
10 | * sysfs is Copyright (c) 2001-3 Patrick Mochel | 10 | * sysfs is Copyright (c) 2001-3 Patrick Mochel |
11 | * Portions Copyright (c) 2004 Silicon Graphics, Inc. | ||
12 | * | 11 | * |
13 | * 2003-10-10 Written by Simon Derr <simon.derr@bull.net> | 12 | * 2003-10-10 Written by Simon Derr. |
14 | * 2003-10-22 Updates by Stephen Hemminger. | 13 | * 2003-10-22 Updates by Stephen Hemminger. |
15 | * 2004 May-July Rework by Paul Jackson <pj@sgi.com> | 14 | * 2004 May-July Rework by Paul Jackson. |
16 | * | 15 | * |
17 | * This file is subject to the terms and conditions of the GNU General Public | 16 | * This file is subject to the terms and conditions of the GNU General Public |
18 | * License. See the file COPYING in the main directory of the Linux | 17 | * License. See the file COPYING in the main directory of the Linux |
@@ -108,7 +107,9 @@ typedef enum { | |||
108 | CS_MEM_EXCLUSIVE, | 107 | CS_MEM_EXCLUSIVE, |
109 | CS_MEMORY_MIGRATE, | 108 | CS_MEMORY_MIGRATE, |
110 | CS_REMOVED, | 109 | CS_REMOVED, |
111 | CS_NOTIFY_ON_RELEASE | 110 | CS_NOTIFY_ON_RELEASE, |
111 | CS_SPREAD_PAGE, | ||
112 | CS_SPREAD_SLAB, | ||
112 | } cpuset_flagbits_t; | 113 | } cpuset_flagbits_t; |
113 | 114 | ||
114 | /* convenient tests for these bits */ | 115 | /* convenient tests for these bits */ |
@@ -137,6 +138,16 @@ static inline int is_memory_migrate(const struct cpuset *cs) | |||
137 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); | 138 | return test_bit(CS_MEMORY_MIGRATE, &cs->flags); |
138 | } | 139 | } |
139 | 140 | ||
141 | static inline int is_spread_page(const struct cpuset *cs) | ||
142 | { | ||
143 | return test_bit(CS_SPREAD_PAGE, &cs->flags); | ||
144 | } | ||
145 | |||
146 | static inline int is_spread_slab(const struct cpuset *cs) | ||
147 | { | ||
148 | return test_bit(CS_SPREAD_SLAB, &cs->flags); | ||
149 | } | ||
150 | |||
140 | /* | 151 | /* |
141 | * Increment this atomic integer everytime any cpuset changes its | 152 | * Increment this atomic integer everytime any cpuset changes its |
142 | * mems_allowed value. Users of cpusets can track this generation | 153 | * mems_allowed value. Users of cpusets can track this generation |
@@ -657,6 +668,14 @@ void cpuset_update_task_memory_state(void) | |||
657 | cs = tsk->cpuset; /* Maybe changed when task not locked */ | 668 | cs = tsk->cpuset; /* Maybe changed when task not locked */ |
658 | guarantee_online_mems(cs, &tsk->mems_allowed); | 669 | guarantee_online_mems(cs, &tsk->mems_allowed); |
659 | tsk->cpuset_mems_generation = cs->mems_generation; | 670 | tsk->cpuset_mems_generation = cs->mems_generation; |
671 | if (is_spread_page(cs)) | ||
672 | tsk->flags |= PF_SPREAD_PAGE; | ||
673 | else | ||
674 | tsk->flags &= ~PF_SPREAD_PAGE; | ||
675 | if (is_spread_slab(cs)) | ||
676 | tsk->flags |= PF_SPREAD_SLAB; | ||
677 | else | ||
678 | tsk->flags &= ~PF_SPREAD_SLAB; | ||
660 | task_unlock(tsk); | 679 | task_unlock(tsk); |
661 | mutex_unlock(&callback_mutex); | 680 | mutex_unlock(&callback_mutex); |
662 | mpol_rebind_task(tsk, &tsk->mems_allowed); | 681 | mpol_rebind_task(tsk, &tsk->mems_allowed); |
@@ -956,7 +975,8 @@ static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | |||
956 | /* | 975 | /* |
957 | * update_flag - read a 0 or a 1 in a file and update associated flag | 976 | * update_flag - read a 0 or a 1 in a file and update associated flag |
958 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 977 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
959 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) | 978 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE, |
979 | * CS_SPREAD_PAGE, CS_SPREAD_SLAB) | ||
960 | * cs: the cpuset to update | 980 | * cs: the cpuset to update |
961 | * buf: the buffer where we read the 0 or 1 | 981 | * buf: the buffer where we read the 0 or 1 |
962 | * | 982 | * |
@@ -1187,6 +1207,8 @@ typedef enum { | |||
1187 | FILE_NOTIFY_ON_RELEASE, | 1207 | FILE_NOTIFY_ON_RELEASE, |
1188 | FILE_MEMORY_PRESSURE_ENABLED, | 1208 | FILE_MEMORY_PRESSURE_ENABLED, |
1189 | FILE_MEMORY_PRESSURE, | 1209 | FILE_MEMORY_PRESSURE, |
1210 | FILE_SPREAD_PAGE, | ||
1211 | FILE_SPREAD_SLAB, | ||
1190 | FILE_TASKLIST, | 1212 | FILE_TASKLIST, |
1191 | } cpuset_filetype_t; | 1213 | } cpuset_filetype_t; |
1192 | 1214 | ||
@@ -1246,6 +1268,14 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
1246 | case FILE_MEMORY_PRESSURE: | 1268 | case FILE_MEMORY_PRESSURE: |
1247 | retval = -EACCES; | 1269 | retval = -EACCES; |
1248 | break; | 1270 | break; |
1271 | case FILE_SPREAD_PAGE: | ||
1272 | retval = update_flag(CS_SPREAD_PAGE, cs, buffer); | ||
1273 | cs->mems_generation = atomic_inc_return(&cpuset_mems_generation); | ||
1274 | break; | ||
1275 | case FILE_SPREAD_SLAB: | ||
1276 | retval = update_flag(CS_SPREAD_SLAB, cs, buffer); | ||
1277 | cs->mems_generation = atomic_inc_return(&cpuset_mems_generation); | ||
1278 | break; | ||
1249 | case FILE_TASKLIST: | 1279 | case FILE_TASKLIST: |
1250 | retval = attach_task(cs, buffer, &pathbuf); | 1280 | retval = attach_task(cs, buffer, &pathbuf); |
1251 | break; | 1281 | break; |
@@ -1355,6 +1385,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1355 | case FILE_MEMORY_PRESSURE: | 1385 | case FILE_MEMORY_PRESSURE: |
1356 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | 1386 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); |
1357 | break; | 1387 | break; |
1388 | case FILE_SPREAD_PAGE: | ||
1389 | *s++ = is_spread_page(cs) ? '1' : '0'; | ||
1390 | break; | ||
1391 | case FILE_SPREAD_SLAB: | ||
1392 | *s++ = is_spread_slab(cs) ? '1' : '0'; | ||
1393 | break; | ||
1358 | default: | 1394 | default: |
1359 | retval = -EINVAL; | 1395 | retval = -EINVAL; |
1360 | goto out; | 1396 | goto out; |
@@ -1718,6 +1754,16 @@ static struct cftype cft_memory_pressure = { | |||
1718 | .private = FILE_MEMORY_PRESSURE, | 1754 | .private = FILE_MEMORY_PRESSURE, |
1719 | }; | 1755 | }; |
1720 | 1756 | ||
1757 | static struct cftype cft_spread_page = { | ||
1758 | .name = "memory_spread_page", | ||
1759 | .private = FILE_SPREAD_PAGE, | ||
1760 | }; | ||
1761 | |||
1762 | static struct cftype cft_spread_slab = { | ||
1763 | .name = "memory_spread_slab", | ||
1764 | .private = FILE_SPREAD_SLAB, | ||
1765 | }; | ||
1766 | |||
1721 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1767 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1722 | { | 1768 | { |
1723 | int err; | 1769 | int err; |
@@ -1736,6 +1782,10 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1736 | return err; | 1782 | return err; |
1737 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | 1783 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) |
1738 | return err; | 1784 | return err; |
1785 | if ((err = cpuset_add_file(cs_dentry, &cft_spread_page)) < 0) | ||
1786 | return err; | ||
1787 | if ((err = cpuset_add_file(cs_dentry, &cft_spread_slab)) < 0) | ||
1788 | return err; | ||
1739 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1789 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1740 | return err; | 1790 | return err; |
1741 | return 0; | 1791 | return 0; |
@@ -1764,6 +1814,10 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1764 | cs->flags = 0; | 1814 | cs->flags = 0; |
1765 | if (notify_on_release(parent)) | 1815 | if (notify_on_release(parent)) |
1766 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); | 1816 | set_bit(CS_NOTIFY_ON_RELEASE, &cs->flags); |
1817 | if (is_spread_page(parent)) | ||
1818 | set_bit(CS_SPREAD_PAGE, &cs->flags); | ||
1819 | if (is_spread_slab(parent)) | ||
1820 | set_bit(CS_SPREAD_SLAB, &cs->flags); | ||
1767 | cs->cpus_allowed = CPU_MASK_NONE; | 1821 | cs->cpus_allowed = CPU_MASK_NONE; |
1768 | cs->mems_allowed = NODE_MASK_NONE; | 1822 | cs->mems_allowed = NODE_MASK_NONE; |
1769 | atomic_set(&cs->count, 0); | 1823 | atomic_set(&cs->count, 0); |
@@ -2201,6 +2255,44 @@ void cpuset_unlock(void) | |||
2201 | } | 2255 | } |
2202 | 2256 | ||
2203 | /** | 2257 | /** |
2258 | * cpuset_mem_spread_node() - On which node to begin search for a page | ||
2259 | * | ||
2260 | * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for | ||
2261 | * tasks in a cpuset with is_spread_page or is_spread_slab set), | ||
2262 | * and if the memory allocation used cpuset_mem_spread_node() | ||
2263 | * to determine on which node to start looking, as it will for | ||
2264 | * certain page cache or slab cache pages such as used for file | ||
2265 | * system buffers and inode caches, then instead of starting on the | ||
2266 | * local node to look for a free page, rather spread the starting | ||
2267 | * node around the tasks mems_allowed nodes. | ||
2268 | * | ||
2269 | * We don't have to worry about the returned node being offline | ||
2270 | * because "it can't happen", and even if it did, it would be ok. | ||
2271 | * | ||
2272 | * The routines calling guarantee_online_mems() are careful to | ||
2273 | * only set nodes in task->mems_allowed that are online. So it | ||
2274 | * should not be possible for the following code to return an | ||
2275 | * offline node. But if it did, that would be ok, as this routine | ||
2276 | * is not returning the node where the allocation must be, only | ||
2277 | * the node where the search should start. The zonelist passed to | ||
2278 | * __alloc_pages() will include all nodes. If the slab allocator | ||
2279 | * is passed an offline node, it will fall back to the local node. | ||
2280 | * See kmem_cache_alloc_node(). | ||
2281 | */ | ||
2282 | |||
2283 | int cpuset_mem_spread_node(void) | ||
2284 | { | ||
2285 | int node; | ||
2286 | |||
2287 | node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); | ||
2288 | if (node == MAX_NUMNODES) | ||
2289 | node = first_node(current->mems_allowed); | ||
2290 | current->cpuset_mem_spread_rotor = node; | ||
2291 | return node; | ||
2292 | } | ||
2293 | EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); | ||
2294 | |||
2295 | /** | ||
2204 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? | 2296 | * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? |
2205 | * @p: pointer to task_struct of some other task. | 2297 | * @p: pointer to task_struct of some other task. |
2206 | * | 2298 | * |