author    Paul Jackson <pj@sgi.com>    2006-01-08 04:01:49 -0500
committer Linus Torvalds <torvalds@g5.osdl.org>    2006-01-08 23:13:42 -0500
commit    3e0d98b9f1eb757fc98efc84e74e54a08308aa73
tree      7cf1c75994f734ede7ec89373de640c4a58b237a
parent    5966514db662fb24c9bb43226a80106bcffd51f8
[PATCH] cpuset: memory pressure meter
Provide a simple per-cpuset metric of memory pressure, tracking the -rate-
that the tasks in a cpuset call try_to_free_pages(), the synchronous
(direct) memory reclaim code.

This enables batch managers monitoring jobs running in dedicated cpusets to
efficiently detect what level of memory pressure that job is causing.

This is useful both on tightly managed systems running a wide mix of
submitted jobs, which may choose to terminate or reprioritize jobs that are
trying to use more memory than allowed on the nodes assigned them, and with
tightly coupled, long running, massively parallel scientific computing jobs
that will dramatically fail to meet required performance goals if they
start to use more memory than allowed to them.

This patch just provides a very economical way for the batch manager to
monitor a cpuset for signs of memory pressure.  It's up to the batch
manager or other user code to decide what to do about it and take action.

==> Unless this feature is enabled by writing "1" to the special file
    /dev/cpuset/memory_pressure_enabled, the hook in the rebalance
    code of __alloc_pages() for this metric reduces to simply noticing
    that the cpuset_memory_pressure_enabled flag is zero.  So only
    systems that enable this feature will compute the metric.

Why a per-cpuset, running average:

    Because this meter is per-cpuset, rather than per-task or mm, the
    system load imposed by a batch scheduler monitoring this metric is
    sharply reduced on large systems, because a scan of the tasklist can
    be avoided on each set of queries.

    Because this meter is a running average, instead of an accumulating
    counter, a batch scheduler can detect memory pressure with a single
    read, instead of having to read and accumulate results for a period
    of time.

    Because this meter is per-cpuset rather than per-task or mm, the
    batch scheduler can obtain the key information, memory pressure in a
    cpuset, with a single read, rather than having to query and
    accumulate results over all the (dynamically changing) set of tasks
    in the cpuset.

A per-cpuset simple digital filter (requires a spinlock and 3 words of data
per-cpuset) is kept, and updated by any task attached to that cpuset, if it
enters the synchronous (direct) page reclaim code.

A per-cpuset file provides an integer number representing the recent
(half-life of 10 seconds) rate of direct page reclaims caused by the tasks
in the cpuset, in units of reclaims attempted per second, times 1000.

Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
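As a rough illustration of the intended usage (not part of this patch), a
batch manager could poll the per-cpuset file with a user-space sketch like
the one below; the /dev/cpuset mount point is the conventional one named
above, and the cpuset name "job42" is a hypothetical example:

    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* hypothetical job cpuset, with cpuset fs at /dev/cpuset */
            const char *path = "/dev/cpuset/job42/memory_pressure";

            for (;;) {
                    FILE *fp = fopen(path, "r");
                    int rate = 0;

                    if (!fp) {
                            perror(path);
                            return 1;
                    }
                    if (fscanf(fp, "%d", &rate) != 1)
                            rate = 0;
                    fclose(fp);

                    /* the file reports reclaims/sec, times 1000 */
                    printf("memory_pressure: %.3f reclaims/sec\n",
                            rate / 1000.0);
                    sleep(10);      /* one half-life of the filter */
            }
    }

One read per sample suffices; no tasklist scan or accumulation over time
is needed.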
-rw-r--r--  include/linux/cpuset.h  |  11
-rw-r--r--  kernel/cpuset.c         | 193
-rw-r--r--  mm/page_alloc.c         |   1
3 files changed, 203 insertions(+), 2 deletions(-)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8b21786490ee..736d73801cb6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,6 +26,15 @@ void cpuset_update_current_mems_allowed(void);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
+
+#define cpuset_memory_pressure_bump()				\
+	do {							\
+		if (cpuset_memory_pressure_enabled)		\
+			__cpuset_memory_pressure_bump();	\
+	} while (0)
+extern int cpuset_memory_pressure_enabled;
+extern void __cpuset_memory_pressure_bump(void);
+
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -60,6 +69,8 @@ static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	return 1;
 }
 
+static inline void cpuset_memory_pressure_bump(void) {}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 						char *buffer)
 {
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6503c6da4c4f..5a06fef669f8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,15 @@
 
 #define CPUSET_SUPER_MAGIC	0x27e0eb
 
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
+
 struct cpuset {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
@@ -80,7 +89,9 @@ struct cpuset {
 	 * Copy of global cpuset_mems_generation as of the most
 	 * recent time this cpuset changed its mems_allowed.
 	 */
 	int mems_generation;
+
+	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* bits in struct cpuset flags field */
@@ -149,7 +160,7 @@ static struct cpuset top_cpuset = {
 };
 
 static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
 
 /*
  * We have two global cpuset semaphores below.  They can nest.
@@ -807,6 +818,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		cpuset_memory_pressure_enabled = 1;
+	else
+		cpuset_memory_pressure_enabled = 0;
+	return 0;
+}
+
+/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
@@ -848,6 +872,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 }
 
 /*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *   fmeter_init() - initialize a frequency meter.
+ *   fmeter_markevent() - called each time the event happens.
+ *   fmeter_getrate() - returns the recent rate of such events.
+ *   fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from the fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter.  If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32 bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds.  At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000.  At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event.  At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
+#define FM_SCALE 1000		/* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+	fmp->cnt = 0;
+	fmp->val = 0;
+	fmp->time = 0;
+	spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t ticks = now - fmp->time;
+
+	if (ticks == 0)
+		return;
+
+	ticks = min(FM_MAXTICKS, ticks);
+	while (ticks-- > 0)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+	spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int val;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	val = fmp->val;
+	spin_unlock(&fmp->lock);
+	return val;
+}
+
+/*
  * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
  * notified on release.
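A quick sanity check of the FM_COEF arithmetic, as a standalone user-space
sketch rather than kernel code: decaying by 933/1000 once per second gives
0.933^10 ~= 0.5, the advertised 10 second half-life, and since cnt is
capped at FM_MAXCNT = 1000000, the product 933 * val stays well inside a
32-bit int:

    #include <stdio.h>

    int main(void)
    {
            int val = 1000000;      /* start at the meter's practical max */
            int tick;

            for (tick = 1; tick <= 30; tick++) {
                    /* one decay step, as in fmeter_update() */
                    val = (933 * val) / 1000;
                    if (tick % 10 == 0)
                            printf("after %2d secs: %d\n", tick, val);
            }
            return 0;       /* value roughly halves every 10 ticks */
    }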
@@ -931,6 +1053,8 @@ typedef enum {
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_NOTIFY_ON_RELEASE,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		retval = update_memory_pressure_enabled(cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE:
+		retval = -EACCES;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE:
+		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = {
 	.private = FILE_MEMORY_MIGRATE,
 };
 
+static struct cftype cft_memory_pressure_enabled = {
+	.name = "memory_pressure_enabled",
+	.private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static struct cftype cft_memory_pressure = {
+	.name = "memory_pressure",
+	.private = FILE_MEMORY_PRESSURE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
 
@@ -1580,6 +1729,7 @@ int __init cpuset_init(void)
 	top_cpuset.cpus_allowed = CPU_MASK_ALL;
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
+	fmeter_init(&top_cpuset.fmeter);
 	atomic_inc(&cpuset_mems_generation);
 	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
 
@@ -1601,6 +1751,9 @@ int __init cpuset_init(void)
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	err = cpuset_populate_dir(root);
+	/* memory_pressure_enabled is in root cpuset only */
+	if (err == 0)
+		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
 out:
 	return err;
 }
@@ -1891,6 +2044,42 @@ done:
 }
 
 /*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernel's page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure".  Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+	struct cpuset *cs;
+
+	task_lock(current);
+	cs = current->cpuset;
+	fmeter_markevent(&cs->fmeter);
+	task_unlock(current);
+}
+
+/*
  * proc_cpuset_show()
  *  - Print task's cpuset path into seq_file.
  *  - Used for /proc/<pid>/cpuset.
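For completeness, enabling the metric from user space amounts to a one-byte
write to the root cpuset's control file; a minimal sketch, again assuming
the conventional /dev/cpuset mount point:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* assumes the cpuset filesystem is mounted at /dev/cpuset */
            int fd = open("/dev/cpuset/memory_pressure_enabled", O_WRONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            if (write(fd, "1", 1) != 1)
                    perror("write");
            close(fd);
            return 0;
    }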
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad3d0202cdef..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -976,6 +976,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;