-rw-r--r--   include/linux/cpuset.h    11
-rw-r--r--   kernel/cpuset.c          193
-rw-r--r--   mm/page_alloc.c            1
3 files changed, 203 insertions, 2 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 8b21786490ee..736d73801cb6 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -26,6 +26,15 @@ void cpuset_update_current_mems_allowed(void);
 int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
 extern int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
 extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
+
+#define cpuset_memory_pressure_bump() 				\
+	do {							\
+		if (cpuset_memory_pressure_enabled)		\
+			__cpuset_memory_pressure_bump();	\
+	} while (0)
+extern int cpuset_memory_pressure_enabled;
+extern void __cpuset_memory_pressure_bump(void);
+
 extern struct file_operations proc_cpuset_operations;
 extern char *cpuset_task_status_allowed(struct task_struct *task, char *buffer);
 
@@ -60,6 +69,8 @@ static inline int cpuset_excl_nodes_overlap(const struct task_struct *p)
 	return 1;
 }
 
+static inline void cpuset_memory_pressure_bump(void) {}
+
 static inline char *cpuset_task_status_allowed(struct task_struct *task,
 						char *buffer)
 {
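
Note that the header wraps the bump in a macro on purpose: when collection is disabled, the page allocator's hot path pays only a test of one global integer at the call site and makes no function call, while the CONFIG_CPUSETS=n stub compiles away entirely. Below is a minimal, self-contained sketch of that gating idiom using made-up names (stats_enabled, __stats_bump); it illustrates the pattern, it is not kernel code.

/*
 * Sketch of the enable-flag gating pattern: the macro inlines a single
 * flag test at each call site, and the real work stays out of line.
 */
#include <stdio.h>

int stats_enabled;			/* cheap global switch, default off */

void __stats_bump(void)			/* out-of-line slow path */
{
	puts("event recorded");
}

#define stats_bump()				\
	do {					\
		if (stats_enabled)		\
			__stats_bump();		\
	} while (0)

int main(void)
{
	stats_bump();		/* disabled: just a load and a branch */
	stats_enabled = 1;
	stats_bump();		/* enabled: calls __stats_bump() */
	return 0;
}
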
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 6503c6da4c4f..5a06fef669f8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -56,6 +56,15 @@
 
 #define CPUSET_SUPER_MAGIC		0x27e0eb
 
+/* See "Frequency meter" comments, below. */
+
+struct fmeter {
+	int cnt;		/* unprocessed events count */
+	int val;		/* most recent output value */
+	time_t time;		/* clock (secs) when val computed */
+	spinlock_t lock;	/* guards read or write of above */
+};
+
 struct cpuset {
 	unsigned long flags;		/* "unsigned long" so bitops work */
 	cpumask_t cpus_allowed;		/* CPUs allowed to tasks in cpuset */
@@ -80,7 +89,9 @@ struct cpuset {
 	 * Copy of global cpuset_mems_generation as of the most
 	 * recent time this cpuset changed its mems_allowed.
 	 */
 	int mems_generation;
+
+	struct fmeter fmeter;		/* memory_pressure filter */
 };
 
 /* bits in struct cpuset flags field */
@@ -149,7 +160,7 @@ static struct cpuset top_cpuset = {
 };
 
 static struct vfsmount *cpuset_mount;
-static struct super_block *cpuset_sb = NULL;
+static struct super_block *cpuset_sb;
 
 /*
  * We have two global cpuset semaphores below.  They can nest.
@@ -807,6 +818,19 @@ static int update_nodemask(struct cpuset *cs, char *buf)
 }
 
 /*
+ * Call with manage_sem held.
+ */
+
+static int update_memory_pressure_enabled(struct cpuset *cs, char *buf)
+{
+	if (simple_strtoul(buf, NULL, 10) != 0)
+		cpuset_memory_pressure_enabled = 1;
+	else
+		cpuset_memory_pressure_enabled = 0;
+	return 0;
+}
+
+/*
  * update_flag - read a 0 or a 1 in a file and update associated flag
  * bit:	the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
  *				CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE)
@@ -848,6 +872,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
 }
 
 /*
+ * Frequency meter - How fast is some event occurring?
+ *
+ * These routines manage a digitally filtered, constant time based,
+ * event frequency meter.  There are four routines:
+ *   fmeter_init() - initialize a frequency meter.
+ *   fmeter_markevent() - called each time the event happens.
+ *   fmeter_getrate() - returns the recent rate of such events.
+ *   fmeter_update() - internal routine used to update fmeter.
+ *
+ * A common data structure is passed to each of these routines,
+ * which is used to keep track of the state required to manage the
+ * frequency meter and its digital filter.
+ *
+ * The filter works on the number of events marked per unit time.
+ * The filter is single-pole low-pass recursive (IIR).  The time unit
+ * is 1 second.  Arithmetic is done using 32-bit integers scaled to
+ * simulate 3 decimal digits of precision (multiplied by 1000).
+ *
+ * With an FM_COEF of 933, and a time base of 1 second, the filter
+ * has a half-life of 10 seconds, meaning that if the events quit
+ * happening, then the rate returned from fmeter_getrate()
+ * will be cut in half each 10 seconds, until it converges to zero.
+ *
+ * It is not worth doing a real infinitely recursive filter.  If more
+ * than FM_MAXTICKS ticks have elapsed since the last filter event,
+ * just compute FM_MAXTICKS ticks worth, by which point the level
+ * will be stable.
+ *
+ * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid
+ * arithmetic overflow in the fmeter_update() routine.
+ *
+ * Given the simple 32-bit integer arithmetic used, this meter works
+ * best for reporting rates between one per millisecond (msec) and
+ * one per 32 (approx) seconds.  At constant rates faster than one
+ * per msec it maxes out at values just under 1,000,000.  At constant
+ * rates between one per msec, and one per second it will stabilize
+ * to a value N*1000, where N is the rate of events per second.
+ * At constant rates between one per second and one per 32 seconds,
+ * it will be choppy, moving up on the seconds that have an event,
+ * and then decaying until the next event.  At rates slower than
+ * about one in 32 seconds, it decays all the way back to zero between
+ * each event.
+ */
+
+#define FM_COEF 933		/* coefficient for half-life of 10 secs */
+#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
+#define FM_MAXCNT 1000000	/* limit cnt to avoid overflow */
+#define FM_SCALE 1000		/* faux fixed point scale */
+
+/* Initialize a frequency meter */
+static void fmeter_init(struct fmeter *fmp)
+{
+	fmp->cnt = 0;
+	fmp->val = 0;
+	fmp->time = 0;
+	spin_lock_init(&fmp->lock);
+}
+
+/* Internal meter update - process cnt events and update value */
+static void fmeter_update(struct fmeter *fmp)
+{
+	time_t now = get_seconds();
+	time_t ticks = now - fmp->time;
+
+	if (ticks == 0)
+		return;
+
+	ticks = min(FM_MAXTICKS, ticks);
+	while (ticks-- > 0)
+		fmp->val = (FM_COEF * fmp->val) / FM_SCALE;
+	fmp->time = now;
+
+	fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE;
+	fmp->cnt = 0;
+}
+
+/* Process any previous ticks, then bump cnt by one (times scale). */
+static void fmeter_markevent(struct fmeter *fmp)
+{
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE);
+	spin_unlock(&fmp->lock);
+}
+
+/* Process any previous ticks, then return current value. */
+static int fmeter_getrate(struct fmeter *fmp)
+{
+	int val;
+
+	spin_lock(&fmp->lock);
+	fmeter_update(fmp);
+	val = fmp->val;
+	spin_unlock(&fmp->lock);
+	return val;
+}
+
+/*
  * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly
  * writing the path of the old cpuset in 'ppathbuf' if it needs to be
  * notified on release.
@@ -931,6 +1053,8 @@ typedef enum {
 	FILE_CPU_EXCLUSIVE,
 	FILE_MEM_EXCLUSIVE,
 	FILE_NOTIFY_ON_RELEASE,
+	FILE_MEMORY_PRESSURE_ENABLED,
+	FILE_MEMORY_PRESSURE,
 	FILE_TASKLIST,
 } cpuset_filetype_t;
 
@@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
 	case FILE_MEMORY_MIGRATE:
 		retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		retval = update_memory_pressure_enabled(cs, buffer);
+		break;
+	case FILE_MEMORY_PRESSURE:
+		retval = -EACCES;
+		break;
 	case FILE_TASKLIST:
 		retval = attach_task(cs, buffer, &pathbuf);
 		break;
@@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
 	case FILE_MEMORY_MIGRATE:
 		*s++ = is_memory_migrate(cs) ? '1' : '0';
 		break;
+	case FILE_MEMORY_PRESSURE_ENABLED:
+		*s++ = cpuset_memory_pressure_enabled ? '1' : '0';
+		break;
+	case FILE_MEMORY_PRESSURE:
+		s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
+		break;
 	default:
 		retval = -EINVAL;
 		goto out;
@@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = {
 	.private = FILE_MEMORY_MIGRATE,
 };
 
+static struct cftype cft_memory_pressure_enabled = {
+	.name = "memory_pressure_enabled",
+	.private = FILE_MEMORY_PRESSURE_ENABLED,
+};
+
+static struct cftype cft_memory_pressure = {
+	.name = "memory_pressure",
+	.private = FILE_MEMORY_PRESSURE,
+};
+
 static int cpuset_populate_dir(struct dentry *cs_dentry)
 {
 	int err;
@@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
 		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
 		return err;
+	if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0)
+		return err;
 	if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
 		return err;
 	return 0;
@@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
 	INIT_LIST_HEAD(&cs->children);
 	atomic_inc(&cpuset_mems_generation);
 	cs->mems_generation = atomic_read(&cpuset_mems_generation);
+	fmeter_init(&cs->fmeter);
 
 	cs->parent = parent;
 
@@ -1580,6 +1729,7 @@ int __init cpuset_init(void)
 	top_cpuset.cpus_allowed = CPU_MASK_ALL;
 	top_cpuset.mems_allowed = NODE_MASK_ALL;
 
+	fmeter_init(&top_cpuset.fmeter);
 	atomic_inc(&cpuset_mems_generation);
 	top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation);
 
@@ -1601,6 +1751,9 @@ int __init cpuset_init(void)
 	top_cpuset.dentry = root;
 	root->d_inode->i_op = &cpuset_dir_inode_operations;
 	err = cpuset_populate_dir(root);
+	/* memory_pressure_enabled is in root cpuset only */
+	if (err == 0)
+		err = cpuset_add_file(root, &cft_memory_pressure_enabled);
 out:
 	return err;
 }
@@ -1891,6 +2044,42 @@ done:
 }
 
 /*
+ * Collection of memory_pressure is suppressed unless
+ * this flag is enabled by writing "1" to the special
+ * cpuset file 'memory_pressure_enabled' in the root cpuset.
+ */
+
+int cpuset_memory_pressure_enabled;
+
+/**
+ * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims.
+ *
+ * Keep a running average of the rate of synchronous (direct)
+ * page reclaim efforts initiated by tasks in each cpuset.
+ *
+ * This represents the rate at which some task in the cpuset
+ * ran low on memory on all nodes it was allowed to use, and
+ * had to enter the kernel's page reclaim code in an effort to
+ * create more free memory by tossing clean pages or swapping
+ * or writing dirty pages.
+ *
+ * Display to user space in the per-cpuset read-only file
+ * "memory_pressure".  Value displayed is an integer
+ * representing the recent rate of entry into the synchronous
+ * (direct) page reclaim by any task attached to the cpuset.
+ **/
+
+void __cpuset_memory_pressure_bump(void)
+{
+	struct cpuset *cs;
+
+	task_lock(current);
+	cs = current->cpuset;
+	fmeter_markevent(&cs->fmeter);
+	task_unlock(current);
+}
+
+/*
  * proc_cpuset_show()
  *  - Print tasks cpuset path into seq_file.
  *  - Used for /proc/<pid>/cpuset.
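
To see the behaviour the "Frequency meter" comment block describes, the filter arithmetic can be replayed in user space. The sketch below is not the kernel code (no locking, and "seconds" are just loop iterations); it only reuses the FM_COEF/FM_SCALE arithmetic to show that a steady rate of one event per second converges toward roughly 1000, and that once events stop the value roughly halves every 10 iterations, since 0.933^10 is about 0.5.

/*
 * User-space replay of the fmeter arithmetic: per simulated second,
 * decay the value by FM_COEF/FM_SCALE and fold in pending event count.
 */
#include <stdio.h>

#define FM_COEF  933
#define FM_SCALE 1000

static int val;		/* filtered output, scaled by FM_SCALE */
static int cnt;		/* unprocessed events, scaled by FM_SCALE */

static void tick(void)	/* advance simulated time by one second */
{
	val = (FM_COEF * val) / FM_SCALE;
	val += ((FM_SCALE - FM_COEF) * cnt) / FM_SCALE;
	cnt = 0;
}

int main(void)
{
	int sec;

	for (sec = 1; sec <= 60; sec++) {	/* one event per second */
		cnt += FM_SCALE;
		tick();
		if (sec % 10 == 0)
			printf("t=%3ds (1 ev/s): val=%d\n", sec, val);
	}
	for (; sec <= 120; sec++) {		/* events stop: decay */
		tick();
		if (sec % 10 == 0)
			printf("t=%3ds (idle):   val=%d\n", sec, val);
	}
	return 0;
}
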
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ad3d0202cdef..e0e84924171b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -976,6 +976,7 @@ rebalance:
 	cond_resched();
 
 	/* We now go into synchronous reclaim */
+	cpuset_memory_pressure_bump();
 	p->flags |= PF_MEMALLOC;
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
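
From user space the new interface is just two cpuset files. The sketch below assumes the cpuset filesystem is already mounted at /dev/cpuset (a conventional mount point, not something this patch creates): it writes "1" to memory_pressure_enabled in the root cpuset to turn collection on, then periodically reads the root cpuset's read-only memory_pressure file, whose value is the filtered direct-reclaim rate scaled by 1000 as described above.

/*
 * Enable memory_pressure collection and sample the root cpuset's meter.
 * Paths are a convention; adjust to wherever cpusets are mounted.
 */
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	FILE *f;
	int rate, i;

	/* collection is off until "1" is written to the root cpuset file */
	f = fopen("/dev/cpuset/memory_pressure_enabled", "w");
	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);

	for (i = 0; i < 10; i++) {	/* sample once per second */
		f = fopen("/dev/cpuset/memory_pressure", "r");
		if (!f)
			return 1;
		if (fscanf(f, "%d", &rate) == 1)
			printf("direct-reclaim rate: %d (events/sec * 1000)\n",
			       rate);
		fclose(f);
		sleep(1);
	}
	return 0;
}
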