diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/cpuset.c | 193 |
1 files changed, 191 insertions, 2 deletions
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6503c6da4c4f..5a06fef669f8 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -56,6 +56,15 @@ | |||
56 | 56 | ||
57 | #define CPUSET_SUPER_MAGIC 0x27e0eb | 57 | #define CPUSET_SUPER_MAGIC 0x27e0eb |
58 | 58 | ||
59 | /* See "Frequency meter" comments, below. */ | ||
60 | |||
61 | struct fmeter { | ||
62 | int cnt; /* unprocessed events count */ | ||
63 | int val; /* most recent output value */ | ||
64 | time_t time; /* clock (secs) when val computed */ | ||
65 | spinlock_t lock; /* guards read or write of above */ | ||
66 | }; | ||
67 | |||
59 | struct cpuset { | 68 | struct cpuset { |
60 | unsigned long flags; /* "unsigned long" so bitops work */ | 69 | unsigned long flags; /* "unsigned long" so bitops work */ |
61 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ | 70 | cpumask_t cpus_allowed; /* CPUs allowed to tasks in cpuset */ |
@@ -80,7 +89,9 @@ struct cpuset { | |||
80 | * Copy of global cpuset_mems_generation as of the most | 89 | * Copy of global cpuset_mems_generation as of the most |
81 | * recent time this cpuset changed its mems_allowed. | 90 | * recent time this cpuset changed its mems_allowed. |
82 | */ | 91 | */ |
83 | int mems_generation; | 92 | int mems_generation; |
93 | |||
94 | struct fmeter fmeter; /* memory_pressure filter */ | ||
84 | }; | 95 | }; |
85 | 96 | ||
86 | /* bits in struct cpuset flags field */ | 97 | /* bits in struct cpuset flags field */ |
@@ -149,7 +160,7 @@ static struct cpuset top_cpuset = { | |||
149 | }; | 160 | }; |
150 | 161 | ||
151 | static struct vfsmount *cpuset_mount; | 162 | static struct vfsmount *cpuset_mount; |
152 | static struct super_block *cpuset_sb = NULL; | 163 | static struct super_block *cpuset_sb; |
153 | 164 | ||
154 | /* | 165 | /* |
155 | * We have two global cpuset semaphores below. They can nest. | 166 | * We have two global cpuset semaphores below. They can nest. |
@@ -807,6 +818,19 @@ static int update_nodemask(struct cpuset *cs, char *buf) | |||
807 | } | 818 | } |
808 | 819 | ||
809 | /* | 820 | /* |
821 | * Call with manage_sem held. | ||
822 | */ | ||
823 | |||
824 | static int update_memory_pressure_enabled(struct cpuset *cs, char *buf) | ||
825 | { | ||
826 | if (simple_strtoul(buf, NULL, 10) != 0) | ||
827 | cpuset_memory_pressure_enabled = 1; | ||
828 | else | ||
829 | cpuset_memory_pressure_enabled = 0; | ||
830 | return 0; | ||
831 | } | ||
832 | |||
833 | /* | ||
810 | * update_flag - read a 0 or a 1 in a file and update associated flag | 834 | * update_flag - read a 0 or a 1 in a file and update associated flag |
811 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, | 835 | * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE, |
812 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) | 836 | * CS_NOTIFY_ON_RELEASE, CS_MEMORY_MIGRATE) |
@@ -848,6 +872,104 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) | |||
848 | } | 872 | } |
849 | 873 | ||
850 | /* | 874 | /* |
875 | * Frequency meter - How fast is some event occurring? | ||
876 | * | ||
877 | * These routines manage a digitally filtered, constant time based, | ||
878 | * event frequency meter. There are four routines: | ||
879 | * fmeter_init() - initialize a frequency meter. | ||
880 | * fmeter_markevent() - called each time the event happens. | ||
881 | * fmeter_getrate() - returns the recent rate of such events. | ||
882 | * fmeter_update() - internal routine used to update fmeter. | ||
883 | * | ||
884 | * A common data structure is passed to each of these routines, | ||
885 | * which is used to keep track of the state required to manage the | ||
886 | * frequency meter and its digital filter. | ||
887 | * | ||
888 | * The filter works on the number of events marked per unit time. | ||
889 | * The filter is single-pole low-pass recursive (IIR). The time unit | ||
890 | * is 1 second. Arithmetic is done using 32-bit integers scaled to | ||
891 | * simulate 3 decimal digits of precision (multiplied by 1000). | ||
892 | * | ||
893 | * With an FM_COEF of 933, and a time base of 1 second, the filter | ||
894 | * has a half-life of 10 seconds, meaning that if the events quit | ||
895 | * happening, then the rate returned from the fmeter_getrate() | ||
896 | * will be cut in half each 10 seconds, until it converges to zero. | ||
897 | * | ||
898 | * It is not worth doing a real infinitely recursive filter. If more | ||
899 | * than FM_MAXTICKS ticks have elapsed since the last filter event, | ||
900 | * just compute FM_MAXTICKS ticks worth, by which point the level | ||
901 | * will be stable. | ||
902 | * | ||
903 | * Limit the count of unprocessed events to FM_MAXCNT, so as to avoid | ||
904 | * arithmetic overflow in the fmeter_update() routine. | ||
905 | * | ||
906 | * Given the simple 32 bit integer arithmetic used, this meter works | ||
907 | * best for reporting rates between one per millisecond (msec) and | ||
908 | * one per 32 (approx) seconds. At constant rates faster than one | ||
909 | * per msec it maxes out at values just under 1,000,000. At constant | ||
910 | * rates between one per msec, and one per second it will stabilize | ||
911 | * to a value N*1000, where N is the rate of events per second. | ||
912 | * At constant rates between one per second and one per 32 seconds, | ||
913 | * it will be choppy, moving up on the seconds that have an event, | ||
914 | * and then decaying until the next event. At rates slower than | ||
915 | * about one in 32 seconds, it decays all the way back to zero between | ||
916 | * each event. | ||
917 | */ | ||
918 | |||
/* Tuning constants for the frequency meter's scaled integer filter. */
#define FM_SCALE	1000		/* faux fixed point scale */
#define FM_COEF		933		/* decay per tick, out of FM_SCALE: 10 sec half-life */
#define FM_MAXCNT	1000000		/* cap on cnt, to avoid overflow */
#define FM_MAXTICKS	((time_t)99)	/* decaying for more ticks than this is useless */
923 | |||
924 | /* Initialize a frequency meter */ | ||
925 | static void fmeter_init(struct fmeter *fmp) | ||
926 | { | ||
927 | fmp->cnt = 0; | ||
928 | fmp->val = 0; | ||
929 | fmp->time = 0; | ||
930 | spin_lock_init(&fmp->lock); | ||
931 | } | ||
932 | |||
933 | /* Internal meter update - process cnt events and update value */ | ||
934 | static void fmeter_update(struct fmeter *fmp) | ||
935 | { | ||
936 | time_t now = get_seconds(); | ||
937 | time_t ticks = now - fmp->time; | ||
938 | |||
939 | if (ticks == 0) | ||
940 | return; | ||
941 | |||
942 | ticks = min(FM_MAXTICKS, ticks); | ||
943 | while (ticks-- > 0) | ||
944 | fmp->val = (FM_COEF * fmp->val) / FM_SCALE; | ||
945 | fmp->time = now; | ||
946 | |||
947 | fmp->val += ((FM_SCALE - FM_COEF) * fmp->cnt) / FM_SCALE; | ||
948 | fmp->cnt = 0; | ||
949 | } | ||
950 | |||
951 | /* Process any previous ticks, then bump cnt by one (times scale). */ | ||
952 | static void fmeter_markevent(struct fmeter *fmp) | ||
953 | { | ||
954 | spin_lock(&fmp->lock); | ||
955 | fmeter_update(fmp); | ||
956 | fmp->cnt = min(FM_MAXCNT, fmp->cnt + FM_SCALE); | ||
957 | spin_unlock(&fmp->lock); | ||
958 | } | ||
959 | |||
960 | /* Process any previous ticks, then return current value. */ | ||
961 | static int fmeter_getrate(struct fmeter *fmp) | ||
962 | { | ||
963 | int val; | ||
964 | |||
965 | spin_lock(&fmp->lock); | ||
966 | fmeter_update(fmp); | ||
967 | val = fmp->val; | ||
968 | spin_unlock(&fmp->lock); | ||
969 | return val; | ||
970 | } | ||
971 | |||
972 | /* | ||
851 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly | 973 | * Attach task specified by pid in 'pidbuf' to cpuset 'cs', possibly |
852 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be | 974 | * writing the path of the old cpuset in 'ppathbuf' if it needs to be |
853 | * notified on release. | 975 | * notified on release. |
@@ -931,6 +1053,8 @@ typedef enum { | |||
931 | FILE_CPU_EXCLUSIVE, | 1053 | FILE_CPU_EXCLUSIVE, |
932 | FILE_MEM_EXCLUSIVE, | 1054 | FILE_MEM_EXCLUSIVE, |
933 | FILE_NOTIFY_ON_RELEASE, | 1055 | FILE_NOTIFY_ON_RELEASE, |
1056 | FILE_MEMORY_PRESSURE_ENABLED, | ||
1057 | FILE_MEMORY_PRESSURE, | ||
934 | FILE_TASKLIST, | 1058 | FILE_TASKLIST, |
935 | } cpuset_filetype_t; | 1059 | } cpuset_filetype_t; |
936 | 1060 | ||
@@ -984,6 +1108,12 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us | |||
984 | case FILE_MEMORY_MIGRATE: | 1108 | case FILE_MEMORY_MIGRATE: |
985 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); | 1109 | retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer); |
986 | break; | 1110 | break; |
1111 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1112 | retval = update_memory_pressure_enabled(cs, buffer); | ||
1113 | break; | ||
1114 | case FILE_MEMORY_PRESSURE: | ||
1115 | retval = -EACCES; | ||
1116 | break; | ||
987 | case FILE_TASKLIST: | 1117 | case FILE_TASKLIST: |
988 | retval = attach_task(cs, buffer, &pathbuf); | 1118 | retval = attach_task(cs, buffer, &pathbuf); |
989 | break; | 1119 | break; |
@@ -1087,6 +1217,12 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf, | |||
1087 | case FILE_MEMORY_MIGRATE: | 1217 | case FILE_MEMORY_MIGRATE: |
1088 | *s++ = is_memory_migrate(cs) ? '1' : '0'; | 1218 | *s++ = is_memory_migrate(cs) ? '1' : '0'; |
1089 | break; | 1219 | break; |
1220 | case FILE_MEMORY_PRESSURE_ENABLED: | ||
1221 | *s++ = cpuset_memory_pressure_enabled ? '1' : '0'; | ||
1222 | break; | ||
1223 | case FILE_MEMORY_PRESSURE: | ||
1224 | s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter)); | ||
1225 | break; | ||
1090 | default: | 1226 | default: |
1091 | retval = -EINVAL; | 1227 | retval = -EINVAL; |
1092 | goto out; | 1228 | goto out; |
@@ -1440,6 +1576,16 @@ static struct cftype cft_memory_migrate = { | |||
1440 | .private = FILE_MEMORY_MIGRATE, | 1576 | .private = FILE_MEMORY_MIGRATE, |
1441 | }; | 1577 | }; |
1442 | 1578 | ||
1579 | static struct cftype cft_memory_pressure_enabled = { | ||
1580 | .name = "memory_pressure_enabled", | ||
1581 | .private = FILE_MEMORY_PRESSURE_ENABLED, | ||
1582 | }; | ||
1583 | |||
1584 | static struct cftype cft_memory_pressure = { | ||
1585 | .name = "memory_pressure", | ||
1586 | .private = FILE_MEMORY_PRESSURE, | ||
1587 | }; | ||
1588 | |||
1443 | static int cpuset_populate_dir(struct dentry *cs_dentry) | 1589 | static int cpuset_populate_dir(struct dentry *cs_dentry) |
1444 | { | 1590 | { |
1445 | int err; | 1591 | int err; |
@@ -1456,6 +1602,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry) | |||
1456 | return err; | 1602 | return err; |
1457 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) | 1603 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0) |
1458 | return err; | 1604 | return err; |
1605 | if ((err = cpuset_add_file(cs_dentry, &cft_memory_pressure)) < 0) | ||
1606 | return err; | ||
1459 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) | 1607 | if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0) |
1460 | return err; | 1608 | return err; |
1461 | return 0; | 1609 | return 0; |
@@ -1491,6 +1639,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) | |||
1491 | INIT_LIST_HEAD(&cs->children); | 1639 | INIT_LIST_HEAD(&cs->children); |
1492 | atomic_inc(&cpuset_mems_generation); | 1640 | atomic_inc(&cpuset_mems_generation); |
1493 | cs->mems_generation = atomic_read(&cpuset_mems_generation); | 1641 | cs->mems_generation = atomic_read(&cpuset_mems_generation); |
1642 | fmeter_init(&cs->fmeter); | ||
1494 | 1643 | ||
1495 | cs->parent = parent; | 1644 | cs->parent = parent; |
1496 | 1645 | ||
@@ -1580,6 +1729,7 @@ int __init cpuset_init(void) | |||
1580 | top_cpuset.cpus_allowed = CPU_MASK_ALL; | 1729 | top_cpuset.cpus_allowed = CPU_MASK_ALL; |
1581 | top_cpuset.mems_allowed = NODE_MASK_ALL; | 1730 | top_cpuset.mems_allowed = NODE_MASK_ALL; |
1582 | 1731 | ||
1732 | fmeter_init(&top_cpuset.fmeter); | ||
1583 | atomic_inc(&cpuset_mems_generation); | 1733 | atomic_inc(&cpuset_mems_generation); |
1584 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); | 1734 | top_cpuset.mems_generation = atomic_read(&cpuset_mems_generation); |
1585 | 1735 | ||
@@ -1601,6 +1751,9 @@ int __init cpuset_init(void) | |||
1601 | top_cpuset.dentry = root; | 1751 | top_cpuset.dentry = root; |
1602 | root->d_inode->i_op = &cpuset_dir_inode_operations; | 1752 | root->d_inode->i_op = &cpuset_dir_inode_operations; |
1603 | err = cpuset_populate_dir(root); | 1753 | err = cpuset_populate_dir(root); |
1754 | /* memory_pressure_enabled is in root cpuset only */ | ||
1755 | if (err == 0) | ||
1756 | err = cpuset_add_file(root, &cft_memory_pressure_enabled); | ||
1604 | out: | 1757 | out: |
1605 | return err; | 1758 | return err; |
1606 | } | 1759 | } |
@@ -1891,6 +2044,42 @@ done: | |||
1891 | } | 2044 | } |
1892 | 2045 | ||
1893 | /* | 2046 | /* |
2047 | * Collection of memory_pressure is suppressed unless | ||
2048 | * this flag is enabled by writing "1" to the special | ||
2049 | * cpuset file 'memory_pressure_enabled' in the root cpuset. | ||
2050 | */ | ||
2051 | |||
2052 | int cpuset_memory_pressure_enabled; | ||
2053 | |||
2054 | /** | ||
2055 | * cpuset_memory_pressure_bump - keep stats of per-cpuset reclaims. | ||
2056 | * | ||
2057 | * Keep a running average of the rate of synchronous (direct) | ||
2058 | * page reclaim efforts initiated by tasks in each cpuset. | ||
2059 | * | ||
2060 | * This represents the rate at which some task in the cpuset | ||
2061 | * ran low on memory on all nodes it was allowed to use, and | ||
2063 | * had to enter the kernel's page reclaim code in an effort to | ||
2063 | * create more free memory by tossing clean pages or swapping | ||
2064 | * or writing dirty pages. | ||
2065 | * | ||
2066 | * Display to user space in the per-cpuset read-only file | ||
2067 | * "memory_pressure". Value displayed is an integer | ||
2068 | * representing the recent rate of entry into the synchronous | ||
2069 | * (direct) page reclaim by any task attached to the cpuset. | ||
2070 | **/ | ||
2071 | |||
2072 | void __cpuset_memory_pressure_bump(void) | ||
2073 | { | ||
2074 | struct cpuset *cs; | ||
2075 | |||
2076 | task_lock(current); | ||
2077 | cs = current->cpuset; | ||
2078 | fmeter_markevent(&cs->fmeter); | ||
2079 | task_unlock(current); | ||
2080 | } | ||
2081 | |||
2082 | /* | ||
1894 | * proc_cpuset_show() | 2083 | * proc_cpuset_show() |
1895 | * - Print tasks cpuset path into seq_file. | 2084 | * - Print tasks cpuset path into seq_file. |
1896 | * - Used for /proc/<pid>/cpuset. | 2085 | * - Used for /proc/<pid>/cpuset. |