mm: /proc/sys/vm/stat_refresh to force vmstat update

Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing. Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative some of the time. I have not yet worked out why, but
have no evidence that it's actually harmful. Punt for the moment by
just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Hugh Dickins <hughd@google.com> 2016-05-19 20:12:50 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2016-05-19 22:12:14 -0400
commit: 52b6f46bc163eef17ecba4cd552beeafe2b24453 (patch)
tree: f1f8a9dc258f548fcdb7fa63b9b9a5eebd17976f
parent: 9e18eb29356b7dfd55183bd42cf73919d1590835 (diff)
4 files changed, 85 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
 - panic_on_oom
 - percpu_pagelist_fraction
 - stat_interval
+- stat_refresh
 - swappiness
 - user_reserve_kbytes
 - vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
 
 ==============================================================
 
+stat_refresh
+
+Any read or write (by root only) flushes all the per-cpu vm statistics
+into their global totals, for more accurate reports when testing
+e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
+
+As a side-effect, it also checks for negative totals (elsewhere reported
+as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
+(At time of writing, a few stats are known sometimes to be found negative,
+with no ill effects: errors and warnings on these stats are suppressed.)
+
+==============================================================
+
 swappiness
 
 This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
 void cpu_vm_stats_fold(int cpu);
 void refresh_zone_stat_thresholds(void);
 
+struct ctl_table;
+int vmstat_refresh(struct ctl_table *, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos);
+
 void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
 
 int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
+        {
+                .procname       = "stat_refresh",
+                .data           = NULL,
+                .maxlen         = 0,
+                .mode           = 0600,
+                .proc_handler   = vmstat_refresh,
+        },
 #endif
 #ifdef CONFIG_MMU
        {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
 static cpumask_var_t cpu_stat_off;
 
+#ifdef CONFIG_PROC_FS
+static void refresh_vm_stats(struct work_struct *work)
+{
+        refresh_cpu_vm_stats(true);
+}
+
+int vmstat_refresh(struct ctl_table *table, int write,
+                   void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+        long val;
+        int err;
+        int i;
+
+        /*
+         * The regular update, every sysctl_stat_interval, may come later
+         * than expected: leaving a significant amount in per_cpu buckets.
+         * This is particularly misleading when checking a quantity of HUGE
+         * pages, immediately after running a test.  /proc/sys/vm/stat_refresh,
+         * which can equally be echo'ed to or cat'ted from (by root),
+         * can be used to update the stats just before reading them.
+         *
+         * Oh, and since global_page_state() etc. are so careful to hide
+         * transiently negative values, report an error here if any of
+         * the stats is negative, so we know to go looking for imbalance.
+         */
+        err = schedule_on_each_cpu(refresh_vm_stats);
+        if (err)
+                return err;
+        for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
+                val = atomic_long_read(&vm_stat[i]);
+                if (val < 0) {
+                        switch (i) {
+                        case NR_ALLOC_BATCH:
+                        case NR_PAGES_SCANNED:
+                                /*
+                                 * These are often seen to go negative in
+                                 * recent kernels, but not to go permanently
+                                 * negative.  Whilst it would be nicer not to
+                                 * have exceptions, rooting them out would be
+                                 * another task, of rather low priority.
+                                 */
+                                break;
+                        default:
+                                pr_warn("%s: %s %ld\n",
+                                        __func__, vmstat_text[i], val);
+                                err = -EINVAL;
+                                break;
+                        }
+                }
+        }
+        if (err)
+                return err;
+        if (write)
+                *ppos += *lenp;
+        else
+                *lenp = 0;
+        return 0;
+}
+#endif /* CONFIG_PROC_FS */
+
 static void vmstat_update(struct work_struct *w)
 {
        if (refresh_cpu_vm_stats(true)) {
author	Hugh Dickins <hughd@google.com>	2016-05-19 20:12:50 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2016-05-19 22:12:14 -0400
commit	52b6f46bc163eef17ecba4cd552beeafe2b24453 (patch)
tree	f1f8a9dc258f548fcdb7fa63b9b9a5eebd17976f
parent	9e18eb29356b7dfd55183bd42cf73919d1590835 (diff)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
57	- panic_on_oom	57	- panic_on_oom
58	- percpu_pagelist_fraction	58	- percpu_pagelist_fraction
59	- stat_interval	59	- stat_interval
		60	- stat_refresh
60	- swappiness	61	- swappiness
61	- user_reserve_kbytes	62	- user_reserve_kbytes
62	- vfs_cache_pressure	63	- vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
755		756
756	==============================================================	757	==============================================================
757		758
		759	stat_refresh
		760
		761	Any read or write (by root only) flushes all the per-cpu vm statistics
		762	into their global totals, for more accurate reports when testing
		763	e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
		764
		765	As a side-effect, it also checks for negative totals (elsewhere reported
		766	as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
		767	(At time of writing, a few stats are known sometimes to be found negative,
		768	with no ill effects: errors and warnings on these stats are suppressed.)
		769
		770	==============================================================
		771
758	swappiness	772	swappiness
759		773
760	This control is used to define how aggressive the kernel will swap	774	This control is used to define how aggressive the kernel will swap


diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
193	void cpu_vm_stats_fold(int cpu);	193	void cpu_vm_stats_fold(int cpu);
194	void refresh_zone_stat_thresholds(void);	194	void refresh_zone_stat_thresholds(void);
195		195
		196	struct ctl_table;
		197	int vmstat_refresh(struct ctl_table *, int write,
		198	void __user *buffer, size_t *lenp, loff_t *ppos);
		199
196	void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);	200	void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
197		201
198	int calculate_pressure_threshold(struct zone *zone);	202	int calculate_pressure_threshold(struct zone *zone);


diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
1521	.mode = 0644,	1521	.mode = 0644,
1522	.proc_handler = proc_dointvec_jiffies,	1522	.proc_handler = proc_dointvec_jiffies,
1523	},	1523	},
		1524	{
		1525	.procname = "stat_refresh",
		1526	.data = NULL,
		1527	.maxlen = 0,
		1528	.mode = 0600,
		1529	.proc_handler = vmstat_refresh,
		1530	},
1524	#endif	1531	#endif
1525	#ifdef CONFIG_MMU	1532	#ifdef CONFIG_MMU
1526	{	1533	{


diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1379	int sysctl_stat_interval __read_mostly = HZ;	1379	int sysctl_stat_interval __read_mostly = HZ;
1380	static cpumask_var_t cpu_stat_off;	1380	static cpumask_var_t cpu_stat_off;
1381		1381
		1382	#ifdef CONFIG_PROC_FS
		1383	static void refresh_vm_stats(struct work_struct *work)
		1384	{
		1385	refresh_cpu_vm_stats(true);
		1386	}
		1387
		1388	int vmstat_refresh(struct ctl_table *table, int write,
		1389	void __user *buffer, size_t *lenp, loff_t *ppos)
		1390	{
		1391	long val;
		1392	int err;
		1393	int i;
		1394
		1395	/*
		1396	* The regular update, every sysctl_stat_interval, may come later
		1397	* than expected: leaving a significant amount in per_cpu buckets.
		1398	* This is particularly misleading when checking a quantity of HUGE
		1399	* pages, immediately after running a test. /proc/sys/vm/stat_refresh,
		1400	* which can equally be echo'ed to or cat'ted from (by root),
		1401	* can be used to update the stats just before reading them.
		1402	*
		1403	* Oh, and since global_page_state() etc. are so careful to hide
		1404	* transiently negative values, report an error here if any of
		1405	* the stats is negative, so we know to go looking for imbalance.
		1406	*/
		1407	err = schedule_on_each_cpu(refresh_vm_stats);
		1408	if (err)
		1409	return err;
		1410	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
		1411	val = atomic_long_read(&vm_stat[i]);
		1412	if (val < 0) {
		1413	switch (i) {
		1414	case NR_ALLOC_BATCH:
		1415	case NR_PAGES_SCANNED:
		1416	/*
		1417	* These are often seen to go negative in
		1418	* recent kernels, but not to go permanently
		1419	* negative. Whilst it would be nicer not to
		1420	* have exceptions, rooting them out would be
		1421	* another task, of rather low priority.
		1422	*/
		1423	break;
		1424	default:
		1425	pr_warn("%s: %s %ld\n",
		1426	__func__, vmstat_text[i], val);
		1427	err = -EINVAL;
		1428	break;
		1429	}
		1430	}
		1431	}
		1432	if (err)
		1433	return err;
		1434	if (write)
		1435	*ppos += *lenp;
		1436	else
		1437	*lenp = 0;
		1438	return 0;
		1439	}
		1440	#endif /* CONFIG_PROC_FS */
		1441
1382	static void vmstat_update(struct work_struct *w)	1442	static void vmstat_update(struct work_struct *w)
1383	{	1443	{
1384	if (refresh_cpu_vm_stats(true)) {	1444	if (refresh_cpu_vm_stats(true)) {