aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Hugh Dickins <hughd@google.com> 2016-05-19 20:12:50 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2016-05-19 22:12:14 -0400
commit 52b6f46bc163eef17ecba4cd552beeafe2b24453 (patch)
tree f1f8a9dc258f548fcdb7fa63b9b9a5eebd17976f
parent 9e18eb29356b7dfd55183bd42cf73919d1590835 (diff)
mm: /proc/sys/vm/stat_refresh to force vmstat update
Provide /proc/sys/vm/stat_refresh to force an immediate update of per-cpu into global vmstats: useful to avoid a sleep(2) or whatever before checking counts when testing. Originally added to work around a bug which left counts stranded indefinitely on a cpu going idle (an inaccuracy magnified when small below-batch numbers represent "huge" amounts of memory), but I believe that bug is now fixed: nonetheless, this is still a useful knob.

Its schedule_on_each_cpu() is probably too expensive just to fold into reading /proc/meminfo itself: give this mode 0600 to prevent abuse. Allow a write or a read to do the same: nothing to read, but "grep -h Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient.

Oh, and since global_page_state() itself is careful to disguise any underflow as 0, hack in an "Invalid argument" and pr_warn() if a counter is negative after the refresh - this helped to fix a misaccounting of NR_ISOLATED_FILE in my migration code.

But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED often go negative some of the time. I have not yet worked out why, but have no evidence that it's actually harmful. Punt for the moment by just ignoring the anomaly on those.

Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- Documentation/sysctl/vm.txt 14
-rw-r--r-- include/linux/vmstat.h 4
-rw-r--r-- kernel/sysctl.c 7
-rw-r--r-- mm/vmstat.c 60
4 files changed, 85 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 34a5fece3121..720355cbdf45 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm:
57- panic_on_oom 57- panic_on_oom
58- percpu_pagelist_fraction 58- percpu_pagelist_fraction
59- stat_interval 59- stat_interval
60- stat_refresh
60- swappiness 61- swappiness
61- user_reserve_kbytes 62- user_reserve_kbytes
62- vfs_cache_pressure 63- vfs_cache_pressure
@@ -755,6 +756,19 @@ is 1 second.
755 756
756============================================================== 757==============================================================
757 758
759stat_refresh
760
761Any read or write (by root only) flushes all the per-cpu vm statistics
762into their global totals, for more accurate reports when testing
763e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo
764
765As a side-effect, it also checks for negative totals (elsewhere reported
766as 0) and "fails" with EINVAL if any are found, with a warning in dmesg.
767(At time of writing, a few stats are known sometimes to be found negative,
768with no ill effects: errors and warnings on these stats are suppressed.)
769
770==============================================================
771
758swappiness 772swappiness
759 773
760This control is used to define how aggressive the kernel will swap 774This control is used to define how aggressive the kernel will swap
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 73fae8c4a5fb..02fce415b3d9 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -193,6 +193,10 @@ void quiet_vmstat(void);
193void cpu_vm_stats_fold(int cpu); 193void cpu_vm_stats_fold(int cpu);
194void refresh_zone_stat_thresholds(void); 194void refresh_zone_stat_thresholds(void);
195 195
196struct ctl_table;
197int vmstat_refresh(struct ctl_table *, int write,
198 void __user *buffer, size_t *lenp, loff_t *ppos);
199
196void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); 200void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);
197 201
198int calculate_pressure_threshold(struct zone *zone); 202int calculate_pressure_threshold(struct zone *zone);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c8b318663525..2effd84d83e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = {
1521 .mode = 0644, 1521 .mode = 0644,
1522 .proc_handler = proc_dointvec_jiffies, 1522 .proc_handler = proc_dointvec_jiffies,
1523 }, 1523 },
1524 {
1525 .procname = "stat_refresh",
1526 .data = NULL,
1527 .maxlen = 0,
1528 .mode = 0600,
1529 .proc_handler = vmstat_refresh,
1530 },
1524#endif 1531#endif
1525#ifdef CONFIG_MMU 1532#ifdef CONFIG_MMU
1526 { 1533 {
diff --git a/mm/vmstat.c b/mm/vmstat.c
index a7de9adacbd9..c831be32a1a3 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
1379int sysctl_stat_interval __read_mostly = HZ; 1379int sysctl_stat_interval __read_mostly = HZ;
1380static cpumask_var_t cpu_stat_off; 1380static cpumask_var_t cpu_stat_off;
1381 1381
1382#ifdef CONFIG_PROC_FS
/*
 * Work callback handed to schedule_on_each_cpu() by vmstat_refresh().
 * It only calls refresh_cpu_vm_stats(true); the @work argument is unused.
 * NOTE(review): refresh_cpu_vm_stats() is defined outside this hunk -
 * presumably it folds the calling CPU's per-cpu deltas into the global
 * counters, as the commit message describes; confirm against its definition.
 */
1383static void refresh_vm_stats(struct work_struct *work)
1384{
1385 refresh_cpu_vm_stats(true);
1386}
1387
/*
 * vmstat_refresh - proc handler registered for the "stat_refresh" sysctl.
 *
 * Runs refresh_vm_stats() through schedule_on_each_cpu(), then scans
 * vm_stat[0 .. NR_VM_ZONE_STAT_ITEMS - 1] for negative totals.
 *
 * @table:  not used by this handler.
 * @write:  non-zero for a write; only selects how *ppos / *lenp are
 *          updated on success.
 * @buffer: never accessed - written data is ignored and nothing is
 *          copied out on a read.
 * @lenp:   set to 0 on a successful read.
 * @ppos:   advanced by *lenp on a successful write.
 *
 * Returns 0 on success, the error from schedule_on_each_cpu() if that
 * fails, or -EINVAL if any counter other than NR_ALLOC_BATCH or
 * NR_PAGES_SCANNED is negative. Each such counter is reported through
 * pr_warn() as "<function>: <vmstat_text[i]> <value>"; the scan does not
 * stop at the first one, so every offending counter gets a warning.
 */
1388int vmstat_refresh(struct ctl_table *table, int write,
1389 void __user *buffer, size_t *lenp, loff_t *ppos)
1390{
1391 long val;
1392 int err;
1393 int i;
1394
1395 /*
1396 * The regular update, every sysctl_stat_interval, may come later
1397 * than expected: leaving a significant amount in per_cpu buckets.
1398 * This is particularly misleading when checking a quantity of HUGE
1399 * pages, immediately after running a test. /proc/sys/vm/stat_refresh,
1400 * which can equally be echo'ed to or cat'ted from (by root),
1401 * can be used to update the stats just before reading them.
1402 *
1403 * Oh, and since global_page_state() etc. are so careful to hide
1404 * transiently negative values, report an error here if any of
1405 * the stats is negative, so we know to go looking for imbalance.
1406 */
1407 err = schedule_on_each_cpu(refresh_vm_stats);
1408 if (err)
1409 return err;
1410 for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
1411 val = atomic_long_read(&vm_stat[i]);
1412 if (val < 0) {
1413 switch (i) {
1414 case NR_ALLOC_BATCH:
1415 case NR_PAGES_SCANNED:
1416 /*
1417 * These are often seen to go negative in
1418 * recent kernels, but not to go permanently
1419 * negative. Whilst it would be nicer not to
1420 * have exceptions, rooting them out would be
1421 * another task, of rather low priority.
1422 */
1423 break;
1424 default:
1425 pr_warn("%s: %s %ld\n",
1426 __func__, vmstat_text[i], val);
1427 err = -EINVAL;
1428 break;
1429 }
1430 }
1431 }
/*
 * err is still 0 here unless the scan above flagged a negative counter;
 * on failure *lenp and *ppos are left untouched.
 */
1432 if (err)
1433 return err;
/* Write: account the whole request as consumed. Read: report no data. */
1434 if (write)
1435 *ppos += *lenp;
1436 else
1437 *lenp = 0;
1438 return 0;
1439}
1440#endif /* CONFIG_PROC_FS */
1441
1382static void vmstat_update(struct work_struct *w) 1442static void vmstat_update(struct work_struct *w)
1383{ 1443{
1384 if (refresh_cpu_vm_stats(true)) { 1444 if (refresh_cpu_vm_stats(true)) {