diff options
author | Hugh Dickins <hughd@google.com> | 2016-05-19 20:12:50 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-05-19 22:12:14 -0400 |
commit | 52b6f46bc163eef17ecba4cd552beeafe2b24453 (patch) | |
tree | f1f8a9dc258f548fcdb7fa63b9b9a5eebd17976f | |
parent | 9e18eb29356b7dfd55183bd42cf73919d1590835 (diff) |
mm: /proc/sys/vm/stat_refresh to force vmstat update
Provide /proc/sys/vm/stat_refresh to force an immediate update of
per-cpu into global vmstats: useful to avoid a sleep(2) or whatever
before checking counts when testing. Originally added to work around a
bug which left counts stranded indefinitely on a cpu going idle (an
inaccuracy magnified when small below-batch numbers represent "huge"
amounts of memory), but I believe that bug is now fixed: nonetheless,
this is still a useful knob.
Its schedule_on_each_cpu() is probably too expensive just to fold into
reading /proc/meminfo itself: give this mode 0600 to prevent abuse.
Allow a write or a read to do the same: nothing to read, but "grep -h
Shmem /proc/sys/vm/stat_refresh /proc/meminfo" is convenient. Oh, and
since global_page_state() itself is careful to disguise any underflow as
0, hack in an "Invalid argument" and pr_warn() if a counter is negative
after the refresh - this helped to fix a misaccounting of
NR_ISOLATED_FILE in my migration code.
But on recent kernels, I find that NR_ALLOC_BATCH and NR_PAGES_SCANNED
often go negative, though not permanently. I have not yet worked out why, but
have no evidence that it's actually harmful. Punt for the moment by
just ignoring the anomaly on those.
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Yang Shi <yang.shi@linaro.org>
Cc: Ning Qu <quning@gmail.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Cc: Andres Lagar-Cavilla <andreslc@google.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | Documentation/sysctl/vm.txt | 14 | ||||
-rw-r--r-- | include/linux/vmstat.h | 4 | ||||
-rw-r--r-- | kernel/sysctl.c | 7 | ||||
-rw-r--r-- | mm/vmstat.c | 60 |
4 files changed, 85 insertions, 0 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 34a5fece3121..720355cbdf45 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt | |||
@@ -57,6 +57,7 @@ Currently, these files are in /proc/sys/vm: | |||
57 | - panic_on_oom | 57 | - panic_on_oom |
58 | - percpu_pagelist_fraction | 58 | - percpu_pagelist_fraction |
59 | - stat_interval | 59 | - stat_interval |
60 | - stat_refresh | ||
60 | - swappiness | 61 | - swappiness |
61 | - user_reserve_kbytes | 62 | - user_reserve_kbytes |
62 | - vfs_cache_pressure | 63 | - vfs_cache_pressure |
@@ -755,6 +756,19 @@ is 1 second. | |||
755 | 756 | ||
756 | ============================================================== | 757 | ============================================================== |
757 | 758 | ||
759 | stat_refresh | ||
760 | |||
761 | Any read or write (by root only) flushes all the per-cpu vm statistics | ||
762 | into their global totals, for more accurate reports when testing | ||
763 | e.g. cat /proc/sys/vm/stat_refresh /proc/meminfo | ||
764 | |||
765 | As a side-effect, it also checks for negative totals (elsewhere reported | ||
766 | as 0) and "fails" with EINVAL if any are found, with a warning in dmesg. | ||
767 | (At time of writing, a few stats are known sometimes to be found negative, | ||
768 | with no ill effects: errors and warnings on these stats are suppressed.) | ||
769 | |||
770 | ============================================================== | ||
771 | |||
758 | swappiness | 772 | swappiness |
759 | 773 | ||
760 | This control is used to define how aggressive the kernel will swap | 774 | This control is used to define how aggressive the kernel will swap |
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 73fae8c4a5fb..02fce415b3d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h | |||
@@ -193,6 +193,10 @@ void quiet_vmstat(void); | |||
193 | void cpu_vm_stats_fold(int cpu); | 193 | void cpu_vm_stats_fold(int cpu); |
194 | void refresh_zone_stat_thresholds(void); | 194 | void refresh_zone_stat_thresholds(void); |
195 | 195 | ||
196 | struct ctl_table; | ||
197 | int vmstat_refresh(struct ctl_table *, int write, | ||
198 | void __user *buffer, size_t *lenp, loff_t *ppos); | ||
199 | |||
196 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); | 200 | void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); |
197 | 201 | ||
198 | int calculate_pressure_threshold(struct zone *zone); | 202 | int calculate_pressure_threshold(struct zone *zone); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c8b318663525..2effd84d83e3 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -1521,6 +1521,13 @@ static struct ctl_table vm_table[] = { | |||
1521 | .mode = 0644, | 1521 | .mode = 0644, |
1522 | .proc_handler = proc_dointvec_jiffies, | 1522 | .proc_handler = proc_dointvec_jiffies, |
1523 | }, | 1523 | }, |
1524 | { | ||
1525 | .procname = "stat_refresh", | ||
1526 | .data = NULL, | ||
1527 | .maxlen = 0, | ||
1528 | .mode = 0600, | ||
1529 | .proc_handler = vmstat_refresh, | ||
1530 | }, | ||
1524 | #endif | 1531 | #endif |
1525 | #ifdef CONFIG_MMU | 1532 | #ifdef CONFIG_MMU |
1526 | { | 1533 | { |
diff --git a/mm/vmstat.c b/mm/vmstat.c index a7de9adacbd9..c831be32a1a3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c | |||
@@ -1379,6 +1379,66 @@ static DEFINE_PER_CPU(struct delayed_work, vmstat_work); | |||
1379 | int sysctl_stat_interval __read_mostly = HZ; | 1379 | int sysctl_stat_interval __read_mostly = HZ; |
1380 | static cpumask_var_t cpu_stat_off; | 1380 | static cpumask_var_t cpu_stat_off; |
1381 | 1381 | ||
1382 | #ifdef CONFIG_PROC_FS | ||
1383 | static void refresh_vm_stats(struct work_struct *work) | ||
1384 | { | ||
1385 | refresh_cpu_vm_stats(true); | ||
1386 | } | ||
1387 | |||
1388 | int vmstat_refresh(struct ctl_table *table, int write, | ||
1389 | void __user *buffer, size_t *lenp, loff_t *ppos) | ||
1390 | { | ||
1391 | long val; | ||
1392 | int err; | ||
1393 | int i; | ||
1394 | |||
1395 | /* | ||
1396 | * The regular update, every sysctl_stat_interval, may come later | ||
1397 | * than expected: leaving a significant amount in per_cpu buckets. | ||
1398 | * This is particularly misleading when checking a quantity of HUGE | ||
1399 | * pages, immediately after running a test. /proc/sys/vm/stat_refresh, | ||
1400 | * which can equally be echo'ed to or cat'ted from (by root), | ||
1401 | * can be used to update the stats just before reading them. | ||
1402 | * | ||
1403 | * Oh, and since global_page_state() etc. are so careful to hide | ||
1404 | * transiently negative values, report an error here if any of | ||
1405 | * the stats is negative, so we know to go looking for imbalance. | ||
1406 | */ | ||
1407 | err = schedule_on_each_cpu(refresh_vm_stats); | ||
1408 | if (err) | ||
1409 | return err; | ||
1410 | for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) { | ||
1411 | val = atomic_long_read(&vm_stat[i]); | ||
1412 | if (val < 0) { | ||
1413 | switch (i) { | ||
1414 | case NR_ALLOC_BATCH: | ||
1415 | case NR_PAGES_SCANNED: | ||
1416 | /* | ||
1417 | * These are often seen to go negative in | ||
1418 | * recent kernels, but not to go permanently | ||
1419 | * negative. Whilst it would be nicer not to | ||
1420 | * have exceptions, rooting them out would be | ||
1421 | * another task, of rather low priority. | ||
1422 | */ | ||
1423 | break; | ||
1424 | default: | ||
1425 | pr_warn("%s: %s %ld\n", | ||
1426 | __func__, vmstat_text[i], val); | ||
1427 | err = -EINVAL; | ||
1428 | break; | ||
1429 | } | ||
1430 | } | ||
1431 | } | ||
1432 | if (err) | ||
1433 | return err; | ||
1434 | if (write) | ||
1435 | *ppos += *lenp; | ||
1436 | else | ||
1437 | *lenp = 0; | ||
1438 | return 0; | ||
1439 | } | ||
1440 | #endif /* CONFIG_PROC_FS */ | ||
1441 | |||
1382 | static void vmstat_update(struct work_struct *w) | 1442 | static void vmstat_update(struct work_struct *w) |
1383 | { | 1443 | { |
1384 | if (refresh_cpu_vm_stats(true)) { | 1444 | if (refresh_cpu_vm_stats(true)) { |