path: root/mm
author    Christoph Lameter <cl@gentwo.org>    2014-10-09 18:29:43 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2014-10-09 22:26:02 -0400
commit    7cc36bbddde5cd0c98f0c06e3304ab833d662565
tree      098a8f85268e13bb2137dc9193e094c73ab9dfa4 /mm
parent    f0d6d1f6ff6f8525cfa396ec1969b8f402391445
vmstat: on-demand vmstat workers V8
vmstat workers are used for folding counter differentials into the zone, per-node and global counters at certain time intervals. They currently run at defined intervals on all processors, which causes some holdoff for processors that need minimal intrusion by the OS. The current vmstat_update mechanism depends on a deferrable timer firing every other second by default, which registers a work queue item that runs on the local CPU, with the result that we have one interrupt and one additional schedulable task on each CPU every 2 seconds.

If a workload indeed causes VM activity, or multiple tasks are running on a CPU, then there are probably bigger issues to deal with. However, some workloads dedicate a CPU to a single CPU-bound task. This is done in high performance computing, in high frequency financial applications, and in networking (Intel DPDK, EZchip NPS), and with the advent of systems with more and more CPUs over time this may become more and more common, since when one has enough CPUs one cares less about efficiently sharing a CPU with other tasks and more about efficiently monopolizing a CPU per task.

The difference made by this timer firing and a workqueue kernel thread being scheduled every second can be enormous. An artificial test measuring the worst-case time to do a simple "i++" in an endless loop, on a bare metal system and under Linux on an isolated CPU with dynticks, with and without this patch, shows Linux matching the bare metal performance (~700 cycles) with this patch and losing by a couple of orders of magnitude (~200k cycles) without it[*]. The loss occurs for something that just calculates statistics. For networking applications, for example, this could be the difference between dropping packets and sustaining line rate.

Statistics are important and useful, but it would be great if there were a way to keep statistics gathering from producing a huge performance difference. This patch does just that.

This patch creates a vmstat shepherd worker that monitors the per-cpu differentials on all processors. If there are differentials on a processor, then a vmstat worker local to that processor is created. That worker will then start folding the diffs at regular intervals. Should the worker find that there is no work to be done, it will make the shepherd worker monitor the differentials again.

With this patch it is then possible to have periods longer than 2 seconds without any OS event on a "cpu" (hardware thread). The patch shows a very minor increase in system performance.
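The "worst case time for i++" measurement above boils down to spinning on a trivial operation and recording the largest gap between consecutive loop iterations. Below is a minimal userspace sketch of such a probe; it is an illustration only, not the exact benchmark behind the cycle counts quoted above. It uses clock_gettime() for portability instead of raw cycle counters, and the file name, duration, and clock choice are arbitrary assumptions.

/*
 * Rough userspace sketch of the "i++ in an endless loop" latency probe
 * described in the commit message.  Spin on a trivial increment and record
 * the worst-case gap between consecutive iterations.  Illustration only:
 * the numbers quoted above were taken with raw cycle counters, while this
 * sketch uses clock_gettime() so it stays portable.
 *
 * Build: gcc -O2 -o holdoff holdoff.c
 * Run:   taskset -c <isolated-cpu> ./holdoff
 */
#include <stdio.h>
#include <stdint.h>
#include <time.h>

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

int main(void)
{
	const uint64_t duration_ns = 10ull * 1000000000ull;	/* 10 seconds */
	uint64_t start = now_ns();
	uint64_t prev = start;
	uint64_t max_gap = 0;
	volatile unsigned long i = 0;	/* the "i++" from the description */

	while (now_ns() - start < duration_ns) {
		uint64_t t = now_ns();

		if (t - prev > max_gap)
			max_gap = t - prev;
		prev = t;
		i++;
	}

	printf("iterations: %lu, worst-case gap: %llu ns\n",
	       (unsigned long)i, (unsigned long long)max_gap);
	return 0;
}

Run on an isolated dyntick CPU (taskset plus isolcpus/nohz_full), periodic work such as the vmstat updater shows up as spikes in the reported worst-case gap; that is the holdoff the shepherd scheme is meant to avoid.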
hackbench -s 512 -l 2000 -g 15 -f 25 -P

Results before the patch:

Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.992
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.971
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 5.063

Hackbench after the patch:

Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.973
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.990
Running in process mode with 15 groups using 50 file descriptors each (== 750 tasks)
Each sender will pass 2000 messages of 512 bytes
Time: 4.993

[fengguang.wu@intel.com: cpu_stat_off can be static]
Signed-off-by: Christoph Lameter <cl@linux.com>
Reviewed-by: Gilad Ben-Yossef <gilad@benyossef.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tejun Heo <tj@kernel.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Mike Frysinger <vapier@gentoo.org>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Hakan Akkan <hakanakkan@gmail.com>
Cc: Max Krasnyansky <maxk@qti.qualcomm.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--  mm/vmstat.c  141
1 file changed, 120 insertions(+), 21 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index cce7c766da7a..1b12d390dc68 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -7,6 +7,7 @@
  * zoned VM statistics
  * Copyright (C) 2006 Silicon Graphics, Inc.,
  *		Christoph Lameter <christoph@lameter.com>
+ * Copyright (C) 2008-2014 Christoph Lameter
  */
 #include <linux/fs.h>
 #include <linux/mm.h>
@@ -14,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <linux/cpumask.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
 #include <linux/math64.h>
@@ -419,13 +421,22 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
 EXPORT_SYMBOL(dec_zone_page_state);
 #endif
 
-static inline void fold_diff(int *diff)
+
+/*
+ * Fold a differential into the global counters.
+ * Returns the number of counters updated.
+ */
+static int fold_diff(int *diff)
 {
 	int i;
+	int changes = 0;
 
 	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		if (diff[i])
+		if (diff[i]) {
 			atomic_long_add(diff[i], &vm_stat[i]);
+			changes++;
+	}
+	return changes;
 }
 
 /*
@@ -441,12 +452,15 @@ static inline void fold_diff(int *diff)
  * statistics in the remote zone struct as well as the global cachelines
  * with the global counters. These could cause remote node cache line
  * bouncing and will have to be only done when necessary.
+ *
+ * The function returns the number of global counters updated.
  */
-static void refresh_cpu_vm_stats(void)
+static int refresh_cpu_vm_stats(void)
 {
 	struct zone *zone;
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+	int changes = 0;
 
 	for_each_populated_zone(zone) {
 		struct per_cpu_pageset __percpu *p = zone->pageset;
@@ -486,15 +500,17 @@ static void refresh_cpu_vm_stats(void)
 			continue;
 		}
 
-
 		if (__this_cpu_dec_return(p->expire))
 			continue;
 
-		if (__this_cpu_read(p->pcp.count))
+		if (__this_cpu_read(p->pcp.count)) {
 			drain_zone_pages(zone, this_cpu_ptr(&p->pcp));
+			changes++;
+		}
 #endif
 	}
-	fold_diff(global_diff);
+	changes += fold_diff(global_diff);
+	return changes;
 }
 
 /*
@@ -1239,20 +1255,108 @@ static const struct file_operations proc_vmstat_file_operations = {
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
 int sysctl_stat_interval __read_mostly = HZ;
+static cpumask_var_t cpu_stat_off;
 
 static void vmstat_update(struct work_struct *w)
 {
-	refresh_cpu_vm_stats();
-	schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+	if (refresh_cpu_vm_stats())
+		/*
+		 * Counters were updated so we expect more updates
+		 * to occur in the future. Keep on running the
+		 * update worker thread.
+		 */
+		schedule_delayed_work(this_cpu_ptr(&vmstat_work),
+			round_jiffies_relative(sysctl_stat_interval));
+	else {
+		/*
+		 * We did not update any counters so the app may be in
+		 * a mode where it does not cause counter updates.
+		 * We may be uselessly running vmstat_update.
+		 * Defer the checking for differentials to the
+		 * shepherd thread on a different processor.
+		 */
+		int r;
+		/*
+		 * Shepherd work thread does not race since it never
+		 * changes the bit if its zero but the cpu
+		 * online / off line code may race if
+		 * worker threads are still allowed during
+		 * shutdown / startup.
+		 */
+		r = cpumask_test_and_set_cpu(smp_processor_id(),
+			cpu_stat_off);
+		VM_BUG_ON(r);
+	}
+}
+
+/*
+ * Check if the diffs for a certain cpu indicate that
+ * an update is needed.
+ */
+static bool need_update(int cpu)
+{
+	struct zone *zone;
+
+	for_each_populated_zone(zone) {
+		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
+
+		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+		/*
+		 * The fast way of checking if there are any vmstat diffs.
+		 * This works because the diffs are byte sized items.
+		 */
+		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+			return true;
+
+	}
+	return false;
+}
+
+
+/*
+ * Shepherd worker thread that checks the
+ * differentials of processors that have their worker
+ * threads for vm statistics updates disabled because of
+ * inactivity.
+ */
+static void vmstat_shepherd(struct work_struct *w);
+
+static DECLARE_DELAYED_WORK(shepherd, vmstat_shepherd);
+
+static void vmstat_shepherd(struct work_struct *w)
+{
+	int cpu;
+
+	get_online_cpus();
+	/* Check processors whose vmstat worker threads have been disabled */
+	for_each_cpu(cpu, cpu_stat_off)
+		if (need_update(cpu) &&
+			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+
+			schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
+				__round_jiffies_relative(sysctl_stat_interval, cpu));
+
+	put_online_cpus();
+
+	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
+
 }
 
-static void start_cpu_timer(int cpu)
+static void __init start_shepherd_timer(void)
 {
-	struct delayed_work *work = &per_cpu(vmstat_work, cpu);
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+			vmstat_update);
+
+	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
+		BUG();
+	cpumask_copy(cpu_stat_off, cpu_online_mask);
 
-	INIT_DEFERRABLE_WORK(work, vmstat_update);
-	schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
+	schedule_delayed_work(&shepherd,
+		round_jiffies_relative(sysctl_stat_interval));
 }
 
 static void vmstat_cpu_dead(int node)
@@ -1283,17 +1387,17 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
 		refresh_zone_stat_thresholds();
-		start_cpu_timer(cpu);
 		node_set_state(cpu_to_node(cpu), N_CPU);
+		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_PREPARE:
 	case CPU_DOWN_PREPARE_FROZEN:
 		cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu));
-		per_cpu(vmstat_work, cpu).work.func = NULL;
+		cpumask_clear_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DOWN_FAILED:
 	case CPU_DOWN_FAILED_FROZEN:
-		start_cpu_timer(cpu);
+		cpumask_set_cpu(cpu, cpu_stat_off);
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
@@ -1313,15 +1417,10 @@ static struct notifier_block vmstat_notifier =
 static int __init setup_vmstat(void)
 {
 #ifdef CONFIG_SMP
-	int cpu;
-
 	cpu_notifier_register_begin();
 	__register_cpu_notifier(&vmstat_notifier);
 
-	for_each_online_cpu(cpu) {
-		start_cpu_timer(cpu);
-		node_set_state(cpu_to_node(cpu), N_CPU);
-	}
+	start_shepherd_timer();
 	cpu_notifier_register_done();
 #endif
 #ifdef CONFIG_PROC_FS