vmstat: use our own timer events

vmstat is currently using the cache reaper to periodically bring the statistics up to date. The cache reaper exists in SLUB only as a way to provide compatibility with SLAB. This patch removes the vmstat calls from the slab allocators and provides its own handling. The advantage is also that we can use a different frequency for the updates. Refreshing vm stats is a pretty fast job so we can run this every second and stagger this by only one tick. This will lead to some overlap in large systems. For example, a system running at 250 HZ with 1024 processors will have 4 vm updates occurring at once. However, the vm stats update only accesses per-node information. It is only necessary to stagger the vm statistics updates per processor in each node. Vm counter updates occurring on distant nodes will not cause cacheline contention. We could implement an alternate approach that runs the first processor on each node at the second and then each of the other processors on that node on a subsequent tick. That may be useful to keep a large amount of the second free of timer activity. Maybe the timer folks will have some feedback on this one? [jirislaby@gmail.com: add missing break] Cc: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Christoph Lameter <clameter@sgi.com> Signed-off-by: Jiri Slaby <jirislaby@gmail.com> Cc: Oleg Nesterov <oleg@tv-sign.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Christoph Lameter <clameter@sgi.com> 2007-05-09 05:35:12 -0400
committer: Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-05-09 15:30:56 -0400
commit: d1187ed21026fd512b87851d0ca26d9ae16f9059 (patch)
tree: 35d77758f134f3b69d3e00ca042a5d5ca6a59373 /mm
parent: 455c017ae3934797653549704c286e7bcc3a9397 (diff)
3 files changed, 36 insertions, 6 deletions
diff --git a/mm/slab.c b/mm/slab.c
index 6f3d6e240c61..e50908b2bfac 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -4156,7 +4156,6 @@ next:
        check_irq_on();
        mutex_unlock(&cache_chain_mutex);
        next_reap_node();
-        refresh_cpu_vm_stats(smp_processor_id());
 out:
        /* Set up the next iteration */
        schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
diff --git a/mm/slub.c b/mm/slub.c
index a581fa8ae11a..dbb206503a8d 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2580,7 +2580,6 @@ static DEFINE_PER_CPU(struct delayed_work, reap_work);
 static void cache_reap(struct work_struct *unused)
 {
        next_reap_node();
-        refresh_cpu_vm_stats(smp_processor_id());
        schedule_delayed_work(&__get_cpu_var(reap_work),
                                      REAPTIMEOUT_CPUC);
 }
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 9a66dc4aed43..9d824643a22f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -640,6 +640,22 @@ const struct seq_operations vmstat_op = {
 #endif /* CONFIG_PROC_FS */
 #ifdef CONFIG_SMP
+static DEFINE_PER_CPU(struct delayed_work, vmstat_work);
+static void vmstat_update(struct work_struct *w)
+{
+        refresh_cpu_vm_stats(smp_processor_id());
+        schedule_delayed_work(&__get_cpu_var(vmstat_work), HZ);
+}
+static void __devinit start_cpu_timer(int cpu)
+{
+        struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu);
+        INIT_DELAYED_WORK(vmstat_work, vmstat_update);
+        schedule_delayed_work_on(cpu, vmstat_work, HZ + cpu);
+}
 /*
 * Use the cpu notifier to insure that the thresholds are recalculated
 * when necessary.
@@ -648,11 +664,22 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
                unsigned long action,
                void *hcpu)
 {
+        long cpu = (long)hcpu;
        switch (action) {
-        case CPU_UP_PREPARE:
+        case CPU_ONLINE:
-        case CPU_UP_PREPARE_FROZEN:
+        case CPU_ONLINE_FROZEN:
-        case CPU_UP_CANCELED:
+                start_cpu_timer(cpu);
-        case CPU_UP_CANCELED_FROZEN:
+                break;
+        case CPU_DOWN_PREPARE:
+        case CPU_DOWN_PREPARE_FROZEN:
+                cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu));
+                per_cpu(vmstat_work, cpu).work.func = NULL;
+                break;
+        case CPU_DOWN_FAILED:
+        case CPU_DOWN_FAILED_FROZEN:
+                start_cpu_timer(cpu);
+                break;
        case CPU_DEAD:
        case CPU_DEAD_FROZEN:
                refresh_zone_stat_thresholds();
@@ -668,8 +695,13 @@ static struct notifier_block __cpuinitdata vmstat_notifier =
 int __init setup_vmstat(void)
 {
+        int cpu;
        refresh_zone_stat_thresholds();
        register_cpu_notifier(&vmstat_notifier);
+        for_each_online_cpu(cpu)
+                start_cpu_timer(cpu);
        return 0;
 }
 module_init(setup_vmstat)
author	Christoph Lameter <clameter@sgi.com>	2007-05-09 05:35:12 -0400
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>	2007-05-09 15:30:56 -0400
commit	d1187ed21026fd512b87851d0ca26d9ae16f9059 (patch)
tree	35d77758f134f3b69d3e00ca042a5d5ca6a59373 /mm
parent	455c017ae3934797653549704c286e7bcc3a9397 (diff)