author     Kemi Wang <kemi.wang@intel.com>                  2017-09-08 19:12:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-09-08 21:26:47 -0400
commit     3a321d2a3dde812142e06ab5c2f062ed860182a5 (patch)
tree       8ea5fb1684e34a0eedf2e2e9f669171f16622c2e /mm/vmstat.c
parent     fde26bed588918a11831841b219f74b20b32b080 (diff)
mm: change the call sites of numa statistics items
Patch series "Separate NUMA statistics from zone statistics", v2.

Each page allocation updates a set of per-zone statistics with a call to
zone_statistics().  As discussed at the 2017 MM summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.  The significant overhead comes from cache bouncing caused by the
zone counters (NUMA-associated counters) being updated in parallel during
multi-threaded page allocation (as pointed out by Dave Hansen).

A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf

To mitigate this overhead, this patchset separates the NUMA statistics from
the zone statistics framework and updates the NUMA counter threshold to a
fixed size of MAX_U16 - 2, since a small threshold greatly increases how
often the global counter is updated from the local per-cpu counters
(suggested by Ying Huang).  The rationale is that these statistics counters
don't need to be read often, unlike other VM counters, so it's not a problem
to use a large threshold and make the readers more expensive.

With this patchset, we see a 31.3% drop in CPU cycles (537 -> 369, see the
table below) per single page allocation and reclaim on Jesper's page_bench03
benchmark.  Meanwhile, the patchset keeps the same style of virtual memory
statistics with little end-user-visible effect (it only moves the NUMA stats
to be shown after the zone page stats; see the first patch for details).

I ran an experiment of concurrent single page allocation and reclaim using
Jesper's page_bench03 benchmark on a 2-socket Broadwell-based server (88
processors, 126G of memory) with different sizes of the pcp counter
threshold.

Benchmark provided by Jesper D. Brouer (loop count increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

  Threshold   CPU cycles      Throughput (88 threads)
      32        799           241760478
      64        640           301628829
     125        537           358906028             <==> system default
     256        468           412397590
     512        428           450550704
    4096        399           482520943
   20000        394           489009617
   30000        395           488017817
   65533        369 (-31.3%)  521661345 (+45.3%)    <==> with this patchset
     N/A        342 (-36.3%)  562900157 (+56.8%)    <==> zone_statistics() disabled

This patch (of 3):

In this patch, the NUMA statistics are separated from the zone statistics
framework and all NUMA stat call sites are changed to use
numa-stats-specific functions.  There is no functional change except that
the NUMA stats are shown after the zone page stats when users *read* the
zone info.

E.g. cat /proc/zoneinfo

    ***Base***                      ***With this patch***
  nr_free_pages 3976              nr_free_pages 3976
  nr_zone_inactive_anon 0         nr_zone_inactive_anon 0
  nr_zone_active_anon 0           nr_zone_active_anon 0
  nr_zone_inactive_file 0         nr_zone_inactive_file 0
  nr_zone_active_file 0           nr_zone_active_file 0
  nr_zone_unevictable 0           nr_zone_unevictable 0
  nr_zone_write_pending 0         nr_zone_write_pending 0
  nr_mlock 0                      nr_mlock 0
  nr_page_table_pages 0           nr_page_table_pages 0
  nr_kernel_stack 0               nr_kernel_stack 0
  nr_bounce 0                     nr_bounce 0
  nr_zspages 0                    nr_zspages 0
  numa_hit 0                      *nr_free_cma 0*
  numa_miss 0                     numa_hit 0
  numa_foreign 0                  numa_miss 0
  numa_interleave 0               numa_foreign 0
  numa_local 0                    numa_interleave 0
  numa_other 0                    numa_local 0
  *nr_free_cma 0*                 numa_other 0
  ...                             ...
  vm stats threshold: 10          vm stats threshold: 10
  ...                             ...

The next patch updates the numa stats counter size and threshold.
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1503568801-21305-2-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Ying Huang <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
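To make the batching scheme described above concrete, here is a small
userspace sketch of the pattern __inc_numa_state() follows in this patch: a
per-thread (standing in for per-cpu) diff is folded into a shared atomic
counter only once it crosses a threshold, overstepping by half the threshold
so the very next update does not fold again.  This is an illustrative sketch
only; the counter name and the threshold value are placeholders, not taken
from the kernel sources.

  /*
   * Userspace analogue of the per-cpu NUMA event counter: a thread-local
   * diff folded into the shared global counter only past a threshold.
   */
  #include <stdatomic.h>
  #include <stdio.h>

  #define NUMA_STAT_THRESHOLD 125   /* placeholder; the kernel derives this per zone */

  static atomic_long global_numa_hit;              /* stands in for vm_numa_stat[] */
  static _Thread_local int local_numa_hit_diff;    /* stands in for vm_numa_stat_diff[] */

  static void inc_numa_hit(void)
  {
          int v = ++local_numa_hit_diff;

          if (v > NUMA_STAT_THRESHOLD) {
                  /* Overstep by half the threshold so the next updates keep
                   * accumulating locally instead of folding immediately. */
                  int overstep = NUMA_STAT_THRESHOLD >> 1;

                  atomic_fetch_add(&global_numa_hit, v + overstep);
                  local_numa_hit_diff = -overstep;
          }
  }

  int main(void)
  {
          for (int i = 0; i < 1000; i++)
                  inc_numa_hit();

          /* A reader folds whatever is still pending locally. */
          atomic_fetch_add(&global_numa_hit, local_numa_hit_diff);
          local_numa_hit_diff = 0;

          printf("numa_hit %ld\n", atomic_load(&global_numa_hit));
          return 0;
  }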
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--  mm/vmstat.c  161
1 file changed, 155 insertions(+), 6 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7e4b8458023..daea02833e2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu)
  * vm_stat contains the global counters
  */
 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
@@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void)
 
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
-
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
 			/* Base nodestat threshold on the largest populated zone. */
 			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
 			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
+		}
 	}
 }
 
@@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state);
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{
+	int i;
+	int changes = 0;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (zone_diff[i]) {
+			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+			changes++;
+		}
+
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (numa_diff[i]) {
+			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+			changes++;
+		}
+
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		if (node_diff[i]) {
+			atomic_long_add(node_diff[i], &vm_node_stat[i]);
+			changes++;
+		}
+	return changes;
+}
+#else
 static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
@@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff)
 		}
 	return changes;
 }
+#endif /* CONFIG_NUMA */
 
 /*
  * Update the zone counters for the current cpu.
@@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 	int changes = 0;
 
@@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			}
 		}
 #ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+			int v;
+
+			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+			if (v) {
+
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+				__this_cpu_write(p->expire, 3);
+			}
+		}
+
 		if (do_pagesets) {
 			cond_resched();
 			/*
@@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 	}
 
+#ifdef CONFIG_NUMA
+	changes += fold_diff(global_zone_diff, global_numa_diff,
+			     global_node_diff);
+#else
 	changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
 	return changes;
 }
 
@@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
@@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu)
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 			}
+
+#ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+			if (p->vm_numa_stat_diff[i]) {
+				int v;
+
+				v = p->vm_numa_stat_diff[i];
+				p->vm_numa_stat_diff[i] = 0;
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+			}
+#endif
 	}
 
 	for_each_online_pgdat(pgdat) {
@@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu)
 			}
 	}
 
+#ifdef CONFIG_NUMA
+	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
 	fold_diff(global_zone_diff, global_node_diff);
+#endif
 }
 
 /*
@@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 			atomic_long_add(v, &zone->vm_stat[i]);
 			atomic_long_add(v, &vm_zone_stat[i]);
 		}
+
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (pset->vm_numa_stat_diff[i]) {
+			int v = pset->vm_numa_stat_diff[i];
+
+			pset->vm_numa_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_numa_stat[i]);
+			atomic_long_add(v, &vm_numa_stat[i]);
+		}
+#endif
 }
 #endif
 
 #ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+				 enum numa_stat_item item)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_numa_stat_diff + item;
+	s8 v, t;
+
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->numa_stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
+
+		zone_numa_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
+	}
+}
+
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
@@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node,
 	return count;
 }
 
+unsigned long sum_zone_numa_state(int node,
+				 enum numa_stat_item item)
+{
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	int i;
+	unsigned long count = 0;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		count += zone_numa_state(zones + i, item);
+
+	return count;
+}
+
 /*
  * Determine the per node value of a stat item.
  */
@@ -937,6 +1054,9 @@ const char * const vmstat_text[] = {
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
+	"nr_free_cma",
+
+	/* enum numa_stat_item counters */
 #ifdef CONFIG_NUMA
 	"numa_hit",
 	"numa_miss",
@@ -945,7 +1065,6 @@ const char * const vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
-	"nr_free_cma",
 
 	/* Node-based counters */
 	"nr_inactive_anon",
@@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = {
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
-
 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
 	defined(CONFIG_PROC_FS)
 static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n  per-node stats");
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 			seq_printf(m, "\n      %-12s %lu",
-				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+				NR_VM_NUMA_STAT_ITEMS],
 				node_page_state(pgdat, i));
 		}
 	}
@@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
 				zone_page_state(zone, i));
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		seq_printf(m, "\n    %-12s %lu",
+				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				zone_numa_state(zone, i));
+#endif
+
 	seq_printf(m, "\n  pagesets");
 	for_each_online_cpu(i) {
 		struct per_cpu_pageset *pageset;
@@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+			  NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
@@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 		v[i] = global_zone_page_state(i);
 	v += NR_VM_ZONE_STAT_ITEMS;
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		v[i] = global_numa_state(i);
+	v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 		v[i] = global_node_page_state(i);
 	v += NR_VM_NODE_STAT_ITEMS;
@@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write,
 			err = -EINVAL;
 		}
 	}
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_numa_stat[i]);
+		if (val < 0) {
+			pr_warn("%s: %s %ld\n",
+				__func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+			err = -EINVAL;
+		}
+	}
+#endif
 	if (err)
 		return err;
 	if (write)
@@ -1654,13 +1797,19 @@ static bool need_update(int cpu)
 		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
 
 		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+#endif
 		/*
 		 * The fast way of checking if there are any vmstat diffs.
 		 * This works because the diffs are byte sized items.
 		 */
 		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
 			return true;
-
+#ifdef CONFIG_NUMA
+		if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+			return true;
+#endif
 	}
 	return false;
 }
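
The hunks in zoneinfo_show_print() and vmstat_start() above rely on the
ordering of vmstat_text and of the /proc/vmstat output after this patch:
zone items first, then NUMA items, then node items, then writeback items.
A minimal sketch of that index arithmetic follows; the item counts are
made-up placeholders, only the ordering matters.

  #include <stdio.h>

  /* Placeholder counts; the kernel derives these from its stat item enums. */
  enum { NR_ZONE_ITEMS = 13, NR_NUMA_ITEMS = 6, NR_NODE_ITEMS = 30 };

  /* Name index of the i-th NUMA stat item: printed right after the zone items. */
  static int numa_text_index(int i)
  {
          return i + NR_ZONE_ITEMS;
  }

  /* Name index of the i-th node stat item: now also offset by the NUMA items. */
  static int node_text_index(int i)
  {
          return i + NR_ZONE_ITEMS + NR_NUMA_ITEMS;
  }

  int main(void)
  {
          printf("first NUMA item name at vmstat_text[%d]\n", numa_text_index(0));
          printf("first node item name at vmstat_text[%d]\n", node_text_index(0));
          return 0;
  }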