author     Kemi Wang <kemi.wang@intel.com>                  2017-09-08 19:12:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-09-08 21:26:47 -0400
commit     3a321d2a3dde812142e06ab5c2f062ed860182a5 (patch)
tree       8ea5fb1684e34a0eedf2e2e9f669171f16622c2e
parent     fde26bed588918a11831841b219f74b20b32b080 (diff)
mm: change the call sites of numa statistics items
Patch series "Separate NUMA statistics from zone statistics", v2.

Each page allocation updates a set of per-zone statistics with a call to
zone_statistics().  As discussed at the 2017 MM summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.  The overhead comes from cache bouncing on the zone (NUMA
associated) counters, which are updated in parallel during multi-threaded
page allocation (pointed out by Dave Hansen).

A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf

To mitigate this overhead, this patchset separates the NUMA statistics
from the zone statistics framework and raises the NUMA counter threshold
to a fixed size of MAX_U16 - 2, because a small threshold greatly
increases how often the global counter is updated from the local per-cpu
counters (suggested by Ying Huang).  The rationale is that these
statistics counters do not need to be read often, unlike other VM
counters, so it is not a problem to use a large threshold and make
readers more expensive.

With this patchset we see a 31.3% drop in CPU cycles (537 --> 369, see
below) per single page allocation and reclaim on Jesper's page_bench03
benchmark.  Meanwhile, the patchset keeps the same style of virtual
memory statistics with little end-user-visible effect (the NUMA stats are
merely moved to show behind the zone page stats, see the first patch for
details).

I ran an experiment of concurrent single page allocation and reclaim
using Jesper's page_bench03 benchmark on a 2-socket Broadwell-based
server (88 processors with 126G memory), with different sizes of the pcp
counter threshold.

Benchmark provided by Jesper D Brouer (loop count increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

   Threshold   CPU cycles     Throughput(88 threads)
   32          799            241760478
   64          640            301628829
   125         537            358906028   <==> system by default
   256         468            412397590
   512         428            450550704
   4096        399            482520943
   20000       394            489009617
   30000       395            488017817
   65533       369(-31.3%)    521661345(+45.3%)   <==> with this patchset
   N/A         342(-36.3%)    562900157(+56.8%)   <==> disable zone_statistics

This patch (of 3):

NUMA statistics are separated from the zone statistics framework, and all
call sites of the NUMA stats are changed to use numa-stats-specific
functions.  There is no functional change except that the NUMA stats are
shown behind the zone page stats when users *read* the zone info.

E.g. cat /proc/zoneinfo

   ***Base***                          ***With this patch***
   nr_free_pages 3976                  nr_free_pages 3976
   nr_zone_inactive_anon 0             nr_zone_inactive_anon 0
   nr_zone_active_anon 0               nr_zone_active_anon 0
   nr_zone_inactive_file 0             nr_zone_inactive_file 0
   nr_zone_active_file 0               nr_zone_active_file 0
   nr_zone_unevictable 0               nr_zone_unevictable 0
   nr_zone_write_pending 0             nr_zone_write_pending 0
   nr_mlock 0                          nr_mlock 0
   nr_page_table_pages 0               nr_page_table_pages 0
   nr_kernel_stack 0                   nr_kernel_stack 0
   nr_bounce 0                         nr_bounce 0
   nr_zspages 0                        nr_zspages 0
   numa_hit 0                          *nr_free_cma 0*
   numa_miss 0                         numa_hit 0
   numa_foreign 0                      numa_miss 0
   numa_interleave 0                   numa_foreign 0
   numa_local 0                        numa_interleave 0
   numa_other 0                        numa_local 0
   *nr_free_cma 0*                     numa_other 0
   ...                                 ...
   vm stats threshold: 10              vm stats threshold: 10
   ...                                 ...

The next patch updates the numa stats counter size and threshold.
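For readers who want the counting scheme in isolation: the series relies on a
per-CPU differential that is folded into a shared atomic counter only when it
crosses a threshold, so the hot allocation path almost never touches the
shared cacheline and readers see a slightly stale value.  The following is a
minimal user-space sketch of that idea in plain C11; the names inc_numa_hit
and read_numa_hit are illustrative stand-ins (not kernel APIs), a thread-local
variable stands in for the kernel's per-cpu vm_numa_stat_diff[] and
numa_stat_threshold, and it simply flushes the whole diff instead of using the
kernel's overstep trick.

/*
 * Minimal user-space sketch of the threshold-based counter described above.
 * Illustrative only; these are not kernel interfaces.
 */
#include <stdatomic.h>
#include <stdio.h>

/* Shared counter: written rarely, read cheaply but only approximately. */
static atomic_long global_numa_hit;

/* Stand-ins for the per-cpu diff and its flush threshold. */
static _Thread_local long local_diff;
static const long threshold = 125;	/* the default pcp threshold in the table above */

/* Hot path: bump the local diff; touch the shared cacheline only when the
 * diff exceeds the threshold (simplified: flush everything and reset). */
static void inc_numa_hit(void)
{
	if (++local_diff > threshold) {
		atomic_fetch_add(&global_numa_hit, local_diff);
		local_diff = 0;
	}
}

/* Read path: may lag by up to `threshold` events per CPU/thread. */
static long read_numa_hit(void)
{
	return atomic_load(&global_numa_hit);
}

int main(void)
{
	for (int i = 0; i < 1000000; i++)
		inc_numa_hit();
	printf("approximate numa_hit: %ld\n", read_numa_hit());
	return 0;
}

Raising the threshold (the series later pins it near MAX_U16) makes flushes,
and hence writes to the shared cacheline, even rarer at the cost of staler
reads, which is acceptable because these counters are rarely consumed.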
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1503568801-21305-2-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Ying Huang <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  drivers/base/node.c       22
-rw-r--r--  include/linux/mmzone.h    25
-rw-r--r--  include/linux/vmstat.h    29
-rw-r--r--  mm/page_alloc.c           10
-rw-r--r--  mm/vmstat.c              161
5 files changed, 220 insertions, 27 deletions
diff --git a/drivers/base/node.c b/drivers/base/node.c
index d8dc83017d8d..3855902f2c5b 100644
--- a/drivers/base/node.c
+++ b/drivers/base/node.c
@@ -160,12 +160,12 @@ static ssize_t node_read_numastat(struct device *dev,
 		       "interleave_hit %lu\n"
 		       "local_node %lu\n"
 		       "other_node %lu\n",
-		       sum_zone_node_page_state(dev->id, NUMA_HIT),
-		       sum_zone_node_page_state(dev->id, NUMA_MISS),
-		       sum_zone_node_page_state(dev->id, NUMA_FOREIGN),
-		       sum_zone_node_page_state(dev->id, NUMA_INTERLEAVE_HIT),
-		       sum_zone_node_page_state(dev->id, NUMA_LOCAL),
-		       sum_zone_node_page_state(dev->id, NUMA_OTHER));
+		       sum_zone_numa_state(dev->id, NUMA_HIT),
+		       sum_zone_numa_state(dev->id, NUMA_MISS),
+		       sum_zone_numa_state(dev->id, NUMA_FOREIGN),
+		       sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
+		       sum_zone_numa_state(dev->id, NUMA_LOCAL),
+		       sum_zone_numa_state(dev->id, NUMA_OTHER));
 }
 static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);
 
@@ -181,9 +181,17 @@ static ssize_t node_read_vmstat(struct device *dev,
 		n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
 			     sum_zone_node_page_state(nid, i));
 
-	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
 		n += sprintf(buf+n, "%s %lu\n",
 			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+			     sum_zone_numa_state(nid, i));
+#endif
+
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		n += sprintf(buf+n, "%s %lu\n",
+			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+			     NR_VM_NUMA_STAT_ITEMS],
 			     node_page_state(pgdat, i));
 
 	return n;
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index e7e92c8f4883..e65d91c02e30 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -114,6 +114,20 @@ struct zone_padding {
 #define ZONE_PADDING(name)
 #endif
 
+#ifdef CONFIG_NUMA
+enum numa_stat_item {
+	NUMA_HIT,		/* allocated in intended node */
+	NUMA_MISS,		/* allocated in non intended node */
+	NUMA_FOREIGN,		/* was intended here, hit elsewhere */
+	NUMA_INTERLEAVE_HIT,	/* interleaver preferred this zone */
+	NUMA_LOCAL,		/* allocation from local node */
+	NUMA_OTHER,		/* allocation from other node */
+	NR_VM_NUMA_STAT_ITEMS
+};
+#else
+#define NR_VM_NUMA_STAT_ITEMS 0
+#endif
+
 enum zone_stat_item {
 	/* First 128 byte cacheline (assuming 64 bit words) */
 	NR_FREE_PAGES,
@@ -132,14 +146,6 @@ enum zone_stat_item {
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	NR_ZSPAGES,		/* allocated in zsmalloc */
 #endif
-#ifdef CONFIG_NUMA
-	NUMA_HIT,		/* allocated in intended node */
-	NUMA_MISS,		/* allocated in non intended node */
-	NUMA_FOREIGN,		/* was intended here, hit elsewhere */
-	NUMA_INTERLEAVE_HIT,	/* interleaver preferred this zone */
-	NUMA_LOCAL,		/* allocation from local node */
-	NUMA_OTHER,		/* allocation from other node */
-#endif
 	NR_FREE_CMA_PAGES,
 	NR_VM_ZONE_STAT_ITEMS };
 
@@ -276,6 +282,8 @@ struct per_cpu_pageset {
 	struct per_cpu_pages pcp;
 #ifdef CONFIG_NUMA
 	s8 expire;
+	s8 numa_stat_threshold;
+	s8 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS];
 #endif
 #ifdef CONFIG_SMP
 	s8 stat_threshold;
@@ -496,6 +504,7 @@ struct zone {
 	ZONE_PADDING(_pad3_)
 	/* Zone statistics */
 	atomic_long_t		vm_stat[NR_VM_ZONE_STAT_ITEMS];
+	atomic_long_t		vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;
 
 enum pgdat_flags {
diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h
index 97e11ab573f0..9ac82e29948f 100644
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -107,8 +107,33 @@ static inline void vm_events_fold_cpu(int cpu)
  * Zone and node-based page accounting with per cpu differentials.
  */
 extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
+extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
 extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];
 
+#ifdef CONFIG_NUMA
+static inline void zone_numa_state_add(long x, struct zone *zone,
+				 enum numa_stat_item item)
+{
+	atomic_long_add(x, &zone->vm_numa_stat[item]);
+	atomic_long_add(x, &vm_numa_stat[item]);
+}
+
+static inline unsigned long global_numa_state(enum numa_stat_item item)
+{
+	long x = atomic_long_read(&vm_numa_stat[item]);
+
+	return x;
+}
+
+static inline unsigned long zone_numa_state(struct zone *zone,
+					enum numa_stat_item item)
+{
+	long x = atomic_long_read(&zone->vm_numa_stat[item]);
+
+	return x;
+}
+#endif /* CONFIG_NUMA */
+
 static inline void zone_page_state_add(long x, struct zone *zone,
 				 enum zone_stat_item item)
 {
@@ -194,8 +219,10 @@ static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat,
 
 
 #ifdef CONFIG_NUMA
+extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
 extern unsigned long sum_zone_node_page_state(int node,
 						enum zone_stat_item item);
+extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
 extern unsigned long node_page_state(struct pglist_data *pgdat,
 						enum node_stat_item item);
 #else
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9add06fe768..45583cd8dd56 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2741,18 +2741,18 @@ int __isolate_free_page(struct page *page, unsigned int order)
 static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
 {
 #ifdef CONFIG_NUMA
-	enum zone_stat_item local_stat = NUMA_LOCAL;
+	enum numa_stat_item local_stat = NUMA_LOCAL;
 
 	if (z->node != numa_node_id())
 		local_stat = NUMA_OTHER;
 
 	if (z->node == preferred_zone->node)
-		__inc_zone_state(z, NUMA_HIT);
+		__inc_numa_state(z, NUMA_HIT);
 	else {
-		__inc_zone_state(z, NUMA_MISS);
-		__inc_zone_state(preferred_zone, NUMA_FOREIGN);
+		__inc_numa_state(z, NUMA_MISS);
+		__inc_numa_state(preferred_zone, NUMA_FOREIGN);
 	}
-	__inc_zone_state(z, local_stat);
+	__inc_numa_state(z, local_stat);
 #endif
 }
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7e4b8458023..daea02833e2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu)
  * vm_stat contains the global counters
  */
 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
@@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void)
 
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
-
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
 			/* Base nodestat threshold on the largest populated zone. */
 			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
 			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
+		}
 	}
 }
 
@@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state);
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{
+	int i;
+	int changes = 0;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (zone_diff[i]) {
+			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+			changes++;
+	}
+
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (numa_diff[i]) {
+			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+			changes++;
+	}
+
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		if (node_diff[i]) {
+			atomic_long_add(node_diff[i], &vm_node_stat[i]);
+			changes++;
+	}
+	return changes;
+}
+#else
 static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
@@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff)
 	}
 	return changes;
 }
+#endif /* CONFIG_NUMA */
 
 /*
  * Update the zone counters for the current cpu.
@@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 	int changes = 0;
 
@@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			}
 		}
 #ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+			int v;
+
+			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+			if (v) {
+
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+				__this_cpu_write(p->expire, 3);
+			}
+		}
+
 		if (do_pagesets) {
 			cond_resched();
 			/*
@@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 	}
 
+#ifdef CONFIG_NUMA
+	changes += fold_diff(global_zone_diff, global_numa_diff,
+			     global_node_diff);
+#else
 	changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
 	return changes;
 }
 
@@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
@@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu)
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 			}
+
+#ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+			if (p->vm_numa_stat_diff[i]) {
+				int v;
+
+				v = p->vm_numa_stat_diff[i];
+				p->vm_numa_stat_diff[i] = 0;
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+			}
+#endif
 	}
 
 	for_each_online_pgdat(pgdat) {
@@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu)
 		}
 	}
 
+#ifdef CONFIG_NUMA
+	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
 	fold_diff(global_zone_diff, global_node_diff);
+#endif
 }
 
 /*
@@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 			atomic_long_add(v, &zone->vm_stat[i]);
 			atomic_long_add(v, &vm_zone_stat[i]);
 		}
+
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (pset->vm_numa_stat_diff[i]) {
+			int v = pset->vm_numa_stat_diff[i];
+
+			pset->vm_numa_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_numa_stat[i]);
+			atomic_long_add(v, &vm_numa_stat[i]);
+		}
+#endif
 }
 #endif
 
 #ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+				 enum numa_stat_item item)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_numa_stat_diff + item;
+	s8 v, t;
+
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->numa_stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
+
+		zone_numa_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
+	}
+}
+
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
@@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node,
 	return count;
 }
 
+unsigned long sum_zone_numa_state(int node,
+				 enum numa_stat_item item)
+{
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	int i;
+	unsigned long count = 0;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		count += zone_numa_state(zones + i, item);
+
+	return count;
+}
+
 /*
  * Determine the per node value of a stat item.
  */
@@ -937,6 +1054,9 @@ const char * const vmstat_text[] = {
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
+	"nr_free_cma",
+
+	/* enum numa_stat_item counters */
 #ifdef CONFIG_NUMA
 	"numa_hit",
 	"numa_miss",
@@ -945,7 +1065,6 @@ const char * const vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
-	"nr_free_cma",
 
 	/* Node-based counters */
 	"nr_inactive_anon",
@@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = {
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
-
 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
 	defined(CONFIG_PROC_FS)
 static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n  per-node stats");
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 			seq_printf(m, "\n      %-12s %lu",
-				   vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				   vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+				   NR_VM_NUMA_STAT_ITEMS],
 				   node_page_state(pgdat, i));
 		}
 	}
@@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
 			   zone_page_state(zone, i));
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		seq_printf(m, "\n    %-12s %lu",
+			   vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+			   zone_numa_state(zone, i));
+#endif
+
 	seq_printf(m, "\n  pagesets");
 	for_each_online_cpu(i) {
 		struct per_cpu_pageset *pageset;
@@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+			  NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
@@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 		v[i] = global_zone_page_state(i);
 	v += NR_VM_ZONE_STAT_ITEMS;
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		v[i] = global_numa_state(i);
+	v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 		v[i] = global_node_page_state(i);
 	v += NR_VM_NODE_STAT_ITEMS;
@@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write,
 			err = -EINVAL;
 		}
 	}
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_numa_stat[i]);
+		if (val < 0) {
+			pr_warn("%s: %s %ld\n",
+				__func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+			err = -EINVAL;
+		}
+	}
+#endif
 	if (err)
 		return err;
 	if (write)
@@ -1654,13 +1797,19 @@ static bool need_update(int cpu)
 		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
 
 		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+#endif
 		/*
 		 * The fast way of checking if there are any vmstat diffs.
 		 * This works because the diffs are byte sized items.
 		 */
 		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
 			return true;
-
+#ifdef CONFIG_NUMA
+		if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+			return true;
+#endif
 	}
 	return false;
 }