author     Kemi Wang <kemi.wang@intel.com>                  2017-09-08 19:12:48 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2017-09-08 21:26:47 -0400
commit     3a321d2a3dde812142e06ab5c2f062ed860182a5 (patch)
tree       8ea5fb1684e34a0eedf2e2e9f669171f16622c2e /mm/vmstat.c
parent     fde26bed588918a11831841b219f74b20b32b080 (diff)
mm: change the call sites of numa statistics items
Patch series "Separate NUMA statistics from zone statistics", v2.

Each page allocation updates a set of per-zone statistics with a call to
zone_statistics().  As discussed at the 2017 MM summit, these are a
substantial source of overhead in the page allocator and are very rarely
consumed.  The significant overhead comes from cache bouncing caused by the
zone counters (NUMA-associated counters) being updated in parallel during
multi-threaded page allocation (as pointed out by Dave Hansen).

A link to the MM summit slides:
http://people.netfilter.org/hawk/presentations/MM-summit2017/MM-summit2017-JesperBrouer.pdf

To mitigate this overhead, this patchset separates the NUMA statistics from
the zone statistics framework and updates the NUMA counter threshold to a
fixed size of MAX_U16 - 2, since a small threshold greatly increases how
often the global counter is updated from the local per-cpu counters
(suggested by Ying Huang).  The rationale is that these statistics counters
don't need to be read often, unlike other VM counters, so it's not a problem
to use a large threshold and make the readers more expensive.

With this patchset, we see a 31.3% drop in CPU cycles (537 -> 369, see the
table below) per single page allocation and reclaim on Jesper's page_bench03
benchmark.  Meanwhile, the patchset keeps the same style of virtual memory
statistics with little end-user-visible effect (it only moves the NUMA stats
to be shown after the zone page stats; see the first patch for details).

I ran an experiment of concurrent single page allocation and reclaim using
Jesper's page_bench03 benchmark on a 2-socket Broadwell-based server (88
processors, 126G of memory) with different sizes of the pcp counter
threshold.

Benchmark provided by Jesper D. Brouer (loop count increased to 10000000):
https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench

  Threshold   CPU cycles      Throughput (88 threads)
      32        799           241760478
      64        640           301628829
     125        537           358906028             <==> system default
     256        468           412397590
     512        428           450550704
    4096        399           482520943
   20000        394           489009617
   30000        395           488017817
   65533        369 (-31.3%)  521661345 (+45.3%)    <==> with this patchset
     N/A        342 (-36.3%)  562900157 (+56.8%)    <==> zone_statistics() disabled

This patch (of 3):

In this patch, the NUMA statistics are separated from the zone statistics
framework and all NUMA stat call sites are changed to use
numa-stats-specific functions.  There is no functional change except that
the NUMA stats are shown after the zone page stats when users *read* the
zone info.

E.g. cat /proc/zoneinfo

    ***Base***                      ***With this patch***
  nr_free_pages 3976              nr_free_pages 3976
  nr_zone_inactive_anon 0         nr_zone_inactive_anon 0
  nr_zone_active_anon 0           nr_zone_active_anon 0
  nr_zone_inactive_file 0         nr_zone_inactive_file 0
  nr_zone_active_file 0           nr_zone_active_file 0
  nr_zone_unevictable 0           nr_zone_unevictable 0
  nr_zone_write_pending 0         nr_zone_write_pending 0
  nr_mlock 0                      nr_mlock 0
  nr_page_table_pages 0           nr_page_table_pages 0
  nr_kernel_stack 0               nr_kernel_stack 0
  nr_bounce 0                     nr_bounce 0
  nr_zspages 0                    nr_zspages 0
  numa_hit 0                      *nr_free_cma 0*
  numa_miss 0                     numa_hit 0
  numa_foreign 0                  numa_miss 0
  numa_interleave 0               numa_foreign 0
  numa_local 0                    numa_interleave 0
  numa_other 0                    numa_local 0
  *nr_free_cma 0*                 numa_other 0
  ...                             ...
  vm stats threshold: 10          vm stats threshold: 10
  ...                             ...

The next patch updates the numa stats counter size and threshold.
[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/1503568801-21305-2-git-send-email-kemi.wang@intel.com
Signed-off-by: Kemi Wang <kemi.wang@intel.com>
Reported-by: Jesper Dangaard Brouer <brouer@redhat.com>
Acked-by: Mel Gorman <mgorman@techsingularity.net>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Christopher Lameter <cl@linux.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Ying Huang <ying.huang@intel.com>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Tim Chen <tim.c.chen@intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
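To make the batching scheme described above concrete, here is a small
userspace sketch of the pattern __inc_numa_state() follows in this patch: a
per-thread (standing in for per-cpu) diff is folded into a shared atomic
counter only once it crosses a threshold, overstepping by half the threshold
so the very next update does not fold again.  This is an illustrative sketch
only; the counter name and the threshold value are placeholders, not taken
from the kernel sources.

  /*
   * Userspace analogue of the per-cpu NUMA event counter: a thread-local
   * diff folded into the shared global counter only past a threshold.
   */
  #include <stdatomic.h>
  #include <stdio.h>

  #define NUMA_STAT_THRESHOLD 125   /* placeholder; the kernel derives this per zone */

  static atomic_long global_numa_hit;              /* stands in for vm_numa_stat[] */
  static _Thread_local int local_numa_hit_diff;    /* stands in for vm_numa_stat_diff[] */

  static void inc_numa_hit(void)
  {
          int v = ++local_numa_hit_diff;

          if (v > NUMA_STAT_THRESHOLD) {
                  /* Overstep by half the threshold so the next updates keep
                   * accumulating locally instead of folding immediately. */
                  int overstep = NUMA_STAT_THRESHOLD >> 1;

                  atomic_fetch_add(&global_numa_hit, v + overstep);
                  local_numa_hit_diff = -overstep;
          }
  }

  int main(void)
  {
          for (int i = 0; i < 1000; i++)
                  inc_numa_hit();

          /* A reader folds whatever is still pending locally. */
          atomic_fetch_add(&global_numa_hit, local_numa_hit_diff);
          local_numa_hit_diff = 0;

          printf("numa_hit %ld\n", atomic_load(&global_numa_hit));
          return 0;
  }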
Diffstat (limited to 'mm/vmstat.c')
-rw-r--r--  mm/vmstat.c  161
1 file changed, 155 insertions(+), 6 deletions(-)
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7e4b8458023..daea02833e2e 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -87,8 +87,10 @@ void vm_events_fold_cpu(int cpu)
  * vm_stat contains the global counters
  */
 atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
+atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS] __cacheline_aligned_in_smp;
 atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
 EXPORT_SYMBOL(vm_zone_stat);
+EXPORT_SYMBOL(vm_numa_stat);
 EXPORT_SYMBOL(vm_node_stat);
 
 #ifdef CONFIG_SMP
@@ -192,7 +194,10 @@ void refresh_zone_stat_thresholds(void)
 
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
-
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
 			/* Base nodestat threshold on the largest populated zone. */
 			pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
 			per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
@@ -226,9 +231,14 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat,
 			continue;
 
 		threshold = (*calculate_pressure)(zone);
-		for_each_online_cpu(cpu)
+		for_each_online_cpu(cpu) {
 			per_cpu_ptr(zone->pageset, cpu)->stat_threshold
 							= threshold;
+#ifdef CONFIG_NUMA
+			per_cpu_ptr(zone->pageset, cpu)->numa_stat_threshold
+							= threshold;
+#endif
+		}
 	}
 }
 
@@ -604,6 +614,32 @@ EXPORT_SYMBOL(dec_node_page_state);
  * Fold a differential into the global counters.
  * Returns the number of counters updated.
  */
+#ifdef CONFIG_NUMA
+static int fold_diff(int *zone_diff, int *numa_diff, int *node_diff)
+{
+	int i;
+	int changes = 0;
+
+	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+		if (zone_diff[i]) {
+			atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
+			changes++;
+		}
+
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (numa_diff[i]) {
+			atomic_long_add(numa_diff[i], &vm_numa_stat[i]);
+			changes++;
+		}
+
+	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
+		if (node_diff[i]) {
+			atomic_long_add(node_diff[i], &vm_node_stat[i]);
+			changes++;
+		}
+	return changes;
+}
+#else
 static int fold_diff(int *zone_diff, int *node_diff)
 {
 	int i;
@@ -622,6 +658,7 @@ static int fold_diff(int *zone_diff, int *node_diff)
 		}
 	return changes;
 }
+#endif /* CONFIG_NUMA */
 
 /*
  * Update the zone counters for the current cpu.
@@ -645,6 +682,9 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 	int changes = 0;
 
@@ -666,6 +706,18 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 			}
 		}
 #ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+			int v;
+
+			v = this_cpu_xchg(p->vm_numa_stat_diff[i], 0);
+			if (v) {
+
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+				__this_cpu_write(p->expire, 3);
+			}
+		}
+
 		if (do_pagesets) {
 			cond_resched();
 			/*
@@ -712,7 +764,12 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
 		}
 	}
 
+#ifdef CONFIG_NUMA
+	changes += fold_diff(global_zone_diff, global_numa_diff,
+			     global_node_diff);
+#else
 	changes += fold_diff(global_zone_diff, global_node_diff);
+#endif
 	return changes;
 }
 
@@ -727,6 +784,9 @@ void cpu_vm_stats_fold(int cpu)
 	struct zone *zone;
 	int i;
 	int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
+#ifdef CONFIG_NUMA
+	int global_numa_diff[NR_VM_NUMA_STAT_ITEMS] = { 0, };
+#endif
 	int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
 
 	for_each_populated_zone(zone) {
@@ -743,6 +803,18 @@ void cpu_vm_stats_fold(int cpu)
 				atomic_long_add(v, &zone->vm_stat[i]);
 				global_zone_diff[i] += v;
 			}
+
+#ifdef CONFIG_NUMA
+		for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+			if (p->vm_numa_stat_diff[i]) {
+				int v;
+
+				v = p->vm_numa_stat_diff[i];
+				p->vm_numa_stat_diff[i] = 0;
+				atomic_long_add(v, &zone->vm_numa_stat[i]);
+				global_numa_diff[i] += v;
+			}
+#endif
 	}
 
 	for_each_online_pgdat(pgdat) {
@@ -761,7 +833,11 @@ void cpu_vm_stats_fold(int cpu)
 			}
 	}
 
+#ifdef CONFIG_NUMA
+	fold_diff(global_zone_diff, global_numa_diff, global_node_diff);
+#else
 	fold_diff(global_zone_diff, global_node_diff);
+#endif
 }
 
 /*
@@ -779,10 +855,38 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
 			atomic_long_add(v, &zone->vm_stat[i]);
 			atomic_long_add(v, &vm_zone_stat[i]);
 		}
+
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		if (pset->vm_numa_stat_diff[i]) {
+			int v = pset->vm_numa_stat_diff[i];
+
+			pset->vm_numa_stat_diff[i] = 0;
+			atomic_long_add(v, &zone->vm_numa_stat[i]);
+			atomic_long_add(v, &vm_numa_stat[i]);
+		}
+#endif
 }
 #endif
 
 #ifdef CONFIG_NUMA
+void __inc_numa_state(struct zone *zone,
+				 enum numa_stat_item item)
+{
+	struct per_cpu_pageset __percpu *pcp = zone->pageset;
+	s8 __percpu *p = pcp->vm_numa_stat_diff + item;
+	s8 v, t;
+
+	v = __this_cpu_inc_return(*p);
+	t = __this_cpu_read(pcp->numa_stat_threshold);
+	if (unlikely(v > t)) {
+		s8 overstep = t >> 1;
+
+		zone_numa_state_add(v + overstep, zone, item);
+		__this_cpu_write(*p, -overstep);
+	}
+}
+
 /*
  * Determine the per node value of a stat item. This function
  * is called frequently in a NUMA machine, so try to be as
@@ -801,6 +905,19 @@ unsigned long sum_zone_node_page_state(int node,
 	return count;
 }
 
+unsigned long sum_zone_numa_state(int node,
+				 enum numa_stat_item item)
+{
+	struct zone *zones = NODE_DATA(node)->node_zones;
+	int i;
+	unsigned long count = 0;
+
+	for (i = 0; i < MAX_NR_ZONES; i++)
+		count += zone_numa_state(zones + i, item);
+
+	return count;
+}
+
 /*
  * Determine the per node value of a stat item.
  */
@@ -937,6 +1054,9 @@ const char * const vmstat_text[] = {
 #if IS_ENABLED(CONFIG_ZSMALLOC)
 	"nr_zspages",
 #endif
+	"nr_free_cma",
+
+	/* enum numa_stat_item counters */
 #ifdef CONFIG_NUMA
 	"numa_hit",
 	"numa_miss",
@@ -945,7 +1065,6 @@ const char * const vmstat_text[] = {
 	"numa_local",
 	"numa_other",
 #endif
-	"nr_free_cma",
 
 	/* Node-based counters */
 	"nr_inactive_anon",
@@ -1106,7 +1225,6 @@ const char * const vmstat_text[] = {
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
 
-
 #if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
 	defined(CONFIG_PROC_FS)
 static void *frag_start(struct seq_file *m, loff_t *pos)
@@ -1384,7 +1502,8 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n  per-node stats");
 		for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
 			seq_printf(m, "\n      %-12s %lu",
-				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
+				NR_VM_NUMA_STAT_ITEMS],
 				node_page_state(pgdat, i));
 		}
 	}
@@ -1421,6 +1540,13 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 		seq_printf(m, "\n    %-12s %lu", vmstat_text[i],
 				zone_page_state(zone, i));
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		seq_printf(m, "\n    %-12s %lu",
+				vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
+				zone_numa_state(zone, i));
+#endif
+
 	seq_printf(m, "\n  pagesets");
 	for_each_online_cpu(i) {
 		struct per_cpu_pageset *pageset;
@@ -1497,6 +1623,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	if (*pos >= ARRAY_SIZE(vmstat_text))
 		return NULL;
 	stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
+			  NR_VM_NUMA_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
 			  NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
 
@@ -1512,6 +1639,12 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 		v[i] = global_zone_page_state(i);
 	v += NR_VM_ZONE_STAT_ITEMS;
 
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
+		v[i] = global_numa_state(i);
+	v += NR_VM_NUMA_STAT_ITEMS;
+#endif
+
 	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
 		v[i] = global_node_page_state(i);
 	v += NR_VM_NODE_STAT_ITEMS;
@@ -1613,6 +1746,16 @@ int vmstat_refresh(struct ctl_table *table, int write,
 			err = -EINVAL;
 		}
 	}
+#ifdef CONFIG_NUMA
+	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++) {
+		val = atomic_long_read(&vm_numa_stat[i]);
+		if (val < 0) {
+			pr_warn("%s: %s %ld\n",
+				__func__, vmstat_text[i + NR_VM_ZONE_STAT_ITEMS], val);
+			err = -EINVAL;
+		}
+	}
+#endif
 	if (err)
 		return err;
 	if (write)
@@ -1654,13 +1797,19 @@ static bool need_update(int cpu)
 		struct per_cpu_pageset *p = per_cpu_ptr(zone->pageset, cpu);
 
 		BUILD_BUG_ON(sizeof(p->vm_stat_diff[0]) != 1);
+#ifdef CONFIG_NUMA
+		BUILD_BUG_ON(sizeof(p->vm_numa_stat_diff[0]) != 1);
+#endif
 		/*
 		 * The fast way of checking if there are any vmstat diffs.
 		 * This works because the diffs are byte sized items.
 		 */
 		if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
 			return true;
-
+#ifdef CONFIG_NUMA
+		if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+			return true;
+#endif
 	}
 	return false;
 }
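
The hunks in zoneinfo_show_print() and vmstat_start() above rely on the
ordering of vmstat_text and of the /proc/vmstat output after this patch:
zone items first, then NUMA items, then node items, then writeback items.
A minimal sketch of that index arithmetic follows; the item counts are
made-up placeholders, only the ordering matters.

  #include <stdio.h>

  /* Placeholder counts; the kernel derives these from its stat item enums. */
  enum { NR_ZONE_ITEMS = 13, NR_NUMA_ITEMS = 6, NR_NODE_ITEMS = 30 };

  /* Name index of the i-th NUMA stat item: printed right after the zone items. */
  static int numa_text_index(int i)
  {
          return i + NR_ZONE_ITEMS;
  }

  /* Name index of the i-th node stat item: now also offset by the NUMA items. */
  static int node_text_index(int i)
  {
          return i + NR_ZONE_ITEMS + NR_NUMA_ITEMS;
  }

  int main(void)
  {
          printf("first NUMA item name at vmstat_text[%d]\n", numa_text_index(0));
          printf("first node item name at vmstat_text[%d]\n", node_text_index(0));
          return 0;
  }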