aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/kernel-parameters.txt5
-rw-r--r--Documentation/sysctl/vm.txt45
-rw-r--r--include/linux/mmzone.h5
-rw-r--r--kernel/sysctl.c11
-rw-r--r--mm/page_alloc.c273
5 files changed, 317 insertions, 22 deletions
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 62aab585d9d7..4344f69ae24a 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1196,6 +1196,11 @@ and is between 256 and 4096 characters. It is defined in the file
1196 1196
1197 nowb [ARM] 1197 nowb [ARM]
1198 1198
1199 numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
1200 one of ['zone', 'node', 'default'] can be specified
1201 This can be set from sysctl after boot.
1202 See Documentation/sysctl/vm.txt for details.
1203
1199 nr_uarts= [SERIAL] maximum number of UARTs to be registered. 1204 nr_uarts= [SERIAL] maximum number of UARTs to be registered.
1200 1205
1201 opl3= [HW,OSS] 1206 opl3= [HW,OSS]
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 8cfca173d4bc..df3ff2095f9d 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -32,6 +32,7 @@ Currently, these files are in /proc/sys/vm:
32- min_slab_ratio 32- min_slab_ratio
33- panic_on_oom 33- panic_on_oom
34- mmap_min_address 34- mmap_min_address
35- numa_zonelist_order
35 36
36============================================================== 37==============================================================
37 38
@@ -231,3 +232,47 @@ security module. Setting this value to something like 64k will allow the
231vast majority of applications to work correctly and provide defense in depth 232vast majority of applications to work correctly and provide defense in depth
232against future potential kernel bugs. 233against future potential kernel bugs.
233 234
235==============================================================
236
237numa_zonelist_order
238
239This sysctl is only for NUMA.
240'where the memory is allocated from' is controlled by zonelists.
241(This documentation ignores ZONE_HIGHMEM/ZONE_DMA32 to keep the explanation
242 simple; where applicable, read ZONE_DMA as ZONE_DMA32.)
243
244In the non-NUMA case, a zonelist for GFP_KERNEL is ordered as follows.
245ZONE_NORMAL -> ZONE_DMA
246This means that a memory allocation request for GFP_KERNEL will
247get memory from ZONE_DMA only when ZONE_NORMAL is not available.
248
249In the NUMA case, you can think of the following 2 types of order.
250Assume a 2-node NUMA system; below is the zonelist of Node(0)'s GFP_KERNEL.
251
252(A) Node(0) ZONE_NORMAL -> Node(0) ZONE_DMA -> Node(1) ZONE_NORMAL
253(B) Node(0) ZONE_NORMAL -> Node(1) ZONE_NORMAL -> Node(0) ZONE_DMA.
254
255Type(A) offers the best locality for processes on Node(0), but ZONE_DMA
256will be used before Node(1)'s ZONE_NORMAL is exhausted. This increases the
257possibility of out-of-memory (OOM) in ZONE_DMA because ZONE_DMA tends to be small.
258
259Type(B) cannot offer the best locality but is more robust against OOM of
260the DMA zone.
261
262Type(A) is called "Node" order. Type(B) is called "Zone" order.
263
264"Node order" orders the zonelists by node, then by zone within each node.
265Specify "[Nn]ode" for node order.
266
267"Zone Order" orders the zonelists by zone type, then by node within each
268zone. Specify "[Zz]one" for zone order.
269
270Specify "[Dd]efault" to request automatic configuration. Autoconfiguration
271will select "node" order in any of the following cases:
272(1) if the DMA zone does not exist or
273(2) if the DMA zone comprises greater than 50% of the available memory or
274(3) if any node's DMA zone comprises greater than 70% of its local memory and
275 the amount of local memory is big enough.
276
277Otherwise, "zone" order will be selected. Default order is recommended unless
278this is causing problems for your system/application.
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index d09b1345a3a1..04b1636a970b 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -566,6 +566,11 @@ int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int,
566int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, 566int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int,
567 struct file *, void __user *, size_t *, loff_t *); 567 struct file *, void __user *, size_t *, loff_t *);
568 568
569extern int numa_zonelist_order_handler(struct ctl_table *, int,
570 struct file *, void __user *, size_t *, loff_t *);
571extern char numa_zonelist_order[];
572#define NUMA_ZONELIST_ORDER_LEN 16 /* string buffer size */
573
569#include <linux/topology.h> 574#include <linux/topology.h>
570/* Returns the number of the current Node. */ 575/* Returns the number of the current Node. */
571#ifndef numa_node_id 576#ifndef numa_node_id
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d93e13d93f24..ccaebbbd75ae 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -958,6 +958,17 @@ static ctl_table vm_table[] = {
958 .mode = 0644, 958 .mode = 0644,
959 .proc_handler = &proc_doulongvec_minmax, 959 .proc_handler = &proc_doulongvec_minmax,
960 }, 960 },
961#ifdef CONFIG_NUMA
962 {
963 .ctl_name = CTL_UNNUMBERED,
964 .procname = "numa_zonelist_order",
965 .data = &numa_zonelist_order,
966 .maxlen = NUMA_ZONELIST_ORDER_LEN,
967 .mode = 0644,
968 .proc_handler = &numa_zonelist_order_handler,
969 .strategy = &sysctl_string,
970 },
971#endif
961#endif 972#endif
962#if defined(CONFIG_X86_32) || \ 973#if defined(CONFIG_X86_32) || \
963 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL)) 974 (defined(CONFIG_SUPERH) && defined(CONFIG_VSYSCALL))
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05ace44852eb..092b2d8f2f0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1621,8 +1621,8 @@ void show_free_areas(void)
1621 * 1621 *
1622 * Add all populated zones of a node to the zonelist. 1622 * Add all populated zones of a node to the zonelist.
1623 */ 1623 */
1624static int __meminit build_zonelists_node(pg_data_t *pgdat, 1624static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1625 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1625 int nr_zones, enum zone_type zone_type)
1626{ 1626{
1627 struct zone *zone; 1627 struct zone *zone;
1628 1628
@@ -1641,9 +1641,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
1641 return nr_zones; 1641 return nr_zones;
1642} 1642}
1643 1643
1644
1645/*
1646 * zonelist_order:
1647 * 0 = automatic detection of better ordering.
1648 * 1 = order by ([node] distance, -zonetype)
1649 * 2 = order by (-zonetype, [node] distance)
1650 *
1651 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
1652 * the same zonelist. So only NUMA can configure this param.
1653 */
1654#define ZONELIST_ORDER_DEFAULT 0
1655#define ZONELIST_ORDER_NODE 1
1656#define ZONELIST_ORDER_ZONE 2
1657
1658/* zonelist order in the kernel.
1659 * set_zonelist_order() will set this to NODE or ZONE.
1660 */
1661static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
1662static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1663
1664
1644#ifdef CONFIG_NUMA 1665#ifdef CONFIG_NUMA
1666/* The order the user requested; set via boot parameter or sysctl */
1667static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1668/* string for sysctl */
1669#define NUMA_ZONELIST_ORDER_LEN 16
1670char numa_zonelist_order[16] = "default";
1671
1672/*
1673 * interface for configuring zonelist ordering.
1674 * command line option "numa_zonelist_order"
1675 * = "[dD]efault" - default, automatic configuration.
1676 * = "[nN]ode" - order by node locality, then by zone within node
1677 * = "[zZ]one" - order by zone, then by locality within zone
1678 */
1679
1680static int __parse_numa_zonelist_order(char *s)
1681{
1682 if (*s == 'd' || *s == 'D') {
1683 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1684 } else if (*s == 'n' || *s == 'N') {
1685 user_zonelist_order = ZONELIST_ORDER_NODE;
1686 } else if (*s == 'z' || *s == 'Z') {
1687 user_zonelist_order = ZONELIST_ORDER_ZONE;
1688 } else {
1689 printk(KERN_WARNING
1690 "Ignoring invalid numa_zonelist_order value: "
1691 "%s\n", s);
1692 return -EINVAL;
1693 }
1694 return 0;
1695}
1696
1697static __init int setup_numa_zonelist_order(char *s)
1698{
1699 if (s)
1700 return __parse_numa_zonelist_order(s);
1701 return 0;
1702}
1703early_param("numa_zonelist_order", setup_numa_zonelist_order);
1704
1705/*
1706 * sysctl handler for numa_zonelist_order
1707 */
1708int numa_zonelist_order_handler(ctl_table *table, int write,
1709 struct file *file, void __user *buffer, size_t *length,
1710 loff_t *ppos)
1711{
1712 char saved_string[NUMA_ZONELIST_ORDER_LEN];
1713 int ret;
1714
1715 if (write)
1716 strncpy(saved_string, (char*)table->data,
1717 NUMA_ZONELIST_ORDER_LEN);
1718 ret = proc_dostring(table, write, file, buffer, length, ppos);
1719 if (ret)
1720 return ret;
1721 if (write) {
1722 int oldval = user_zonelist_order;
1723 if (__parse_numa_zonelist_order((char*)table->data)) {
1724 /*
1725 * bogus value. restore saved string
1726 */
1727 strncpy((char*)table->data, saved_string,
1728 NUMA_ZONELIST_ORDER_LEN);
1729 user_zonelist_order = oldval;
1730 } else if (oldval != user_zonelist_order)
1731 build_all_zonelists();
1732 }
1733 return 0;
1734}
1735
1736
1645#define MAX_NODE_LOAD (num_online_nodes()) 1737#define MAX_NODE_LOAD (num_online_nodes())
1646static int __meminitdata node_load[MAX_NUMNODES]; 1738static int node_load[MAX_NUMNODES];
1739
1647/** 1740/**
1648 * find_next_best_node - find the next node that should appear in a given node's fallback list 1741 * find_next_best_node - find the next node that should appear in a given node's fallback list
1649 * @node: node whose fallback list we're appending 1742 * @node: node whose fallback list we're appending
@@ -1658,7 +1751,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
1658 * on them otherwise. 1751 * on them otherwise.
1659 * It returns -1 if no node is found. 1752 * It returns -1 if no node is found.
1660 */ 1753 */
1661static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1754static int find_next_best_node(int node, nodemask_t *used_node_mask)
1662{ 1755{
1663 int n, val; 1756 int n, val;
1664 int min_val = INT_MAX; 1757 int min_val = INT_MAX;
@@ -1704,13 +1797,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1704 return best_node; 1797 return best_node;
1705} 1798}
1706 1799
1707static void __meminit build_zonelists(pg_data_t *pgdat) 1800
1801/*
1802 * Build zonelists ordered by node and zones within node.
1803 * This results in maximum locality--normal zone overflows into local
1804 * DMA zone, if any--but risks exhausting DMA zone.
1805 */
1806static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1708{ 1807{
1709 int j, node, local_node;
1710 enum zone_type i; 1808 enum zone_type i;
1711 int prev_node, load; 1809 int j;
1712 struct zonelist *zonelist; 1810 struct zonelist *zonelist;
1811
1812 for (i = 0; i < MAX_NR_ZONES; i++) {
1813 zonelist = pgdat->node_zonelists + i;
1814 for (j = 0; zonelist->zones[j] != NULL; j++)
1815 ;
1816 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1817 zonelist->zones[j] = NULL;
1818 }
1819}
1820
1821/*
1822 * Build zonelists ordered by zone and nodes within zones.
1823 * This results in conserving DMA zone[s] until all Normal memory is
1824 * exhausted, but results in overflowing to remote node while memory
1825 * may still exist in local DMA zone.
1826 */
1827static int node_order[MAX_NUMNODES];
1828
1829static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
1830{
1831 enum zone_type i;
1832 int pos, j, node;
1833 int zone_type; /* needs to be signed */
1834 struct zone *z;
1835 struct zonelist *zonelist;
1836
1837 for (i = 0; i < MAX_NR_ZONES; i++) {
1838 zonelist = pgdat->node_zonelists + i;
1839 pos = 0;
1840 for (zone_type = i; zone_type >= 0; zone_type--) {
1841 for (j = 0; j < nr_nodes; j++) {
1842 node = node_order[j];
1843 z = &NODE_DATA(node)->node_zones[zone_type];
1844 if (populated_zone(z)) {
1845 zonelist->zones[pos++] = z;
1846 check_highest_zone(zone_type);
1847 }
1848 }
1849 }
1850 zonelist->zones[pos] = NULL;
1851 }
1852}
1853
1854static int default_zonelist_order(void)
1855{
1856 int nid, zone_type;
1857 unsigned long low_kmem_size,total_size;
1858 struct zone *z;
1859 int average_size;
1860 /*
1861 * ZONE_DMA and ZONE_DMA32 can be very small areas in the system.
1862 * If they are really small and used heavily, the system can fall
1863 * into OOM very easily.
1864 * This function detects ZONE_DMA/DMA32 size and configures zone order.
1865 */
1866 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
1867 low_kmem_size = 0;
1868 total_size = 0;
1869 for_each_online_node(nid) {
1870 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
1871 z = &NODE_DATA(nid)->node_zones[zone_type];
1872 if (populated_zone(z)) {
1873 if (zone_type < ZONE_NORMAL)
1874 low_kmem_size += z->present_pages;
1875 total_size += z->present_pages;
1876 }
1877 }
1878 }
1879 if (!low_kmem_size || /* there are no DMA area. */
1880 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
1881 return ZONELIST_ORDER_NODE;
1882 /*
1883 * Look into each node's config.
1884 * If there is a node whose DMA/DMA32 memory makes up a very large part
1885 * of its local memory, NODE_ORDER may be suitable.
1886 */
1887 average_size = total_size / (num_online_nodes() + 1);
1888 for_each_online_node(nid) {
1889 low_kmem_size = 0;
1890 total_size = 0;
1891 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
1892 z = &NODE_DATA(nid)->node_zones[zone_type];
1893 if (populated_zone(z)) {
1894 if (zone_type < ZONE_NORMAL)
1895 low_kmem_size += z->present_pages;
1896 total_size += z->present_pages;
1897 }
1898 }
1899 if (low_kmem_size &&
1900 total_size > average_size && /* ignore small node */
1901 low_kmem_size > total_size * 70/100)
1902 return ZONELIST_ORDER_NODE;
1903 }
1904 return ZONELIST_ORDER_ZONE;
1905}
1906
1907static void set_zonelist_order(void)
1908{
1909 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
1910 current_zonelist_order = default_zonelist_order();
1911 else
1912 current_zonelist_order = user_zonelist_order;
1913}
1914
1915static void build_zonelists(pg_data_t *pgdat)
1916{
1917 int j, node, load;
1918 enum zone_type i;
1713 nodemask_t used_mask; 1919 nodemask_t used_mask;
1920 int local_node, prev_node;
1921 struct zonelist *zonelist;
1922 int order = current_zonelist_order;
1714 1923
1715 /* initialize zonelists */ 1924 /* initialize zonelists */
1716 for (i = 0; i < MAX_NR_ZONES; i++) { 1925 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1932,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1723 load = num_online_nodes(); 1932 load = num_online_nodes();
1724 prev_node = local_node; 1933 prev_node = local_node;
1725 nodes_clear(used_mask); 1934 nodes_clear(used_mask);
1935
1936 memset(node_load, 0, sizeof(node_load));
1937 memset(node_order, 0, sizeof(node_order));
1938 j = 0;
1939
1726 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1940 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1727 int distance = node_distance(local_node, node); 1941 int distance = node_distance(local_node, node);
1728 1942
@@ -1738,23 +1952,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1738 * So adding penalty to the first node in same 1952 * So adding penalty to the first node in same
1739 * distance group to make it round-robin. 1953 * distance group to make it round-robin.
1740 */ 1954 */
1741
1742 if (distance != node_distance(local_node, prev_node)) 1955 if (distance != node_distance(local_node, prev_node))
1743 node_load[node] += load; 1956 node_load[node] = load;
1957
1744 prev_node = node; 1958 prev_node = node;
1745 load--; 1959 load--;
1746 for (i = 0; i < MAX_NR_ZONES; i++) { 1960 if (order == ZONELIST_ORDER_NODE)
1747 zonelist = pgdat->node_zonelists + i; 1961 build_zonelists_in_node_order(pgdat, node);
1748 for (j = 0; zonelist->zones[j] != NULL; j++); 1962 else
1963 node_order[j++] = node; /* remember order */
1964 }
1749 1965
1750 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1966 if (order == ZONELIST_ORDER_ZONE) {
1751 zonelist->zones[j] = NULL; 1967 /* calculate node order -- i.e., DMA last! */
1752 } 1968 build_zonelists_in_zone_order(pgdat, j);
1753 } 1969 }
1754} 1970}
1755 1971
1756/* Construct the zonelist performance cache - see further mmzone.h */ 1972/* Construct the zonelist performance cache - see further mmzone.h */
1757static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1973static void build_zonelist_cache(pg_data_t *pgdat)
1758{ 1974{
1759 int i; 1975 int i;
1760 1976
@@ -1771,9 +1987,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1771 } 1987 }
1772} 1988}
1773 1989
1990
1774#else /* CONFIG_NUMA */ 1991#else /* CONFIG_NUMA */
1775 1992
1776static void __meminit build_zonelists(pg_data_t *pgdat) 1993static void set_zonelist_order(void)
1994{
1995 current_zonelist_order = ZONELIST_ORDER_ZONE;
1996}
1997
1998static void build_zonelists(pg_data_t *pgdat)
1777{ 1999{
1778 int node, local_node; 2000 int node, local_node;
1779 enum zone_type i,j; 2001 enum zone_type i,j;
@@ -1809,7 +2031,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1809} 2031}
1810 2032
1811/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2033/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1812static void __meminit build_zonelist_cache(pg_data_t *pgdat) 2034static void build_zonelist_cache(pg_data_t *pgdat)
1813{ 2035{
1814 int i; 2036 int i;
1815 2037
@@ -1820,7 +2042,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1820#endif /* CONFIG_NUMA */ 2042#endif /* CONFIG_NUMA */
1821 2043
1822/* return values int ....just for stop_machine_run() */ 2044/* return values int ....just for stop_machine_run() */
1823static int __meminit __build_all_zonelists(void *dummy) 2045static int __build_all_zonelists(void *dummy)
1824{ 2046{
1825 int nid; 2047 int nid;
1826 2048
@@ -1831,8 +2053,10 @@ static int __meminit __build_all_zonelists(void *dummy)
1831 return 0; 2053 return 0;
1832} 2054}
1833 2055
1834void __meminit build_all_zonelists(void) 2056void build_all_zonelists(void)
1835{ 2057{
2058 set_zonelist_order();
2059
1836 if (system_state == SYSTEM_BOOTING) { 2060 if (system_state == SYSTEM_BOOTING) {
1837 __build_all_zonelists(NULL); 2061 __build_all_zonelists(NULL);
1838 cpuset_init_current_mems_allowed(); 2062 cpuset_init_current_mems_allowed();
@@ -1843,8 +2067,13 @@ void __meminit build_all_zonelists(void)
1843 /* cpuset refresh routine should be here */ 2067 /* cpuset refresh routine should be here */
1844 } 2068 }
1845 vm_total_pages = nr_free_pagecache_pages(); 2069 vm_total_pages = nr_free_pagecache_pages();
1846 printk("Built %i zonelists. Total pages: %ld\n", 2070 printk("Built %i zonelists in %s order. Total pages: %ld\n",
1847 num_online_nodes(), vm_total_pages); 2071 num_online_nodes(),
2072 zonelist_order_name[current_zonelist_order],
2073 vm_total_pages);
2074#ifdef CONFIG_NUMA
2075 printk("Policy zone: %s\n", zone_names[policy_zone]);
2076#endif
1848} 2077}
1849 2078
1850/* 2079/*