aboutsummaryrefslogtreecommitdiffstats
path: root/mm/page_alloc.c
diff options
context:
space:
mode:
authorKAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>2007-07-16 02:38:01 -0400
committerLinus Torvalds <torvalds@woody.linux-foundation.org>2007-07-16 12:05:35 -0400
commitf0c0b2b808f232741eadac272bd4bc51f18df0f4 (patch)
treec2568efdc496cc165a4e72d8aa2542b22035e342 /mm/page_alloc.c
parent18a8bd949d6adb311ea816125ff65050df1f3f6e (diff)
change zonelist order: zonelist order selection logic
Make zonelist creation policy selectable from sysctl/boot option v6. This patch makes NUMA's zonelist (of pgdat) order selectable. Available orders are Default(automatic) / Node-based / Zone-based. [Default Order] The kernel selects Node-based or Zone-based order automatically. [Node-based Order] This policy treats the locality of memory as the most important parameter. Zonelist order is created by each zone's locality. This means lower zones (ex. ZONE_DMA) can be used before higher zone (ex. ZONE_NORMAL) exhaustion. IOW. ZONE_DMA will be in the middle of the zonelist. The current 2.6.21 kernel uses this. Pros. * A user can expect local memory as much as possible. Cons. * lower zone will be exhausted before higher zone. This may cause OOM_KILL. Maybe suitable if ZONE_DMA is relatively big and you never see OOM_KILL because of ZONE_DMA exhaustion and you need the best locality. (example) assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL. *node(0)'s memory allocation order: node(0)'s NORMAL -> node(0)'s DMA -> node(1)'s NORMAL. *node(1)'s memory allocation order: node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA. [Zone-based order] This policy treats the zone type as the most important parameter. Zonelist order is created by zone-type order. This means lower zones will never be used before higher zone exhaustion. IOW. ZONE_DMA will always be at the tail of the zonelist. Pros. * OOM_KILL (because of lower zone) occurs only if the whole zones are exhausted. Cons. * memory locality may not be best. (example) assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL. *node(0)'s memory allocation order: node(0)'s NORMAL -> node(1)'s NORMAL -> node(0)'s DMA. *node(1)'s memory allocation order: node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA. The boot option "numa_zonelist_order=" and proc/sysctl are supported. command: %echo N > /proc/sys/vm/numa_zonelist_order Will rebuild zonelist in Node-based order. 
command: %echo Z > /proc/sys/vm/numa_zonelist_order Will rebuild zonelist in Zone-based order. Thanks to Lee Schermerhorn, who gave me much help and code. [Lee.Schermerhorn@hp.com: add check_highest_zone to build_zonelists_in_zone_order] [akpm@linux-foundation.org: build fix] Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Christoph Lameter <clameter@sgi.com> Cc: Andi Kleen <ak@suse.de> Cc: "jesse.barnes@intel.com" <jesse.barnes@intel.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/page_alloc.c')
-rw-r--r--mm/page_alloc.c273
1 files changed, 251 insertions, 22 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 05ace44852eb..092b2d8f2f0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1621,8 +1621,8 @@ void show_free_areas(void)
1621 * 1621 *
1622 * Add all populated zones of a node to the zonelist. 1622 * Add all populated zones of a node to the zonelist.
1623 */ 1623 */
1624static int __meminit build_zonelists_node(pg_data_t *pgdat, 1624static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
1625 struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) 1625 int nr_zones, enum zone_type zone_type)
1626{ 1626{
1627 struct zone *zone; 1627 struct zone *zone;
1628 1628
@@ -1641,9 +1641,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat,
1641 return nr_zones; 1641 return nr_zones;
1642} 1642}
1643 1643
1644
1645/*
1646 * zonelist_order:
1647 * 0 = automatic detection of better ordering.
1648 * 1 = order by ([node] distance, -zonetype)
1649 * 2 = order by (-zonetype, [node] distance)
1650 *
1651 * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
1652 * the same zonelist. So only NUMA can configure this param.
1653 */
1654#define ZONELIST_ORDER_DEFAULT 0
1655#define ZONELIST_ORDER_NODE 1
1656#define ZONELIST_ORDER_ZONE 2
1657
1658/* zonelist order in the kernel.
1659 * set_zonelist_order() will set this to NODE or ZONE.
1660 */
1661static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
1662static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
1663
1664
1644#ifdef CONFIG_NUMA 1665#ifdef CONFIG_NUMA
1666/* The value user specified ....changed by config */
1667static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1668/* string for sysctl */
1669#define NUMA_ZONELIST_ORDER_LEN 16
1670char numa_zonelist_order[16] = "default";
1671
1672/*
1673 * interface for configure zonelist ordering.
1674 * command line option "numa_zonelist_order"
1675 * = "[dD]efault - default, automatic configuration.
1676 * = "[nN]ode - order by node locality, then by zone within node
1677 * = "[zZ]one - order by zone, then by locality within zone
1678 */
1679
1680static int __parse_numa_zonelist_order(char *s)
1681{
1682 if (*s == 'd' || *s == 'D') {
1683 user_zonelist_order = ZONELIST_ORDER_DEFAULT;
1684 } else if (*s == 'n' || *s == 'N') {
1685 user_zonelist_order = ZONELIST_ORDER_NODE;
1686 } else if (*s == 'z' || *s == 'Z') {
1687 user_zonelist_order = ZONELIST_ORDER_ZONE;
1688 } else {
1689 printk(KERN_WARNING
1690 "Ignoring invalid numa_zonelist_order value: "
1691 "%s\n", s);
1692 return -EINVAL;
1693 }
1694 return 0;
1695}
1696
1697static __init int setup_numa_zonelist_order(char *s)
1698{
1699 if (s)
1700 return __parse_numa_zonelist_order(s);
1701 return 0;
1702}
1703early_param("numa_zonelist_order", setup_numa_zonelist_order);
1704
1705/*
1706 * sysctl handler for numa_zonelist_order
1707 */
1708int numa_zonelist_order_handler(ctl_table *table, int write,
1709 struct file *file, void __user *buffer, size_t *length,
1710 loff_t *ppos)
1711{
1712 char saved_string[NUMA_ZONELIST_ORDER_LEN];
1713 int ret;
1714
1715 if (write)
1716 strncpy(saved_string, (char*)table->data,
1717 NUMA_ZONELIST_ORDER_LEN);
1718 ret = proc_dostring(table, write, file, buffer, length, ppos);
1719 if (ret)
1720 return ret;
1721 if (write) {
1722 int oldval = user_zonelist_order;
1723 if (__parse_numa_zonelist_order((char*)table->data)) {
1724 /*
1725 * bogus value. restore saved string
1726 */
1727 strncpy((char*)table->data, saved_string,
1728 NUMA_ZONELIST_ORDER_LEN);
1729 user_zonelist_order = oldval;
1730 } else if (oldval != user_zonelist_order)
1731 build_all_zonelists();
1732 }
1733 return 0;
1734}
1735
1736
1645#define MAX_NODE_LOAD (num_online_nodes()) 1737#define MAX_NODE_LOAD (num_online_nodes())
1646static int __meminitdata node_load[MAX_NUMNODES]; 1738static int node_load[MAX_NUMNODES];
1739
1647/** 1740/**
1648 * find_next_best_node - find the next node that should appear in a given node's fallback list 1741 * find_next_best_node - find the next node that should appear in a given node's fallback list
1649 * @node: node whose fallback list we're appending 1742 * @node: node whose fallback list we're appending
@@ -1658,7 +1751,7 @@ static int __meminitdata node_load[MAX_NUMNODES];
1658 * on them otherwise. 1751 * on them otherwise.
1659 * It returns -1 if no node is found. 1752 * It returns -1 if no node is found.
1660 */ 1753 */
1661static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) 1754static int find_next_best_node(int node, nodemask_t *used_node_mask)
1662{ 1755{
1663 int n, val; 1756 int n, val;
1664 int min_val = INT_MAX; 1757 int min_val = INT_MAX;
@@ -1704,13 +1797,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask)
1704 return best_node; 1797 return best_node;
1705} 1798}
1706 1799
1707static void __meminit build_zonelists(pg_data_t *pgdat) 1800
1801/*
1802 * Build zonelists ordered by node and zones within node.
1803 * This results in maximum locality--normal zone overflows into local
1804 * DMA zone, if any--but risks exhausting DMA zone.
1805 */
1806static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
1708{ 1807{
1709 int j, node, local_node;
1710 enum zone_type i; 1808 enum zone_type i;
1711 int prev_node, load; 1809 int j;
1712 struct zonelist *zonelist; 1810 struct zonelist *zonelist;
1811
1812 for (i = 0; i < MAX_NR_ZONES; i++) {
1813 zonelist = pgdat->node_zonelists + i;
1814 for (j = 0; zonelist->zones[j] != NULL; j++)
1815 ;
1816 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i);
1817 zonelist->zones[j] = NULL;
1818 }
1819}
1820
1821/*
1822 * Build zonelists ordered by zone and nodes within zones.
1823 * This results in conserving DMA zone[s] until all Normal memory is
1824 * exhausted, but results in overflowing to remote node while memory
1825 * may still exist in local DMA zone.
1826 */
1827static int node_order[MAX_NUMNODES];
1828
1829static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
1830{
1831 enum zone_type i;
1832 int pos, j, node;
1833 int zone_type; /* needs to be signed */
1834 struct zone *z;
1835 struct zonelist *zonelist;
1836
1837 for (i = 0; i < MAX_NR_ZONES; i++) {
1838 zonelist = pgdat->node_zonelists + i;
1839 pos = 0;
1840 for (zone_type = i; zone_type >= 0; zone_type--) {
1841 for (j = 0; j < nr_nodes; j++) {
1842 node = node_order[j];
1843 z = &NODE_DATA(node)->node_zones[zone_type];
1844 if (populated_zone(z)) {
1845 zonelist->zones[pos++] = z;
1846 check_highest_zone(zone_type);
1847 }
1848 }
1849 }
1850 zonelist->zones[pos] = NULL;
1851 }
1852}
1853
1854static int default_zonelist_order(void)
1855{
1856 int nid, zone_type;
1857 unsigned long low_kmem_size,total_size;
1858 struct zone *z;
1859 int average_size;
1860 /*
1861 * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem.
1862 * If they are really small and used heavily, the system can fall
1863 * into OOM very easily.
1864 * This function detect ZONE_DMA/DMA32 size and confgigures zone order.
1865 */
1866 /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */
1867 low_kmem_size = 0;
1868 total_size = 0;
1869 for_each_online_node(nid) {
1870 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
1871 z = &NODE_DATA(nid)->node_zones[zone_type];
1872 if (populated_zone(z)) {
1873 if (zone_type < ZONE_NORMAL)
1874 low_kmem_size += z->present_pages;
1875 total_size += z->present_pages;
1876 }
1877 }
1878 }
1879 if (!low_kmem_size || /* there are no DMA area. */
1880 low_kmem_size > total_size/2) /* DMA/DMA32 is big. */
1881 return ZONELIST_ORDER_NODE;
1882 /*
1883 * look into each node's config.
1884 * If there is a node whose DMA/DMA32 memory is very big area on
1885 * local memory, NODE_ORDER may be suitable.
1886 */
1887 average_size = total_size / (num_online_nodes() + 1);
1888 for_each_online_node(nid) {
1889 low_kmem_size = 0;
1890 total_size = 0;
1891 for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) {
1892 z = &NODE_DATA(nid)->node_zones[zone_type];
1893 if (populated_zone(z)) {
1894 if (zone_type < ZONE_NORMAL)
1895 low_kmem_size += z->present_pages;
1896 total_size += z->present_pages;
1897 }
1898 }
1899 if (low_kmem_size &&
1900 total_size > average_size && /* ignore small node */
1901 low_kmem_size > total_size * 70/100)
1902 return ZONELIST_ORDER_NODE;
1903 }
1904 return ZONELIST_ORDER_ZONE;
1905}
1906
1907static void set_zonelist_order(void)
1908{
1909 if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
1910 current_zonelist_order = default_zonelist_order();
1911 else
1912 current_zonelist_order = user_zonelist_order;
1913}
1914
1915static void build_zonelists(pg_data_t *pgdat)
1916{
1917 int j, node, load;
1918 enum zone_type i;
1713 nodemask_t used_mask; 1919 nodemask_t used_mask;
1920 int local_node, prev_node;
1921 struct zonelist *zonelist;
1922 int order = current_zonelist_order;
1714 1923
1715 /* initialize zonelists */ 1924 /* initialize zonelists */
1716 for (i = 0; i < MAX_NR_ZONES; i++) { 1925 for (i = 0; i < MAX_NR_ZONES; i++) {
@@ -1723,6 +1932,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1723 load = num_online_nodes(); 1932 load = num_online_nodes();
1724 prev_node = local_node; 1933 prev_node = local_node;
1725 nodes_clear(used_mask); 1934 nodes_clear(used_mask);
1935
1936 memset(node_load, 0, sizeof(node_load));
1937 memset(node_order, 0, sizeof(node_order));
1938 j = 0;
1939
1726 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { 1940 while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
1727 int distance = node_distance(local_node, node); 1941 int distance = node_distance(local_node, node);
1728 1942
@@ -1738,23 +1952,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1738 * So adding penalty to the first node in same 1952 * So adding penalty to the first node in same
1739 * distance group to make it round-robin. 1953 * distance group to make it round-robin.
1740 */ 1954 */
1741
1742 if (distance != node_distance(local_node, prev_node)) 1955 if (distance != node_distance(local_node, prev_node))
1743 node_load[node] += load; 1956 node_load[node] = load;
1957
1744 prev_node = node; 1958 prev_node = node;
1745 load--; 1959 load--;
1746 for (i = 0; i < MAX_NR_ZONES; i++) { 1960 if (order == ZONELIST_ORDER_NODE)
1747 zonelist = pgdat->node_zonelists + i; 1961 build_zonelists_in_node_order(pgdat, node);
1748 for (j = 0; zonelist->zones[j] != NULL; j++); 1962 else
1963 node_order[j++] = node; /* remember order */
1964 }
1749 1965
1750 j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); 1966 if (order == ZONELIST_ORDER_ZONE) {
1751 zonelist->zones[j] = NULL; 1967 /* calculate node order -- i.e., DMA last! */
1752 } 1968 build_zonelists_in_zone_order(pgdat, j);
1753 } 1969 }
1754} 1970}
1755 1971
1756/* Construct the zonelist performance cache - see further mmzone.h */ 1972/* Construct the zonelist performance cache - see further mmzone.h */
1757static void __meminit build_zonelist_cache(pg_data_t *pgdat) 1973static void build_zonelist_cache(pg_data_t *pgdat)
1758{ 1974{
1759 int i; 1975 int i;
1760 1976
@@ -1771,9 +1987,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1771 } 1987 }
1772} 1988}
1773 1989
1990
1774#else /* CONFIG_NUMA */ 1991#else /* CONFIG_NUMA */
1775 1992
1776static void __meminit build_zonelists(pg_data_t *pgdat) 1993static void set_zonelist_order(void)
1994{
1995 current_zonelist_order = ZONELIST_ORDER_ZONE;
1996}
1997
1998static void build_zonelists(pg_data_t *pgdat)
1777{ 1999{
1778 int node, local_node; 2000 int node, local_node;
1779 enum zone_type i,j; 2001 enum zone_type i,j;
@@ -1809,7 +2031,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
1809} 2031}
1810 2032
1811/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ 2033/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
1812static void __meminit build_zonelist_cache(pg_data_t *pgdat) 2034static void build_zonelist_cache(pg_data_t *pgdat)
1813{ 2035{
1814 int i; 2036 int i;
1815 2037
@@ -1820,7 +2042,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat)
1820#endif /* CONFIG_NUMA */ 2042#endif /* CONFIG_NUMA */
1821 2043
1822/* return values int ....just for stop_machine_run() */ 2044/* return values int ....just for stop_machine_run() */
1823static int __meminit __build_all_zonelists(void *dummy) 2045static int __build_all_zonelists(void *dummy)
1824{ 2046{
1825 int nid; 2047 int nid;
1826 2048
@@ -1831,8 +2053,10 @@ static int __meminit __build_all_zonelists(void *dummy)
1831 return 0; 2053 return 0;
1832} 2054}
1833 2055
1834void __meminit build_all_zonelists(void) 2056void build_all_zonelists(void)
1835{ 2057{
2058 set_zonelist_order();
2059
1836 if (system_state == SYSTEM_BOOTING) { 2060 if (system_state == SYSTEM_BOOTING) {
1837 __build_all_zonelists(NULL); 2061 __build_all_zonelists(NULL);
1838 cpuset_init_current_mems_allowed(); 2062 cpuset_init_current_mems_allowed();
@@ -1843,8 +2067,13 @@ void __meminit build_all_zonelists(void)
1843 /* cpuset refresh routine should be here */ 2067 /* cpuset refresh routine should be here */
1844 } 2068 }
1845 vm_total_pages = nr_free_pagecache_pages(); 2069 vm_total_pages = nr_free_pagecache_pages();
1846 printk("Built %i zonelists. Total pages: %ld\n", 2070 printk("Built %i zonelists in %s order. Total pages: %ld\n",
1847 num_online_nodes(), vm_total_pages); 2071 num_online_nodes(),
2072 zonelist_order_name[current_zonelist_order],
2073 vm_total_pages);
2074#ifdef CONFIG_NUMA
2075 printk("Policy zone: %s\n", zone_names[policy_zone]);
2076#endif
1848} 2077}
1849 2078
1850/* 2079/*