diff options
author | KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> | 2007-07-16 02:38:01 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@woody.linux-foundation.org> | 2007-07-16 12:05:35 -0400 |
commit | f0c0b2b808f232741eadac272bd4bc51f18df0f4 (patch) | |
tree | c2568efdc496cc165a4e72d8aa2542b22035e342 /mm | |
parent | 18a8bd949d6adb311ea816125ff65050df1f3f6e (diff) |
change zonelist order: zonelist order selection logic
Make zonelist creation policy selectable from sysctl/boot option v6.
This patch makes NUMA's zonelist (of pgdat) order selectable.
Available orders are Default (automatic) / Node-based / Zone-based.
[Default Order]
The kernel selects Node-based or Zone-based order automatically.
[Node-based Order]
This policy treats the locality of memory as the most important parameter.
Zonelist order is created by each zone's locality. This means lower zones
(ex. ZONE_DMA) can be used before higher zone (ex. ZONE_NORMAL) exhaustion.
IOW. ZONE_DMA will be in the middle of zonelist.
current 2.6.21 kernel uses this.
Pros.
* A user can expect local memory as much as possible.
Cons.
* lower zone will be exhausted before higher zone. This may cause OOM_KILL.
Maybe suitable if ZONE_DMA is relatively big and you never see OOM_KILL
because of ZONE_DMA exhaustion and you need the best locality.
(example)
assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL.
*node(0)'s memory allocation order:
node(0)'s NORMAL -> node(0)'s DMA -> node(1)'s NORMAL.
*node(1)'s memory allocation order:
node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA.
[Zone-based order]
This policy treats the zone type as the most important parameter.
Zonelist order is created by zone-type order. This means lower zone
will never be used before higher zone exhaustion.
IOW. ZONE_DMA will be always at the tail of zonelist.
Pros.
* OOM_KILL (because of lower zone) occurs only if all zones are exhausted.
Cons.
* memory locality may not be best.
(example)
assume 2 node NUMA. node(0) has ZONE_DMA/ZONE_NORMAL, node(1) has ZONE_NORMAL.
*node(0)'s memory allocation order:
node(0)'s NORMAL -> node(1)'s NORMAL -> node(0)'s DMA.
*node(1)'s memory allocation order:
node(1)'s NORMAL -> node(0)'s NORMAL -> node(0)'s DMA.
The boot option "numa_zonelist_order=" and proc/sysctl are supported.
command:
%echo N > /proc/sys/vm/numa_zonelist_order
Will rebuild zonelist in Node-based order.
command:
%echo Z > /proc/sys/vm/numa_zonelist_order
Will rebuild zonelist in Zone-based order.
Thanks to Lee Schermerhorn, who gave me much help and code.
[Lee.Schermerhorn@hp.com: add check_highest_zone to build_zonelists_in_zone_order]
[akpm@linux-foundation.org: build fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Christoph Lameter <clameter@sgi.com>
Cc: Andi Kleen <ak@suse.de>
Cc: "jesse.barnes@intel.com" <jesse.barnes@intel.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r-- | mm/page_alloc.c | 273 |
1 files changed, 251 insertions, 22 deletions
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 05ace44852eb..092b2d8f2f0c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -1621,8 +1621,8 @@ void show_free_areas(void) | |||
1621 | * | 1621 | * |
1622 | * Add all populated zones of a node to the zonelist. | 1622 | * Add all populated zones of a node to the zonelist. |
1623 | */ | 1623 | */ |
1624 | static int __meminit build_zonelists_node(pg_data_t *pgdat, | 1624 | static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, |
1625 | struct zonelist *zonelist, int nr_zones, enum zone_type zone_type) | 1625 | int nr_zones, enum zone_type zone_type) |
1626 | { | 1626 | { |
1627 | struct zone *zone; | 1627 | struct zone *zone; |
1628 | 1628 | ||
@@ -1641,9 +1641,102 @@ static int __meminit build_zonelists_node(pg_data_t *pgdat, | |||
1641 | return nr_zones; | 1641 | return nr_zones; |
1642 | } | 1642 | } |
1643 | 1643 | ||
1644 | |||
1645 | /* | ||
1646 | * zonelist_order: | ||
1647 | * 0 = automatic detection of better ordering. | ||
1648 | * 1 = order by ([node] distance, -zonetype) | ||
1649 | * 2 = order by (-zonetype, [node] distance) | ||
1650 | * | ||
1651 | * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create | ||
1652 | * the same zonelist. So only NUMA can configure this param. | ||
1653 | */ | ||
1654 | #define ZONELIST_ORDER_DEFAULT 0 | ||
1655 | #define ZONELIST_ORDER_NODE 1 | ||
1656 | #define ZONELIST_ORDER_ZONE 2 | ||
1657 | |||
1658 | /* zonelist order in the kernel. | ||
1659 | * set_zonelist_order() will set this to NODE or ZONE. | ||
1660 | */ | ||
1661 | static int current_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1662 | static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"}; | ||
1663 | |||
1664 | |||
1644 | #ifdef CONFIG_NUMA | 1665 | #ifdef CONFIG_NUMA |
1666 | /* The value user specified ....changed by config */ | ||
1667 | static int user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1668 | /* string for sysctl */ | ||
1669 | #define NUMA_ZONELIST_ORDER_LEN 16 | ||
1670 | char numa_zonelist_order[16] = "default"; | ||
1671 | |||
1672 | /* | ||
1673 | * interface for configure zonelist ordering. | ||
1674 | * command line option "numa_zonelist_order" | ||
1675 | * = "[dD]efault - default, automatic configuration. | ||
1676 | * = "[nN]ode - order by node locality, then by zone within node | ||
1677 | * = "[zZ]one - order by zone, then by locality within zone | ||
1678 | */ | ||
1679 | |||
1680 | static int __parse_numa_zonelist_order(char *s) | ||
1681 | { | ||
1682 | if (*s == 'd' || *s == 'D') { | ||
1683 | user_zonelist_order = ZONELIST_ORDER_DEFAULT; | ||
1684 | } else if (*s == 'n' || *s == 'N') { | ||
1685 | user_zonelist_order = ZONELIST_ORDER_NODE; | ||
1686 | } else if (*s == 'z' || *s == 'Z') { | ||
1687 | user_zonelist_order = ZONELIST_ORDER_ZONE; | ||
1688 | } else { | ||
1689 | printk(KERN_WARNING | ||
1690 | "Ignoring invalid numa_zonelist_order value: " | ||
1691 | "%s\n", s); | ||
1692 | return -EINVAL; | ||
1693 | } | ||
1694 | return 0; | ||
1695 | } | ||
1696 | |||
1697 | static __init int setup_numa_zonelist_order(char *s) | ||
1698 | { | ||
1699 | if (s) | ||
1700 | return __parse_numa_zonelist_order(s); | ||
1701 | return 0; | ||
1702 | } | ||
1703 | early_param("numa_zonelist_order", setup_numa_zonelist_order); | ||
1704 | |||
1705 | /* | ||
1706 | * sysctl handler for numa_zonelist_order | ||
1707 | */ | ||
1708 | int numa_zonelist_order_handler(ctl_table *table, int write, | ||
1709 | struct file *file, void __user *buffer, size_t *length, | ||
1710 | loff_t *ppos) | ||
1711 | { | ||
1712 | char saved_string[NUMA_ZONELIST_ORDER_LEN]; | ||
1713 | int ret; | ||
1714 | |||
1715 | if (write) | ||
1716 | strncpy(saved_string, (char*)table->data, | ||
1717 | NUMA_ZONELIST_ORDER_LEN); | ||
1718 | ret = proc_dostring(table, write, file, buffer, length, ppos); | ||
1719 | if (ret) | ||
1720 | return ret; | ||
1721 | if (write) { | ||
1722 | int oldval = user_zonelist_order; | ||
1723 | if (__parse_numa_zonelist_order((char*)table->data)) { | ||
1724 | /* | ||
1725 | * bogus value. restore saved string | ||
1726 | */ | ||
1727 | strncpy((char*)table->data, saved_string, | ||
1728 | NUMA_ZONELIST_ORDER_LEN); | ||
1729 | user_zonelist_order = oldval; | ||
1730 | } else if (oldval != user_zonelist_order) | ||
1731 | build_all_zonelists(); | ||
1732 | } | ||
1733 | return 0; | ||
1734 | } | ||
1735 | |||
1736 | |||
1645 | #define MAX_NODE_LOAD (num_online_nodes()) | 1737 | #define MAX_NODE_LOAD (num_online_nodes()) |
1646 | static int __meminitdata node_load[MAX_NUMNODES]; | 1738 | static int node_load[MAX_NUMNODES]; |
1739 | |||
1647 | /** | 1740 | /** |
1648 | * find_next_best_node - find the next node that should appear in a given node's fallback list | 1741 | * find_next_best_node - find the next node that should appear in a given node's fallback list |
1649 | * @node: node whose fallback list we're appending | 1742 | * @node: node whose fallback list we're appending |
@@ -1658,7 +1751,7 @@ static int __meminitdata node_load[MAX_NUMNODES]; | |||
1658 | * on them otherwise. | 1751 | * on them otherwise. |
1659 | * It returns -1 if no node is found. | 1752 | * It returns -1 if no node is found. |
1660 | */ | 1753 | */ |
1661 | static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | 1754 | static int find_next_best_node(int node, nodemask_t *used_node_mask) |
1662 | { | 1755 | { |
1663 | int n, val; | 1756 | int n, val; |
1664 | int min_val = INT_MAX; | 1757 | int min_val = INT_MAX; |
@@ -1704,13 +1797,129 @@ static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) | |||
1704 | return best_node; | 1797 | return best_node; |
1705 | } | 1798 | } |
1706 | 1799 | ||
1707 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1800 | |
1801 | /* | ||
1802 | * Build zonelists ordered by node and zones within node. | ||
1803 | * This results in maximum locality--normal zone overflows into local | ||
1804 | * DMA zone, if any--but risks exhausting DMA zone. | ||
1805 | */ | ||
1806 | static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) | ||
1708 | { | 1807 | { |
1709 | int j, node, local_node; | ||
1710 | enum zone_type i; | 1808 | enum zone_type i; |
1711 | int prev_node, load; | 1809 | int j; |
1712 | struct zonelist *zonelist; | 1810 | struct zonelist *zonelist; |
1811 | |||
1812 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1813 | zonelist = pgdat->node_zonelists + i; | ||
1814 | for (j = 0; zonelist->zones[j] != NULL; j++) | ||
1815 | ; | ||
1816 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | ||
1817 | zonelist->zones[j] = NULL; | ||
1818 | } | ||
1819 | } | ||
1820 | |||
1821 | /* | ||
1822 | * Build zonelists ordered by zone and nodes within zones. | ||
1823 | * This results in conserving DMA zone[s] until all Normal memory is | ||
1824 | * exhausted, but results in overflowing to remote node while memory | ||
1825 | * may still exist in local DMA zone. | ||
1826 | */ | ||
1827 | static int node_order[MAX_NUMNODES]; | ||
1828 | |||
1829 | static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes) | ||
1830 | { | ||
1831 | enum zone_type i; | ||
1832 | int pos, j, node; | ||
1833 | int zone_type; /* needs to be signed */ | ||
1834 | struct zone *z; | ||
1835 | struct zonelist *zonelist; | ||
1836 | |||
1837 | for (i = 0; i < MAX_NR_ZONES; i++) { | ||
1838 | zonelist = pgdat->node_zonelists + i; | ||
1839 | pos = 0; | ||
1840 | for (zone_type = i; zone_type >= 0; zone_type--) { | ||
1841 | for (j = 0; j < nr_nodes; j++) { | ||
1842 | node = node_order[j]; | ||
1843 | z = &NODE_DATA(node)->node_zones[zone_type]; | ||
1844 | if (populated_zone(z)) { | ||
1845 | zonelist->zones[pos++] = z; | ||
1846 | check_highest_zone(zone_type); | ||
1847 | } | ||
1848 | } | ||
1849 | } | ||
1850 | zonelist->zones[pos] = NULL; | ||
1851 | } | ||
1852 | } | ||
1853 | |||
1854 | static int default_zonelist_order(void) | ||
1855 | { | ||
1856 | int nid, zone_type; | ||
1857 | unsigned long low_kmem_size,total_size; | ||
1858 | struct zone *z; | ||
1859 | int average_size; | ||
1860 | /* | ||
1861 | * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. | ||
1862 | * If they are really small and used heavily, the system can fall | ||
1863 | * into OOM very easily. | ||
1864 | * This function detect ZONE_DMA/DMA32 size and confgigures zone order. | ||
1865 | */ | ||
1866 | /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ | ||
1867 | low_kmem_size = 0; | ||
1868 | total_size = 0; | ||
1869 | for_each_online_node(nid) { | ||
1870 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
1871 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
1872 | if (populated_zone(z)) { | ||
1873 | if (zone_type < ZONE_NORMAL) | ||
1874 | low_kmem_size += z->present_pages; | ||
1875 | total_size += z->present_pages; | ||
1876 | } | ||
1877 | } | ||
1878 | } | ||
1879 | if (!low_kmem_size || /* there are no DMA area. */ | ||
1880 | low_kmem_size > total_size/2) /* DMA/DMA32 is big. */ | ||
1881 | return ZONELIST_ORDER_NODE; | ||
1882 | /* | ||
1883 | * look into each node's config. | ||
1884 | * If there is a node whose DMA/DMA32 memory is very big area on | ||
1885 | * local memory, NODE_ORDER may be suitable. | ||
1886 | */ | ||
1887 | average_size = total_size / (num_online_nodes() + 1); | ||
1888 | for_each_online_node(nid) { | ||
1889 | low_kmem_size = 0; | ||
1890 | total_size = 0; | ||
1891 | for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) { | ||
1892 | z = &NODE_DATA(nid)->node_zones[zone_type]; | ||
1893 | if (populated_zone(z)) { | ||
1894 | if (zone_type < ZONE_NORMAL) | ||
1895 | low_kmem_size += z->present_pages; | ||
1896 | total_size += z->present_pages; | ||
1897 | } | ||
1898 | } | ||
1899 | if (low_kmem_size && | ||
1900 | total_size > average_size && /* ignore small node */ | ||
1901 | low_kmem_size > total_size * 70/100) | ||
1902 | return ZONELIST_ORDER_NODE; | ||
1903 | } | ||
1904 | return ZONELIST_ORDER_ZONE; | ||
1905 | } | ||
1906 | |||
1907 | static void set_zonelist_order(void) | ||
1908 | { | ||
1909 | if (user_zonelist_order == ZONELIST_ORDER_DEFAULT) | ||
1910 | current_zonelist_order = default_zonelist_order(); | ||
1911 | else | ||
1912 | current_zonelist_order = user_zonelist_order; | ||
1913 | } | ||
1914 | |||
1915 | static void build_zonelists(pg_data_t *pgdat) | ||
1916 | { | ||
1917 | int j, node, load; | ||
1918 | enum zone_type i; | ||
1713 | nodemask_t used_mask; | 1919 | nodemask_t used_mask; |
1920 | int local_node, prev_node; | ||
1921 | struct zonelist *zonelist; | ||
1922 | int order = current_zonelist_order; | ||
1714 | 1923 | ||
1715 | /* initialize zonelists */ | 1924 | /* initialize zonelists */ |
1716 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1925 | for (i = 0; i < MAX_NR_ZONES; i++) { |
@@ -1723,6 +1932,11 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1723 | load = num_online_nodes(); | 1932 | load = num_online_nodes(); |
1724 | prev_node = local_node; | 1933 | prev_node = local_node; |
1725 | nodes_clear(used_mask); | 1934 | nodes_clear(used_mask); |
1935 | |||
1936 | memset(node_load, 0, sizeof(node_load)); | ||
1937 | memset(node_order, 0, sizeof(node_order)); | ||
1938 | j = 0; | ||
1939 | |||
1726 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { | 1940 | while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { |
1727 | int distance = node_distance(local_node, node); | 1941 | int distance = node_distance(local_node, node); |
1728 | 1942 | ||
@@ -1738,23 +1952,25 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1738 | * So adding penalty to the first node in same | 1952 | * So adding penalty to the first node in same |
1739 | * distance group to make it round-robin. | 1953 | * distance group to make it round-robin. |
1740 | */ | 1954 | */ |
1741 | |||
1742 | if (distance != node_distance(local_node, prev_node)) | 1955 | if (distance != node_distance(local_node, prev_node)) |
1743 | node_load[node] += load; | 1956 | node_load[node] = load; |
1957 | |||
1744 | prev_node = node; | 1958 | prev_node = node; |
1745 | load--; | 1959 | load--; |
1746 | for (i = 0; i < MAX_NR_ZONES; i++) { | 1960 | if (order == ZONELIST_ORDER_NODE) |
1747 | zonelist = pgdat->node_zonelists + i; | 1961 | build_zonelists_in_node_order(pgdat, node); |
1748 | for (j = 0; zonelist->zones[j] != NULL; j++); | 1962 | else |
1963 | node_order[j++] = node; /* remember order */ | ||
1964 | } | ||
1749 | 1965 | ||
1750 | j = build_zonelists_node(NODE_DATA(node), zonelist, j, i); | 1966 | if (order == ZONELIST_ORDER_ZONE) { |
1751 | zonelist->zones[j] = NULL; | 1967 | /* calculate node order -- i.e., DMA last! */ |
1752 | } | 1968 | build_zonelists_in_zone_order(pgdat, j); |
1753 | } | 1969 | } |
1754 | } | 1970 | } |
1755 | 1971 | ||
1756 | /* Construct the zonelist performance cache - see further mmzone.h */ | 1972 | /* Construct the zonelist performance cache - see further mmzone.h */ |
1757 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 1973 | static void build_zonelist_cache(pg_data_t *pgdat) |
1758 | { | 1974 | { |
1759 | int i; | 1975 | int i; |
1760 | 1976 | ||
@@ -1771,9 +1987,15 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) | |||
1771 | } | 1987 | } |
1772 | } | 1988 | } |
1773 | 1989 | ||
1990 | |||
1774 | #else /* CONFIG_NUMA */ | 1991 | #else /* CONFIG_NUMA */ |
1775 | 1992 | ||
1776 | static void __meminit build_zonelists(pg_data_t *pgdat) | 1993 | static void set_zonelist_order(void) |
1994 | { | ||
1995 | current_zonelist_order = ZONELIST_ORDER_ZONE; | ||
1996 | } | ||
1997 | |||
1998 | static void build_zonelists(pg_data_t *pgdat) | ||
1777 | { | 1999 | { |
1778 | int node, local_node; | 2000 | int node, local_node; |
1779 | enum zone_type i,j; | 2001 | enum zone_type i,j; |
@@ -1809,7 +2031,7 @@ static void __meminit build_zonelists(pg_data_t *pgdat) | |||
1809 | } | 2031 | } |
1810 | 2032 | ||
1811 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ | 2033 | /* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */ |
1812 | static void __meminit build_zonelist_cache(pg_data_t *pgdat) | 2034 | static void build_zonelist_cache(pg_data_t *pgdat) |
1813 | { | 2035 | { |
1814 | int i; | 2036 | int i; |
1815 | 2037 | ||
@@ -1820,7 +2042,7 @@ static void __meminit build_zonelist_cache(pg_data_t *pgdat) | |||
1820 | #endif /* CONFIG_NUMA */ | 2042 | #endif /* CONFIG_NUMA */ |
1821 | 2043 | ||
1822 | /* return values int ....just for stop_machine_run() */ | 2044 | /* return values int ....just for stop_machine_run() */ |
1823 | static int __meminit __build_all_zonelists(void *dummy) | 2045 | static int __build_all_zonelists(void *dummy) |
1824 | { | 2046 | { |
1825 | int nid; | 2047 | int nid; |
1826 | 2048 | ||
@@ -1831,8 +2053,10 @@ static int __meminit __build_all_zonelists(void *dummy) | |||
1831 | return 0; | 2053 | return 0; |
1832 | } | 2054 | } |
1833 | 2055 | ||
1834 | void __meminit build_all_zonelists(void) | 2056 | void build_all_zonelists(void) |
1835 | { | 2057 | { |
2058 | set_zonelist_order(); | ||
2059 | |||
1836 | if (system_state == SYSTEM_BOOTING) { | 2060 | if (system_state == SYSTEM_BOOTING) { |
1837 | __build_all_zonelists(NULL); | 2061 | __build_all_zonelists(NULL); |
1838 | cpuset_init_current_mems_allowed(); | 2062 | cpuset_init_current_mems_allowed(); |
@@ -1843,8 +2067,13 @@ void __meminit build_all_zonelists(void) | |||
1843 | /* cpuset refresh routine should be here */ | 2067 | /* cpuset refresh routine should be here */ |
1844 | } | 2068 | } |
1845 | vm_total_pages = nr_free_pagecache_pages(); | 2069 | vm_total_pages = nr_free_pagecache_pages(); |
1846 | printk("Built %i zonelists. Total pages: %ld\n", | 2070 | printk("Built %i zonelists in %s order. Total pages: %ld\n", |
1847 | num_online_nodes(), vm_total_pages); | 2071 | num_online_nodes(), |
2072 | zonelist_order_name[current_zonelist_order], | ||
2073 | vm_total_pages); | ||
2074 | #ifdef CONFIG_NUMA | ||
2075 | printk("Policy zone: %s\n", zone_names[policy_zone]); | ||
2076 | #endif | ||
1848 | } | 2077 | } |
1849 | 2078 | ||
1850 | /* | 2079 | /* |