author	Tejun Heo <tj@kernel.org>	2009-07-03 19:11:00 -0400
committer	Tejun Heo <tj@kernel.org>	2009-07-03 19:11:00 -0400
commit	a530b7958612bafe2027e21359083dba84f0b3b4 (patch)
tree	fecbfc0d23b7702a903e8b2539e04e6086ba4404 /mm/percpu.c
parent	2f39e637ea240efb74cf807d31c93a71a0b89174 (diff)
percpu: teach large page allocator about NUMA
The large page first chunk allocator is primarily used for NUMA machines; however, its NUMA handling is extremely simplistic. Regardless of their proximity, each cpu is put into a separate large page just to return most of the allocated space back, wasting a large amount of vmalloc space and increasing cache footprint.

This patch teaches NUMA details to the large page allocator. Given processor proximity information, pcpu_lpage_build_unit_map() finds a fitting cpu -> unit mapping in which cpus within LOCAL_DISTANCE share the same large page and not too much virtual address space is wasted.

This greatly reduces the unit and thus chunk size and wastes much less address space for the first chunk. For example, on a 4/4 NUMA machine, the original code occupied 16MB of virtual space for the first chunk while the new code uses only 4MB - one 2MB page for each node.

[ Impact: much better space efficiency on NUMA machines ]

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Jan Beulich <JBeulich@novell.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: David Miller <davem@davemloft.net>
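To make the proximity grouping concrete, below is a minimal, self-contained userspace sketch (not the kernel code) of the grouping pass, assuming a hypothetical two-node distance table where cpus 0-1 sit on node 0 and cpus 2-3 on node 1. CPUs that are within LOCAL_DISTANCE (both ways) of every CPU already placed in a group share that group, and thus may share a large page; otherwise a new group is opened.

	/* hedged illustration only; mirrors the grouping idea, not the kernel code */
	#include <stdio.h>

	#define NR_CPUS		4
	#define LOCAL_DISTANCE	10

	/* assumed SLIT-style distances: cpus 0,1 on node 0; cpus 2,3 on node 1 */
	static const int dist[NR_CPUS][NR_CPUS] = {
		{ 10, 10, 20, 20 },
		{ 10, 10, 20, 20 },
		{ 20, 20, 10, 10 },
		{ 20, 20, 10, 10 },
	};

	int main(void)
	{
		int group_map[NR_CPUS] = { 0 };
		int cpu, tcpu, group;

		for (cpu = 0; cpu < NR_CPUS; cpu++) {
			group = 0;
	next_group:
			/* reject this group if any member is farther than LOCAL_DISTANCE */
			for (tcpu = 0; tcpu < cpu; tcpu++) {
				if (group_map[tcpu] == group &&
				    (dist[cpu][tcpu] > LOCAL_DISTANCE ||
				     dist[tcpu][cpu] > LOCAL_DISTANCE)) {
					group++;
					goto next_group;
				}
			}
			group_map[cpu] = group;
			printf("cpu%d -> group %d\n", cpu, group_map[cpu]);
		}
		return 0;
	}

With the assumed table this prints cpu0/cpu1 into group 0 and cpu2/cpu3 into group 1, i.e. one group (and hence one set of shared large pages) per node, which is exactly the per-node sharing the commit message describes. pcpu_lpage_build_unit_map() then sizes the units so that no more than 25% of the allocated virtual space is wasted.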
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--	mm/percpu.c	358
1 file changed, 282 insertions(+), 76 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 2196fae24f00..b3d0bcff8c7c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -59,6 +59,7 @@
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
 #include <linux/list.h>
+#include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -1594,75 +1595,259 @@ out_free_ar:
  * Large page remapping first chunk setup helper
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds cpu -> unit map and determine other parameters
+ * considering needed percpu size, large page size and distances
+ * between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page.  The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated.  -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+				     ssize_t *dyn_sizep, size_t *unit_sizep,
+				     size_t lpage_size, int *unit_map,
+				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	int group_cnt_max = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs;
+	unsigned int cpu, tcpu;
+	int group, unit;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is multiple of lpage_size and is the smallest
+	 * which can accomodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, lpage_size);
+	upa = alloc_size / min_unit_size;
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/* group cpus according to their proximity */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				group++;
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+		group_cnt_max = max(group_cnt_max, group_cnt[group]);
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group_cnt[group]; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 25%.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	*unit_sizep = alloc_size / best_upa;
+
+	/* assign units to cpus accordingly */
+	unit = 0;
+	for (group = 0; group_cnt[group]; group++) {
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				unit_map[cpu] = unit++;
+		unit = roundup(unit, best_upa);
+	}
+
+	return unit;	/* unit contains aligned number of units */
+}
+
 struct pcpul_ent {
-	unsigned int cpu;
 	void *ptr;
+	void *map_addr;
 };
 
 static size_t pcpul_size;
-static size_t pcpul_unit_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
 static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+				     unsigned int *cpup)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (unit_map[cpu] == unit) {
+			if (cpup)
+				*cpup = cpu;
+			return true;
+		}
+
+	return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+					size_t reserved_size, size_t dyn_size,
+					size_t unit_size, size_t lpage_size,
+					const int *unit_map, int nr_units)
+{
+	int width = 1, v = nr_units;
+	char empty_str[] = "--------";
+	int upl, lpl;	/* units per lpage, lpage per line */
+	unsigned int cpu;
+	int lpage, unit;
+
+	while (v /= 10)
+		width++;
+	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+	upl = max_t(int, lpage_size / unit_size, 1);
+	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+		if (!(unit % upl)) {
+			if (!(lpage++ % lpl)) {
+				printk("\n");
+				printk("%spcpu-lpage: ", lvl);
+			} else
+				printk("| ");
+		}
+		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			printk("%0*d ", width, cpu);
+		else
+			printk("%s ", empty_str);
+	}
+	printk("\n");
+}
 
 /**
  * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
  * @static_size: the size of static percpu area in bytes
  * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @dyn_size: free size for dynamic allocation in bytes
+ * @unit_size: unit size in bytes
  * @lpage_size: the size of a large page
+ * @unit_map: cpu -> unit mapping
+ * @nr_units: the number of units
  * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
  * @free_fn: function to free percpu memory, @size <= lpage_size
  * @map_fn: function to map percpu lpage, always called with lpage_size
  *
- * This allocator uses large page as unit.  A large page is allocated
- * for each cpu and each is remapped into vmalloc area using large
- * page mapping.  As large page can be quite large, only part of it is
- * used for the first chunk.  Unused part is returned to the bootmem
- * allocator.
- *
- * So, the large pages are mapped twice - once to the physical mapping
- * and to the vmalloc area for the first percpu chunk.  The double
- * mapping does add one more large TLB entry pressure but still is
- * much better than only using 4k mappings while still being NUMA
- * friendly.
+ * This allocator uses large page to build and map the first chunk.
+ * Unlike other helpers, the caller should always specify @dyn_size
+ * and @unit_size.  These parameters along with @unit_map and
+ * @nr_units can be determined using pcpu_lpage_build_unit_map().
+ * This two stage initialization is to allow arch code to evaluate the
+ * parameters before committing to it.
+ *
+ * Large pages are allocated as directed by @unit_map and other
+ * parameters and mapped to vmalloc space.  Unused holes are returned
+ * to the page allocator.  Note that these holes end up being actively
+ * mapped twice - once to the physical mapping and to the vmalloc area
+ * for the first percpu chunk.  Depending on architecture, this might
+ * cause problem when changing page attributes of the returned area.
+ * These double mapped areas can be detected using
+ * pcpu_lpage_remapped().
  *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access on success, -errno on failure.
  */
 ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
-				      ssize_t dyn_size, size_t lpage_size,
+				      size_t dyn_size, size_t unit_size,
+				      size_t lpage_size, const int *unit_map,
+				      int nr_units,
 				      pcpu_fc_alloc_fn_t alloc_fn,
 				      pcpu_fc_free_fn_t free_fn,
 				      pcpu_fc_map_fn_t map_fn)
 {
-	size_t size_sum;
+	static struct vm_struct vm;
+	size_t chunk_size = unit_size * nr_units;
 	size_t map_size;
 	unsigned int cpu;
-	int i, j;
 	ssize_t ret;
+	int i, j, unit;
 
-	/*
-	 * Currently supports only single page.  Supporting multiple
-	 * pages won't be too difficult if it ever becomes necessary.
-	 */
-	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
+	pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
+			     unit_size, lpage_size, unit_map, nr_units);
 
-	pcpul_unit_size = lpage_size;
-	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-	if (pcpul_size > pcpul_unit_size) {
-		pr_warning("PERCPU: static data is larger than large page, "
-			   "can't use large page\n");
-		return -EINVAL;
-	}
+	BUG_ON(chunk_size % lpage_size);
+
+	pcpul_size = static_size + reserved_size + dyn_size;
+	pcpul_lpage_size = lpage_size;
+	pcpul_nr_lpages = chunk_size / lpage_size;
 
 	/* allocate pointer array and alloc large pages */
-	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
+	map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
 	pcpul_map = alloc_bootmem(map_size);
 
-	for_each_possible_cpu(cpu) {
+	/* allocate all pages */
+	for (i = 0; i < pcpul_nr_lpages; i++) {
+		size_t offset = i * lpage_size;
+		int first_unit = offset / unit_size;
+		int last_unit = (offset + lpage_size - 1) / unit_size;
 		void *ptr;
 
+		/* find out which cpu is mapped to this unit */
+		for (unit = first_unit; unit <= last_unit; unit++)
+			if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+				goto found;
+		continue;
+	found:
 		ptr = alloc_fn(cpu, lpage_size);
 		if (!ptr) {
 			pr_warning("PERCPU: failed to allocate large page "
@@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 			goto enomem;
 		}
 
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The lpage_size up-rounding bootmem is needed
-		 * to make sure the partial lpage is still fully RAM -
-		 * it's not well-specified to have a incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
-
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = ptr;
+		pcpul_map[i].ptr = ptr;
+	}
 
-		memcpy(ptr, __per_cpu_load, static_size);
+	/* return unused holes */
+	for (unit = 0; unit < nr_units; unit++) {
+		size_t start = unit * unit_size;
+		size_t end = start + unit_size;
+		size_t off, next;
+
+		/* don't free used part of occupied unit */
+		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+			start += pcpul_size;
+
+		/* unit can span more than one page, punch the holes */
+		for (off = start; off < end; off = next) {
+			void *ptr = pcpul_map[off / lpage_size].ptr;
+			next = min(roundup(off + 1, lpage_size), end);
+			if (ptr)
+				free_fn(ptr + off % lpage_size, next - off);
+		}
 	}
 
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
-	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+	/* allocate address, map and copy */
+	vm.flags = VM_ALLOC;
+	vm.size = chunk_size;
+	vm_area_register_early(&vm, unit_size);
+
+	for (i = 0; i < pcpul_nr_lpages; i++) {
+		if (!pcpul_map[i].ptr)
+			continue;
+		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+	}
 
 	for_each_possible_cpu(cpu)
-		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
-		       pcpul_vm.addr + cpu * pcpul_unit_size);
+		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+		       static_size);
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
+		"%zu bytes\n", vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     pcpul_unit_size, pcpul_vm.addr, NULL);
+				     unit_size, vm.addr, unit_map);
 
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
+	/*
+	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
+	 * lpages are pushed to the end and trimmed.
+	 */
+	for (i = 0; i < pcpul_nr_lpages - 1; i++)
+		for (j = i + 1; j < pcpul_nr_lpages; j++) {
+			struct pcpul_ent tmp;
+
+			if (!pcpul_map[j].ptr)
+				continue;
+			if (pcpul_map[i].ptr &&
+			    pcpul_map[i].ptr < pcpul_map[j].ptr)
+				continue;
+
+			tmp = pcpul_map[i];
+			pcpul_map[i] = pcpul_map[j];
+			pcpul_map[j] = tmp;
+		}
+
+	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+		pcpul_nr_lpages--;
 
 	return ret;
 
 enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	for (i = 0; i < pcpul_nr_lpages; i++)
+		if (pcpul_map[i].ptr)
+			free_fn(pcpul_map[i].ptr, lpage_size);
 	free_bootmem(__pa(pcpul_map), map_size);
 	return -ENOMEM;
 }
@@ -1739,10 +1950,10 @@ enomem:
  */
 void *pcpu_lpage_remapped(void *kaddr)
 {
-	unsigned long unit_mask = pcpul_unit_size - 1;
-	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
-	unsigned long offset = (unsigned long)kaddr & unit_mask;
-	int left = 0, right = num_possible_cpus() - 1;
+	unsigned long lpage_mask = pcpul_lpage_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+	unsigned long offset = (unsigned long)kaddr & lpage_mask;
+	int left = 0, right = pcpul_nr_lpages - 1;
 	int pos;
 
 	/* pcpul in use at all? */
@@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr)
 			left = pos + 1;
 		else if (pcpul_map[pos].ptr > lpage_addr)
 			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-			       pcpul_map[pos].cpu * pcpul_unit_size + offset;
-		}
+		else
+			return pcpul_map[pos].map_addr + offset;
 	}
 
 	return NULL;
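
The comment block of pcpu_lpage_first_chunk() above describes a two-stage setup: arch code first asks pcpu_lpage_build_unit_map() for a cpu -> unit layout, may inspect the result, and only then commits by calling pcpu_lpage_first_chunk(). Below is a hedged sketch of how a caller might wire that up using the signatures introduced by this patch; the arch_* callbacks, the PMD_SIZE large page size and the zero reserved size are placeholders standing in for whatever the architecture actually provides, not definitions made by this patch.

	/* hypothetical arch glue, for illustration only */
	static int lpage_unit_map[NR_CPUS] __initdata;

	static ssize_t __init setup_first_chunk_lpage_sketch(void)
	{
		size_t static_size = __per_cpu_end - __per_cpu_start;
		size_t reserved_size = 0;	/* arch choice; 0 keeps the sketch simple */
		ssize_t dyn_size = -1;		/* -1: let percpu pick the dynamic size */
		size_t unit_size;
		int nr_units;

		/* stage 1: propose a cpu -> unit layout from NUMA distances */
		nr_units = pcpu_lpage_build_unit_map(static_size, reserved_size,
						     &dyn_size, &unit_size,
						     PMD_SIZE, lpage_unit_map,
						     arch_cpu_distance);	/* placeholder */
		if (nr_units < 0)
			return nr_units;

		/* stage 2: arch may veto here, e.g. if unit_size wastes too much space */
		return pcpu_lpage_first_chunk(static_size, reserved_size, dyn_size,
					      unit_size, PMD_SIZE, lpage_unit_map,
					      nr_units,
					      arch_alloc_lpage,	/* placeholder */
					      arch_free_lpage,	/* placeholder */
					      arch_map_lpage);	/* placeholder */
	}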