path: root/mm/percpu.c
author    Tejun Heo <tj@kernel.org>  2009-08-14 02:00:52 -0400
committer Tejun Heo <tj@kernel.org>  2009-08-14 02:00:52 -0400
commit    c8826dd538602d730ed2c18c6753f1bbfa6c4933 (patch)
tree      705a34d5afae4a53a1b041689b0b0079cd88f737 /mm/percpu.c
parent    6563297ceafab6bbcc931b52e2a9e660fbb21fb2 (diff)
percpu: update embedding first chunk allocator to handle sparse units
Now that the percpu core can handle very sparse units, given that vmalloc space is large enough, the embedding first chunk allocator can use any memory to build the first chunk. This patch teaches pcpu_embed_first_chunk() about distances between cpus and makes it use alloc/free callbacks to allocate node-specific areas for each group and use them for the first chunk.

This brings the benefits of the embedding allocator to NUMA configurations - no extra TLB pressure together with the flexibility of the unified dynamic allocator, and no need to restructure arch code to build a memory layout suitable for percpu. With units put into atom_size aligned groups according to cpu distances, using large pages for dynamic chunks also becomes easy, with a fallback to regular pages if the large allocation fails.

Embedding allocator users are converted to specify a NULL cpu_distance_fn, so this patch doesn't cause any visible behavior difference. Following patches will convert them.

Signed-off-by: Tejun Heo <tj@kernel.org>
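To make "distances between cpus" concrete: the new cpu_distance_fn callback only has to tell pcpu_build_alloc_info() how far apart two possible CPUs are, so that nearby CPUs can be placed in the same atom_size aligned group. A minimal sketch of such a callback, assuming a NUMA arch with an early_cpu_to_node() style helper (the names below are illustrative and not part of this patch):

#include <linux/topology.h>     /* LOCAL_DISTANCE, REMOTE_DISTANCE */

/* Sketch: CPUs on the same node are "close", everything else is "far". */
static int __init pcpu_cpu_distance_sketch(unsigned int from, unsigned int to)
{
        if (early_cpu_to_node(from) == early_cpu_to_node(to))
                return LOCAL_DISTANCE;
        return REMOTE_DISTANCE;
}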
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--  mm/percpu.c | 113
1 file changed, 85 insertions(+), 28 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index cc9c4c64606d..c2826d05505c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1747,15 +1747,25 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
  * @reserved_size: the size of reserved percpu area in bytes
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ * @alloc_fn: function to allocate percpu page
+ * @free_fn: function to free percpu page
  *
  * This is a helper to ease setting up embedded first percpu chunk and
  * can be called where pcpu_setup_first_chunk() is expected.
  *
  * If this function is used to setup the first chunk, it is allocated
- * as a contiguous area using bootmem allocator and used as-is without
- * being mapped into vmalloc area.  This enables the first chunk to
- * piggy back on the linear physical mapping which often uses larger
- * page size.
+ * by calling @alloc_fn and used as-is without being mapped into
+ * vmalloc area.  Allocations are always whole multiples of @atom_size
+ * aligned to @atom_size.
+ *
+ * This enables the first chunk to piggy back on the linear physical
+ * mapping which often uses larger page size.  Please note that this
+ * can result in very sparse cpu->unit mapping on NUMA machines thus
+ * requiring large vmalloc address space.  Don't use this allocator if
+ * vmalloc space is not orders of magnitude larger than distances
+ * between node memory addresses (ie. 32bit NUMA machines).
  *
  * When @dyn_size is positive, dynamic area might be larger than
  * specified to fill page alignment.  When @dyn_size is auto,
@@ -1763,53 +1773,88 @@ early_param("percpu_alloc", percpu_alloc_setup);
  * and reserved areas.
  *
  * If the needed size is smaller than the minimum or specified unit
- * size, the leftover is returned to the bootmem allocator.
+ * size, the leftover is returned using @free_fn.
  *
  * RETURNS:
  * 0 on success, -errno on failure.
  */
-int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
+int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size,
+                                  size_t atom_size,
+                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
+                                  pcpu_fc_alloc_fn_t alloc_fn,
+                                  pcpu_fc_free_fn_t free_fn)
 {
+        void *base = (void *)ULONG_MAX;
+        void **areas = NULL;
         struct pcpu_alloc_info *ai;
-        size_t size_sum, chunk_size;
-        void *base;
-        int unit;
-        int rc;
+        size_t size_sum, areas_size;
+        int group, i, rc;
 
-        ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
+        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
+                                   cpu_distance_fn);
         if (IS_ERR(ai))
                 return PTR_ERR(ai);
-        BUG_ON(ai->nr_groups != 1);
-        BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
 
         size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
-        chunk_size = ai->unit_size * num_possible_cpus();
+        areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
 
-        base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
-                                       __pa(MAX_DMA_ADDRESS));
-        if (!base) {
-                pr_warning("PERCPU: failed to allocate %zu bytes for "
-                           "embedding\n", chunk_size);
+        areas = alloc_bootmem_nopanic(areas_size);
+        if (!areas) {
                 rc = -ENOMEM;
-                goto out_free_ai;
+                goto out_free;
         }
 
-        /* return the leftover and copy */
-        for (unit = 0; unit < num_possible_cpus(); unit++) {
-                void *ptr = base + unit * ai->unit_size;
+        /* allocate, copy and determine base address */
+        for (group = 0; group < ai->nr_groups; group++) {
+                struct pcpu_group_info *gi = &ai->groups[group];
+                unsigned int cpu = NR_CPUS;
+                void *ptr;
+
+                for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++)
+                        cpu = gi->cpu_map[i];
+                BUG_ON(cpu == NR_CPUS);
+
+                /* allocate space for the whole group */
+                ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);
+                if (!ptr) {
+                        rc = -ENOMEM;
+                        goto out_free_areas;
+                }
+                areas[group] = ptr;
 
-                free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum);
-                memcpy(ptr, __per_cpu_load, ai->static_size);
+                base = min(ptr, base);
+
+                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
+                        if (gi->cpu_map[i] == NR_CPUS) {
+                                /* unused unit, free whole */
+                                free_fn(ptr, ai->unit_size);
+                                continue;
+                        }
+                        /* copy and return the unused part */
+                        memcpy(ptr, __per_cpu_load, ai->static_size);
+                        free_fn(ptr + size_sum, ai->unit_size - size_sum);
+                }
         }
 
-        /* we're ready, commit */
+        /* base address is now known, determine group base offsets */
+        for (group = 0; group < ai->nr_groups; group++)
+                ai->groups[group].base_offset = areas[group] - base;
+
         pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
                 PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
                 ai->dyn_size, ai->unit_size);
 
         rc = pcpu_setup_first_chunk(ai, base);
-out_free_ai:
+        goto out_free;
+
+out_free_areas:
+        for (group = 0; group < ai->nr_groups; group++)
+                free_fn(areas[group],
+                        ai->groups[group].nr_units * ai->unit_size);
+out_free:
         pcpu_free_alloc_info(ai);
+        if (areas)
+                free_bootmem(__pa(areas), areas_size);
         return rc;
 }
 #endif  /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
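As a reading aid for the hunk above: each group gets one @atom_size aligned allocation, base becomes the lowest of those allocations, and each group records its offset from that base. A hedged sketch of where a given unit's static copy ends up after this setup (the helper name is invented for illustration, not part of the patch):

/* Sketch: address of the unit at index @idx within @group after setup above. */
static void * __init pcpu_unit_addr_sketch(const struct pcpu_alloc_info *ai,
                                           void *base, int group, int idx)
{
        /* base_offset was recorded as areas[group] - base in the loop above */
        return base + ai->groups[group].base_offset + idx * ai->unit_size;
}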
@@ -2177,6 +2222,17 @@ void *pcpu_lpage_remapped(void *kaddr)
 unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
 EXPORT_SYMBOL(__per_cpu_offset);
 
+static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
+                                       size_t align)
+{
+        return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS));
+}
+
+static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
+{
+        free_bootmem(__pa(ptr), size);
+}
+
 void __init setup_per_cpu_areas(void)
 {
         unsigned long delta;
@@ -2188,7 +2244,8 @@ void __init setup_per_cpu_areas(void)
          * what the legacy allocator did.
          */
         rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
-                                    PERCPU_DYNAMIC_RESERVE);
+                                    PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
+                                    pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
         if (rc < 0)
                 panic("Failed to initialized percpu areas.");
 
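The default setup_per_cpu_areas() above passes a NULL cpu_distance_fn and bootmem-backed callbacks, so behavior is unchanged for now. For a sense of what the follow-up arch conversions would pass instead, here is a hedged sketch of NUMA-aware callbacks using the distance callback sketched earlier (early_cpu_to_node(), the PMD_SIZE atom and the *_sketch names are assumptions for illustration, not part of this patch):

/* Sketch only: NUMA-aware callbacks an arch might wire up later. */
static void * __init pcpu_fc_alloc_sketch(unsigned int cpu, size_t size,
                                          size_t align)
{
        /* allocate near the node that owns @cpu instead of anywhere */
        return __alloc_bootmem_node_nopanic(NODE_DATA(early_cpu_to_node(cpu)),
                                            size, align, __pa(MAX_DMA_ADDRESS));
}

static void __init pcpu_fc_free_sketch(void *ptr, size_t size)
{
        free_bootmem(__pa(ptr), size);
}

        /* in setup_per_cpu_areas(), with large-page sized atoms: */
        rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
                                    PERCPU_DYNAMIC_RESERVE, PMD_SIZE,
                                    pcpu_cpu_distance_sketch,
                                    pcpu_fc_alloc_sketch, pcpu_fc_free_sketch);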