 arch/x86/kernel/setup_percpu.c |  72 ++++++----
 include/linux/percpu.h         |  24 +++-
 mm/percpu.c                    | 358 +++++++++++++++++++++++++++++--------
 3 files changed, 359 insertions(+), 95 deletions(-)
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 4f2e0ac9130b..7501bb14bd51 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -149,36 +149,73 @@ static void __init pcpul_map(void *ptr, size_t size, void *addr)
 	set_pmd(pmd, pmd_v);
 }
 
+static int pcpu_lpage_cpu_distance(unsigned int from, unsigned int to)
+{
+	if (early_cpu_to_node(from) == early_cpu_to_node(to))
+		return LOCAL_DISTANCE;
+	else
+		return REMOTE_DISTANCE;
+}
+
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
 {
 	size_t reserve = PERCPU_MODULE_RESERVE + PERCPU_DYNAMIC_RESERVE;
+	size_t dyn_size = reserve - PERCPU_FIRST_CHUNK_RESERVE;
+	size_t unit_map_size, unit_size;
+	int *unit_map;
+	int nr_units;
+	ssize_t ret;
+
+	/* on non-NUMA, embedding is better */
+	if (!chosen && !pcpu_need_numa())
+		return -EINVAL;
+
+	/* need PSE */
+	if (!cpu_has_pse) {
+		pr_warning("PERCPU: lpage allocator requires PSE\n");
+		return -EINVAL;
+	}
 
+	/* allocate and build unit_map */
+	unit_map_size = num_possible_cpus() * sizeof(int);
+	unit_map = alloc_bootmem_nopanic(unit_map_size);
+	if (!unit_map) {
+		pr_warning("PERCPU: failed to allocate unit_map\n");
+		return -ENOMEM;
+	}
+
+	ret = pcpu_lpage_build_unit_map(static_size,
+					PERCPU_FIRST_CHUNK_RESERVE,
+					&dyn_size, &unit_size, PMD_SIZE,
+					unit_map, pcpu_lpage_cpu_distance);
+	if (ret < 0) {
+		pr_warning("PERCPU: failed to build unit_map\n");
+		goto out_free;
+	}
+	nr_units = ret;
+
+	/* do the parameters look okay? */
 	if (!chosen) {
 		size_t vm_size = VMALLOC_END - VMALLOC_START;
-		size_t tot_size = num_possible_cpus() * PMD_SIZE;
-
-		/* on non-NUMA, embedding is better */
-		if (!pcpu_need_numa())
-			return -EINVAL;
+		size_t tot_size = nr_units * unit_size;
 
 		/* don't consume more than 20% of vmalloc area */
 		if (tot_size > vm_size / 5) {
 			pr_info("PERCPU: too large chunk size %zuMB for "
 				"large page remap\n", tot_size >> 20);
-			return -EINVAL;
+			ret = -EINVAL;
+			goto out_free;
 		}
 	}
 
-	/* need PSE */
-	if (!cpu_has_pse) {
-		pr_warning("PERCPU: lpage allocator requires PSE\n");
-		return -EINVAL;
-	}
-
-	return pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
-				      reserve - PERCPU_FIRST_CHUNK_RESERVE,
-				      PMD_SIZE,
-				      pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+	ret = pcpu_lpage_first_chunk(static_size, PERCPU_FIRST_CHUNK_RESERVE,
+				     dyn_size, unit_size, PMD_SIZE,
+				     unit_map, nr_units,
+				     pcpu_fc_alloc, pcpu_fc_free, pcpul_map);
+out_free:
+	if (ret < 0)
+		free_bootmem(__pa(unit_map), unit_map_size);
+	return ret;
 }
 #else
 static ssize_t __init setup_pcpu_lpage(size_t static_size, bool chosen)
@@ -299,7 +336,8 @@ void __init setup_per_cpu_areas(void)
 	/* alrighty, percpu areas up and running */
 	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
 	for_each_possible_cpu(cpu) {
-		per_cpu_offset(cpu) = delta + cpu * pcpu_unit_size;
+		per_cpu_offset(cpu) =
+			delta + pcpu_unit_map[cpu] * pcpu_unit_size;
 		per_cpu(this_cpu_off, cpu) = per_cpu_offset(cpu);
 		per_cpu(cpu_number, cpu) = cpu;
 		setup_percpu_segment(cpu);
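
With pcpu_unit_map in place, a CPU's percpu offset is no longer a linear function of its CPU number: NUMA grouping can leave padding units between groups. A minimal standalone sketch of the new offset arithmetic, with an invented map and invented sizes (not values from a real boot):

#include <stdio.h>

int main(void)
{
	/* hypothetical: 4 CPUs on 2 nodes; grouping left padding units,
	 * so cpu2/cpu3 start at unit 4 rather than unit 2 */
	int unit_map[4] = { 0, 1, 4, 5 };
	unsigned long unit_size = 1UL << 20;	/* illustrative 1MB unit */
	unsigned long delta = 0x1000000UL;	/* invented base delta */
	int cpu;

	for (cpu = 0; cpu < 4; cpu++)
		printf("cpu%d: offset=%#lx\n", cpu,
		       delta + unit_map[cpu] * unit_size);
	return 0;
}

cpu2 lands at unit 4 here because its group starts on a fresh large page.
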
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 1e0e8878dc2a..8ce91af4aa19 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -62,6 +62,7 @@ extern const int *pcpu_unit_map;
 typedef void * (*pcpu_fc_alloc_fn_t)(unsigned int cpu, size_t size);
 typedef void (*pcpu_fc_free_fn_t)(void *ptr, size_t size);
 typedef void (*pcpu_fc_populate_pte_fn_t)(unsigned long addr);
+typedef int (pcpu_fc_cpu_distance_fn_t)(unsigned int from, unsigned int to);
 typedef void (*pcpu_fc_map_fn_t)(void *ptr, size_t size, void *addr);
 
 extern size_t __init pcpu_setup_first_chunk(
@@ -80,18 +81,37 @@ extern ssize_t __init pcpu_4k_first_chunk(
 				pcpu_fc_populate_pte_fn_t populate_pte_fn);
 
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+extern int __init pcpu_lpage_build_unit_map(
+				size_t static_size, size_t reserved_size,
+				ssize_t *dyn_sizep, size_t *unit_sizep,
+				size_t lpage_size, int *unit_map,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn);
+
 extern ssize_t __init pcpu_lpage_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, size_t lpage_size,
+				size_t dyn_size, size_t unit_size,
+				size_t lpage_size, const int *unit_map,
+				int nr_units,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn);
 
 extern void *pcpu_lpage_remapped(void *kaddr);
 #else
+static inline int pcpu_lpage_build_unit_map(
+				size_t static_size, size_t reserved_size,
+				ssize_t *dyn_sizep, size_t *unit_sizep,
+				size_t lpage_size, int *unit_map,
+				pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	return -EINVAL;
+}
+
 static inline ssize_t __init pcpu_lpage_first_chunk(
 				size_t static_size, size_t reserved_size,
-				ssize_t dyn_size, size_t lpage_size,
+				size_t dyn_size, size_t unit_size,
+				size_t lpage_size, const int *unit_map,
+				int nr_units,
 				pcpu_fc_alloc_fn_t alloc_fn,
 				pcpu_fc_free_fn_t free_fn,
 				pcpu_fc_map_fn_t map_fn)
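
One header subtlety: pcpu_fc_cpu_distance_fn_t is a function type, not a function-pointer type (no leading *), yet a parameter declared with it decays to a pointer, so call sites are unchanged. A minimal standalone demonstration of that decay, assuming nothing beyond standard C (LOCAL_DISTANCE is 10 in the kernel's distance convention):

#include <stdio.h>

/* same shape as pcpu_fc_cpu_distance_fn_t: a function type, no '*' */
typedef int (distance_fn_t)(unsigned int from, unsigned int to);

static int flat_distance(unsigned int from, unsigned int to)
{
	(void)from;
	(void)to;
	return 10;	/* LOCAL_DISTANCE everywhere: a flat topology */
}

static int probe(distance_fn_t fn)	/* parameter decays to int (*)(...) */
{
	return fn(0, 1);
}

int main(void)
{
	printf("distance(0, 1) = %d\n", probe(flat_distance));
	return 0;
}
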
diff --git a/mm/percpu.c b/mm/percpu.c
index 2196fae24f00..b3d0bcff8c7c 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -59,6 +59,7 @@
 #include <linux/bitmap.h>
 #include <linux/bootmem.h>
 #include <linux/list.h>
+#include <linux/log2.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
@@ -1594,75 +1595,259 @@ out_free_ar:
  * Large page remapping first chunk setup helper
  */
 #ifdef CONFIG_NEED_MULTIPLE_NODES
+
+/**
+ * pcpu_lpage_build_unit_map - build unit_map for large page remapping
+ * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_sizep: in/out parameter for dynamic size, -1 for auto
+ * @unit_sizep: out parameter for unit size
+ * @lpage_size: the size of a large page
+ * @unit_map: unit_map to be filled
+ * @cpu_distance_fn: callback to determine distance between cpus
+ *
+ * This function builds the cpu -> unit map and determines the other
+ * parameters considering the needed percpu size, large page size and
+ * distances between CPUs in NUMA.
+ *
+ * CPUs which are of LOCAL_DISTANCE both ways are grouped together and
+ * may share units in the same large page.  The returned configuration
+ * is guaranteed to have CPUs on different nodes on different large
+ * pages and >=75% usage of the allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
+ * returns the number of units to be allocated.  -errno on failure.
+ */
+int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
+				     ssize_t *dyn_sizep, size_t *unit_sizep,
+				     size_t lpage_size, int *unit_map,
+				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+	static int group_map[NR_CPUS] __initdata;
+	static int group_cnt[NR_CPUS] __initdata;
+	int group_cnt_max = 0;
+	size_t size_sum, min_unit_size, alloc_size;
+	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
+	int last_allocs;
+	unsigned int cpu, tcpu;
+	int group, unit;
+
+	/*
+	 * Determine min_unit_size, alloc_size and max_upa such that
+	 * alloc_size is a multiple of lpage_size and is the smallest
+	 * which can accommodate 4k aligned segments which are equal to
+	 * or larger than min_unit_size.
+	 */
+	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
+	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+	alloc_size = roundup(min_unit_size, lpage_size);
+	upa = alloc_size / min_unit_size;
+	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+		upa--;
+	max_upa = upa;
+
+	/* group cpus according to their proximity */
+	for_each_possible_cpu(cpu) {
+		group = 0;
+	next_group:
+		for_each_possible_cpu(tcpu) {
+			if (cpu == tcpu)
+				break;
+			if (group_map[tcpu] == group &&
+			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+				group++;
+				goto next_group;
+			}
+		}
+		group_map[cpu] = group;
+		group_cnt[group]++;
+		group_cnt_max = max(group_cnt_max, group_cnt[group]);
+	}
+
+	/*
+	 * Expand unit size until address space usage goes over 75%
+	 * and then as much as possible without using more address
+	 * space.
+	 */
+	last_allocs = INT_MAX;
+	for (upa = max_upa; upa; upa--) {
+		int allocs = 0, wasted = 0;
+
+		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+			continue;
+
+		for (group = 0; group_cnt[group]; group++) {
+			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+			allocs += this_allocs;
+			wasted += this_allocs * upa - group_cnt[group];
+		}
+
+		/*
+		 * Don't accept if wastage is over 25%.  The
+		 * greater-than comparison ensures upa==1 always
+		 * passes the following check.
+		 */
+		if (wasted > num_possible_cpus() / 3)
+			continue;
+
+		/* and then don't consume more memory */
+		if (allocs > last_allocs)
+			break;
+		last_allocs = allocs;
+		best_upa = upa;
+	}
+	*unit_sizep = alloc_size / best_upa;
+
+	/* assign units to cpus accordingly */
+	unit = 0;
+	for (group = 0; group_cnt[group]; group++) {
+		for_each_possible_cpu(cpu)
+			if (group_map[cpu] == group)
+				unit_map[cpu] = unit++;
+		unit = roundup(unit, best_upa);
+	}
+
+	return unit;	/* unit contains aligned number of units */
+}
+
 struct pcpul_ent {
-	unsigned int cpu;
 	void *ptr;
+	void *map_addr;
 };
 
 static size_t pcpul_size;
-static size_t pcpul_unit_size;
+static size_t pcpul_lpage_size;
+static int pcpul_nr_lpages;
 static struct pcpul_ent *pcpul_map;
-static struct vm_struct pcpul_vm;
+
+static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
+				     unsigned int *cpup)
+{
+	unsigned int cpu;
+
+	for_each_possible_cpu(cpu)
+		if (unit_map[cpu] == unit) {
+			if (cpup)
+				*cpup = cpu;
+			return true;
+		}
+
+	return false;
+}
+
+static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
+					size_t reserved_size, size_t dyn_size,
+					size_t unit_size, size_t lpage_size,
+					const int *unit_map, int nr_units)
+{
+	int width = 1, v = nr_units;
+	char empty_str[] = "--------";
+	int upl, lpl;	/* units per lpage, lpages per line */
+	unsigned int cpu;
+	int lpage, unit;
+
+	while (v /= 10)
+		width++;
+	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
+
+	upl = max_t(int, lpage_size / unit_size, 1);
+	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
+
+	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
+	       static_size, reserved_size, dyn_size, unit_size, lpage_size);
+
+	for (lpage = 0, unit = 0; unit < nr_units; unit++) {
+		if (!(unit % upl)) {
+			if (!(lpage++ % lpl)) {
+				printk("\n");
+				printk("%spcpu-lpage: ", lvl);
+			} else
+				printk("| ");
+		}
+		if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
+			printk("%0*d ", width, cpu);
+		else
+			printk("%s ", empty_str);
+	}
+	printk("\n");
+}
1606 | 1776 | ||
1607 | /** | 1777 | /** |
1608 | * pcpu_lpage_first_chunk - remap the first percpu chunk using large page | 1778 | * pcpu_lpage_first_chunk - remap the first percpu chunk using large page |
1609 | * @static_size: the size of static percpu area in bytes | 1779 | * @static_size: the size of static percpu area in bytes |
1610 | * @reserved_size: the size of reserved percpu area in bytes | 1780 | * @reserved_size: the size of reserved percpu area in bytes |
1611 | * @dyn_size: free size for dynamic allocation in bytes, -1 for auto | 1781 | * @dyn_size: free size for dynamic allocation in bytes |
1782 | * @unit_size: unit size in bytes | ||
1612 | * @lpage_size: the size of a large page | 1783 | * @lpage_size: the size of a large page |
1784 | * @unit_map: cpu -> unit mapping | ||
1785 | * @nr_units: the number of units | ||
1613 | * @alloc_fn: function to allocate percpu lpage, always called with lpage_size | 1786 | * @alloc_fn: function to allocate percpu lpage, always called with lpage_size |
1614 | * @free_fn: function to free percpu memory, @size <= lpage_size | 1787 | * @free_fn: function to free percpu memory, @size <= lpage_size |
1615 | * @map_fn: function to map percpu lpage, always called with lpage_size | 1788 | * @map_fn: function to map percpu lpage, always called with lpage_size |
1616 | * | 1789 | * |
1617 | * This allocator uses large page as unit. A large page is allocated | 1790 | * This allocator uses large page to build and map the first chunk. |
1618 | * for each cpu and each is remapped into vmalloc area using large | 1791 | * Unlike other helpers, the caller should always specify @dyn_size |
1619 | * page mapping. As large page can be quite large, only part of it is | 1792 | * and @unit_size. These parameters along with @unit_map and |
1620 | * used for the first chunk. Unused part is returned to the bootmem | 1793 | * @nr_units can be determined using pcpu_lpage_build_unit_map(). |
1621 | * allocator. | 1794 | * This two stage initialization is to allow arch code to evaluate the |
1622 | * | 1795 | * parameters before committing to it. |
1623 | * So, the large pages are mapped twice - once to the physical mapping | 1796 | * |
1624 | * and to the vmalloc area for the first percpu chunk. The double | 1797 | * Large pages are allocated as directed by @unit_map and other |
1625 | * mapping does add one more large TLB entry pressure but still is | 1798 | * parameters and mapped to vmalloc space. Unused holes are returned |
1626 | * much better than only using 4k mappings while still being NUMA | 1799 | * to the page allocator. Note that these holes end up being actively |
1627 | * friendly. | 1800 | * mapped twice - once to the physical mapping and to the vmalloc area |
1801 | * for the first percpu chunk. Depending on architecture, this might | ||
1802 | * cause problem when changing page attributes of the returned area. | ||
1803 | * These double mapped areas can be detected using | ||
1804 | * pcpu_lpage_remapped(). | ||
1628 | * | 1805 | * |
1629 | * RETURNS: | 1806 | * RETURNS: |
1630 | * The determined pcpu_unit_size which can be used to initialize | 1807 | * The determined pcpu_unit_size which can be used to initialize |
1631 | * percpu access on success, -errno on failure. | 1808 | * percpu access on success, -errno on failure. |
1632 | */ | 1809 | */ |
1633 | ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, | 1810 | ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size, |
1634 | ssize_t dyn_size, size_t lpage_size, | 1811 | size_t dyn_size, size_t unit_size, |
1812 | size_t lpage_size, const int *unit_map, | ||
1813 | int nr_units, | ||
1635 | pcpu_fc_alloc_fn_t alloc_fn, | 1814 | pcpu_fc_alloc_fn_t alloc_fn, |
1636 | pcpu_fc_free_fn_t free_fn, | 1815 | pcpu_fc_free_fn_t free_fn, |
1637 | pcpu_fc_map_fn_t map_fn) | 1816 | pcpu_fc_map_fn_t map_fn) |
1638 | { | 1817 | { |
1639 | size_t size_sum; | 1818 | static struct vm_struct vm; |
1819 | size_t chunk_size = unit_size * nr_units; | ||
1640 | size_t map_size; | 1820 | size_t map_size; |
1641 | unsigned int cpu; | 1821 | unsigned int cpu; |
1642 | int i, j; | ||
1643 | ssize_t ret; | 1822 | ssize_t ret; |
1823 | int i, j, unit; | ||
1644 | 1824 | ||
1645 | /* | 1825 | pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size, |
1646 | * Currently supports only single page. Supporting multiple | 1826 | unit_size, lpage_size, unit_map, nr_units); |
1647 | * pages won't be too difficult if it ever becomes necessary. | ||
1648 | */ | ||
1649 | size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size); | ||
1650 | 1827 | ||
1651 | pcpul_unit_size = lpage_size; | 1828 | BUG_ON(chunk_size % lpage_size); |
1652 | pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); | 1829 | |
1653 | if (pcpul_size > pcpul_unit_size) { | 1830 | pcpul_size = static_size + reserved_size + dyn_size; |
1654 | pr_warning("PERCPU: static data is larger than large page, " | 1831 | pcpul_lpage_size = lpage_size; |
1655 | "can't use large page\n"); | 1832 | pcpul_nr_lpages = chunk_size / lpage_size; |
1656 | return -EINVAL; | ||
1657 | } | ||
1658 | 1833 | ||
1659 | /* allocate pointer array and alloc large pages */ | 1834 | /* allocate pointer array and alloc large pages */ |
1660 | map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0])); | 1835 | map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]); |
1661 | pcpul_map = alloc_bootmem(map_size); | 1836 | pcpul_map = alloc_bootmem(map_size); |
1662 | 1837 | ||
1663 | for_each_possible_cpu(cpu) { | 1838 | /* allocate all pages */ |
1839 | for (i = 0; i < pcpul_nr_lpages; i++) { | ||
1840 | size_t offset = i * lpage_size; | ||
1841 | int first_unit = offset / unit_size; | ||
1842 | int last_unit = (offset + lpage_size - 1) / unit_size; | ||
1664 | void *ptr; | 1843 | void *ptr; |
1665 | 1844 | ||
1845 | /* find out which cpu is mapped to this unit */ | ||
1846 | for (unit = first_unit; unit <= last_unit; unit++) | ||
1847 | if (pcpul_unit_to_cpu(unit, unit_map, &cpu)) | ||
1848 | goto found; | ||
1849 | continue; | ||
1850 | found: | ||
1666 | ptr = alloc_fn(cpu, lpage_size); | 1851 | ptr = alloc_fn(cpu, lpage_size); |
1667 | if (!ptr) { | 1852 | if (!ptr) { |
1668 | pr_warning("PERCPU: failed to allocate large page " | 1853 | pr_warning("PERCPU: failed to allocate large page " |
@@ -1670,53 +1855,79 @@ ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
 			goto enomem;
 		}
 
-		/*
-		 * Only use pcpul_size bytes and give back the rest.
-		 *
-		 * Ingo: The lpage_size up-rounding bootmem is needed
-		 * to make sure the partial lpage is still fully RAM -
-		 * it's not well-specified to have a incompatible area
-		 * (unmapped RAM, device memory, etc.) in that hole.
-		 */
-		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);
-
-		pcpul_map[cpu].cpu = cpu;
-		pcpul_map[cpu].ptr = ptr;
+		pcpul_map[i].ptr = ptr;
+	}
 
-		memcpy(ptr, __per_cpu_load, static_size);
+	/* return unused holes */
+	for (unit = 0; unit < nr_units; unit++) {
+		size_t start = unit * unit_size;
+		size_t end = start + unit_size;
+		size_t off, next;
+
+		/* don't free used part of occupied unit */
+		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
+			start += pcpul_size;
+
+		/* unit can span more than one page, punch the holes */
+		for (off = start; off < end; off = next) {
+			void *ptr = pcpul_map[off / lpage_size].ptr;
+			next = min(roundup(off + 1, lpage_size), end);
+			if (ptr)
+				free_fn(ptr + off % lpage_size, next - off);
+		}
 	}
 
-	/* allocate address and map */
-	pcpul_vm.flags = VM_ALLOC;
-	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
-	vm_area_register_early(&pcpul_vm, pcpul_unit_size);
+	/* allocate address, map and copy */
+	vm.flags = VM_ALLOC;
+	vm.size = chunk_size;
+	vm_area_register_early(&vm, unit_size);
+
+	for (i = 0; i < pcpul_nr_lpages; i++) {
+		if (!pcpul_map[i].ptr)
+			continue;
+		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
+		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
+	}
 
 	for_each_possible_cpu(cpu)
-		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
-		       pcpul_vm.addr + cpu * pcpul_unit_size);
+		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
+		       static_size);
 
 	/* we're ready, commit */
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
-		"%zu bytes\n", pcpul_vm.addr, static_size);
+		"%zu bytes\n", vm.addr, static_size);
 
 	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
-				     pcpul_unit_size, pcpul_vm.addr, NULL);
+				     unit_size, vm.addr, unit_map);
 
-	/* sort pcpul_map array for pcpu_lpage_remapped() */
-	for (i = 0; i < num_possible_cpus() - 1; i++)
-		for (j = i + 1; j < num_possible_cpus(); j++)
-			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
-				struct pcpul_ent tmp = pcpul_map[i];
-				pcpul_map[i] = pcpul_map[j];
-				pcpul_map[j] = tmp;
-			}
+	/*
+	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
+	 * lpages are pushed to the end and trimmed.
+	 */
+	for (i = 0; i < pcpul_nr_lpages - 1; i++)
+		for (j = i + 1; j < pcpul_nr_lpages; j++) {
+			struct pcpul_ent tmp;
+
+			if (!pcpul_map[j].ptr)
+				continue;
+			if (pcpul_map[i].ptr &&
+			    pcpul_map[i].ptr < pcpul_map[j].ptr)
+				continue;
+
+			tmp = pcpul_map[i];
+			pcpul_map[i] = pcpul_map[j];
+			pcpul_map[j] = tmp;
+		}
+
+	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
+		pcpul_nr_lpages--;
 
 	return ret;
 
 enomem:
-	for_each_possible_cpu(cpu)
-		if (pcpul_map[cpu].ptr)
-			free_fn(pcpul_map[cpu].ptr, pcpul_size);
+	for (i = 0; i < pcpul_nr_lpages; i++)
+		if (pcpul_map[i].ptr)
+			free_fn(pcpul_map[i].ptr, lpage_size);
 	free_bootmem(__pa(pcpul_map), map_size);
 	return -ENOMEM;
 }
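
The hole-punching loop has to cope with units that straddle large pages, which is why it advances in lpage-aligned steps rather than freeing a unit's tail in one call. A standalone sketch of just that walk, with invented sizes chosen so one unit spans two large pages:

#include <stdio.h>

static unsigned long roundup_ul(unsigned long x, unsigned long a)
{
	return (x + a - 1) / a * a;
}

int main(void)
{
	unsigned long lpage_size = 2UL << 20;	/* 2MB large pages */
	unsigned long unit_size = 4UL << 20;	/* 4MB units: 2 lpages each */
	unsigned long used = 300UL << 10;	/* pcpul_size analogue */
	int unit = 1;				/* an occupied unit */
	unsigned long start = unit * unit_size + used;
	unsigned long end = (unit + 1) * unit_size;
	unsigned long off, next;

	for (off = start; off < end; off = next) {
		next = roundup_ul(off + 1, lpage_size);
		if (next > end)
			next = end;
		printf("free: lpage %lu, offset %lu, len %lu\n",
		       off / lpage_size, off % lpage_size, next - off);
	}
	return 0;
}

Here the 4MB unit's unused tail comes back as two segments: the rest of its first 2MB page, then the whole of its second.
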
@@ -1739,10 +1950,10 @@ enomem:
  */
 void *pcpu_lpage_remapped(void *kaddr)
 {
-	unsigned long unit_mask = pcpul_unit_size - 1;
-	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
-	unsigned long offset = (unsigned long)kaddr & unit_mask;
-	int left = 0, right = num_possible_cpus() - 1;
+	unsigned long lpage_mask = pcpul_lpage_size - 1;
+	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
+	unsigned long offset = (unsigned long)kaddr & lpage_mask;
+	int left = 0, right = pcpul_nr_lpages - 1;
 	int pos;
 
 	/* pcpul in use at all? */
@@ -1757,13 +1968,8 @@ void *pcpu_lpage_remapped(void *kaddr)
 			left = pos + 1;
 		else if (pcpul_map[pos].ptr > lpage_addr)
 			right = pos - 1;
-		else {
-			/* it shouldn't be in the area for the first chunk */
-			WARN_ON(offset < pcpul_size);
-
-			return pcpul_vm.addr +
-			       pcpul_map[pos].cpu * pcpul_unit_size + offset;
-		}
+		else
+			return pcpul_map[pos].map_addr + offset;
 	}
 
 	return NULL;
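
With map_addr stored per entry, pcpu_lpage_remapped() reduces to a binary search over the lpage-sorted array plus an offset add. A self-contained userspace model of the lookup, with invented addresses and a 2MB page mask:

#include <stdio.h>

struct ent { void *ptr; void *map_addr; };

static void *lookup(struct ent *map, int n, unsigned long lpage_mask,
		    void *kaddr)
{
	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
	unsigned long offset = (unsigned long)kaddr & lpage_mask;
	int left = 0, right = n - 1, pos;

	while (left <= right) {
		pos = (left + right) / 2;
		if (map[pos].ptr < lpage_addr)
			left = pos + 1;
		else if (map[pos].ptr > lpage_addr)
			right = pos - 1;
		else
			return (char *)map[pos].map_addr + offset;
	}
	return NULL;
}

int main(void)
{
	/* two lpages sorted by their physical-side address */
	struct ent map[] = {
		{ (void *)0x40000000, (void *)0xc0000000 },
		{ (void *)0x40200000, (void *)0xc0200000 },
	};

	printf("%p\n", lookup(map, 2, 0x1fffff, (void *)0x40012345));
	return 0;
}

lookup() prints 0xc0012345 for kaddr 0x40012345: masking finds the containing lpage, and the low bits carry over unchanged.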