aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorTejun Heo <tj@kernel.org>2009-08-14 02:00:52 -0400
committerTejun Heo <tj@kernel.org>2009-08-14 02:00:52 -0400
commitca23e405e06d5fffb005df004c72781f76062f51 (patch)
tree92f708eb81325c70c3e2827e68eb413c0c14d355
parentcf88c79006bd6a09ad725ba0b34c0e23db20b19e (diff)
vmalloc: implement pcpu_get_vm_areas()
To directly use spread NUMA memories for percpu units, percpu allocator will be updated to allow sparsely mapping units in a chunk. As the distances between units can be very large, this makes allocating single vmap area for each chunk undesirable. This patch implements pcpu_get_vm_areas() and pcpu_free_vm_areas() which allocates and frees sparse congruent vmap areas. pcpu_get_vm_areas() take @offsets and @sizes array which define distances and sizes of vmap areas. It scans down from the top of vmalloc area looking for the top-most address which can accomodate all the areas. The top-down scan is to avoid interacting with regular vmallocs which can push up these congruent areas up little by little ending up wasting address space and page table. To speed up top-down scan, the highest possible address hint is maintained. Although the scan is linear from the hint, given the usual large holes between memory addresses between NUMA nodes, the scanning is highly likely to finish after finding the first hole for the last unit which is scanned first. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Nick Piggin <npiggin@suse.de>
-rw-r--r--include/linux/vmalloc.h6
-rw-r--r--mm/vmalloc.c293
2 files changed, 299 insertions, 0 deletions
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index a43ebec3a7b9..227c2a585e4f 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -115,4 +115,10 @@ extern rwlock_t vmlist_lock;
115extern struct vm_struct *vmlist; 115extern struct vm_struct *vmlist;
116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); 116extern __init void vm_area_register_early(struct vm_struct *vm, size_t align);
117 117
118struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
119 const size_t *sizes, int nr_vms,
120 size_t align, gfp_t gfp_mask);
121
122void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms);
123
118#endif /* _LINUX_VMALLOC_H */ 124#endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 2eb461c3a46e..204b8243d8ab 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -265,6 +265,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 265static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 266static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 267static LIST_HEAD(vmap_area_list);
268static unsigned long vmap_area_pcpu_hole;
268 269
269static struct vmap_area *__find_vmap_area(unsigned long addr) 270static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 271{
@@ -431,6 +432,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 432 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 433 list_del_rcu(&va->list);
433 434
435 /*
436 * Track the highest possible candidate for pcpu area
437 * allocation. Areas outside of vmalloc area can be returned
438 * here too, consider only end addresses which fall inside
439 * vmalloc area proper.
440 */
441 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
442 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
443
434 call_rcu(&va->rcu_head, rcu_free_va); 444 call_rcu(&va->rcu_head, rcu_free_va);
435} 445}
436 446
@@ -1038,6 +1048,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1048 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1049 __insert_vmap_area(va);
1040 } 1050 }
1051
1052 vmap_area_pcpu_hole = VMALLOC_END;
1053
1041 vmap_initialized = true; 1054 vmap_initialized = true;
1042} 1055}
1043 1056
@@ -1821,6 +1834,286 @@ void free_vm_area(struct vm_struct *area)
1821} 1834}
1822EXPORT_SYMBOL_GPL(free_vm_area); 1835EXPORT_SYMBOL_GPL(free_vm_area);
1823 1836
1837static struct vmap_area *node_to_va(struct rb_node *n)
1838{
1839 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1840}
1841
1842/**
1843 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
1844 * @end: target address
1845 * @pnext: out arg for the next vmap_area
1846 * @pprev: out arg for the previous vmap_area
1847 *
1848 * Returns: %true if either or both of next and prev are found,
1849 * %false if no vmap_area exists
1850 *
1851 * Find vmap_areas end addresses of which enclose @end. ie. if not
1852 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
1853 */
1854static bool pvm_find_next_prev(unsigned long end,
1855 struct vmap_area **pnext,
1856 struct vmap_area **pprev)
1857{
1858 struct rb_node *n = vmap_area_root.rb_node;
1859 struct vmap_area *va = NULL;
1860
1861 while (n) {
1862 va = rb_entry(n, struct vmap_area, rb_node);
1863 if (end < va->va_end)
1864 n = n->rb_left;
1865 else if (end > va->va_end)
1866 n = n->rb_right;
1867 else
1868 break;
1869 }
1870
1871 if (!va)
1872 return false;
1873
1874 if (va->va_end > end) {
1875 *pnext = va;
1876 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1877 } else {
1878 *pprev = va;
1879 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
1880 }
1881 return true;
1882}
1883
1884/**
1885 * pvm_determine_end - find the highest aligned address between two vmap_areas
1886 * @pnext: in/out arg for the next vmap_area
1887 * @pprev: in/out arg for the previous vmap_area
1888 * @align: alignment
1889 *
1890 * Returns: determined end address
1891 *
1892 * Find the highest aligned address between *@pnext and *@pprev below
1893 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
1894 * down address is between the end addresses of the two vmap_areas.
1895 *
1896 * Please note that the address returned by this function may fall
1897 * inside *@pnext vmap_area. The caller is responsible for checking
1898 * that.
1899 */
1900static unsigned long pvm_determine_end(struct vmap_area **pnext,
1901 struct vmap_area **pprev,
1902 unsigned long align)
1903{
1904 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1905 unsigned long addr;
1906
1907 if (*pnext)
1908 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
1909 else
1910 addr = vmalloc_end;
1911
1912 while (*pprev && (*pprev)->va_end > addr) {
1913 *pnext = *pprev;
1914 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
1915 }
1916
1917 return addr;
1918}
1919
1920/**
1921 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
1922 * @offsets: array containing offset of each area
1923 * @sizes: array containing size of each area
1924 * @nr_vms: the number of areas to allocate
1925 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
1926 * @gfp_mask: allocation mask
1927 *
1928 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
1929 * vm_structs on success, %NULL on failure
1930 *
1931 * Percpu allocator wants to use congruent vm areas so that it can
1932 * maintain the offsets among percpu areas. This function allocates
1933 * congruent vmalloc areas for it. These areas tend to be scattered
1934 * pretty far, distance between two areas easily going up to
1935 * gigabytes. To avoid interacting with regular vmallocs, these areas
1936 * are allocated from top.
1937 *
1938 * Despite its complicated look, this allocator is rather simple. It
1939 * does everything top-down and scans areas from the end looking for
1940 * matching slot. While scanning, if any of the areas overlaps with
1941 * existing vmap_area, the base address is pulled down to fit the
1942 * area. Scanning is repeated till all the areas fit and then all
1943 * necessary data structres are inserted and the result is returned.
1944 */
1945struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
1946 const size_t *sizes, int nr_vms,
1947 size_t align, gfp_t gfp_mask)
1948{
1949 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
1950 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
1951 struct vmap_area **vas, *prev, *next;
1952 struct vm_struct **vms;
1953 int area, area2, last_area, term_area;
1954 unsigned long base, start, end, last_end;
1955 bool purged = false;
1956
1957 gfp_mask &= GFP_RECLAIM_MASK;
1958
1959 /* verify parameters and allocate data structures */
1960 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
1961 for (last_area = 0, area = 0; area < nr_vms; area++) {
1962 start = offsets[area];
1963 end = start + sizes[area];
1964
1965 /* is everything aligned properly? */
1966 BUG_ON(!IS_ALIGNED(offsets[area], align));
1967 BUG_ON(!IS_ALIGNED(sizes[area], align));
1968
1969 /* detect the area with the highest address */
1970 if (start > offsets[last_area])
1971 last_area = area;
1972
1973 for (area2 = 0; area2 < nr_vms; area2++) {
1974 unsigned long start2 = offsets[area2];
1975 unsigned long end2 = start2 + sizes[area2];
1976
1977 if (area2 == area)
1978 continue;
1979
1980 BUG_ON(start2 >= start && start2 < end);
1981 BUG_ON(end2 <= end && end2 > start);
1982 }
1983 }
1984 last_end = offsets[last_area] + sizes[last_area];
1985
1986 if (vmalloc_end - vmalloc_start < last_end) {
1987 WARN_ON(true);
1988 return NULL;
1989 }
1990
1991 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
1992 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
1993 if (!vas || !vms)
1994 goto err_free;
1995
1996 for (area = 0; area < nr_vms; area++) {
1997 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
1998 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
1999 if (!vas[area] || !vms[area])
2000 goto err_free;
2001 }
2002retry:
2003 spin_lock(&vmap_area_lock);
2004
2005 /* start scanning - we scan from the top, begin with the last area */
2006 area = term_area = last_area;
2007 start = offsets[area];
2008 end = start + sizes[area];
2009
2010 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2011 base = vmalloc_end - last_end;
2012 goto found;
2013 }
2014 base = pvm_determine_end(&next, &prev, align) - end;
2015
2016 while (true) {
2017 BUG_ON(next && next->va_end <= base + end);
2018 BUG_ON(prev && prev->va_end > base + end);
2019
2020 /*
2021 * base might have underflowed, add last_end before
2022 * comparing.
2023 */
2024 if (base + last_end < vmalloc_start + last_end) {
2025 spin_unlock(&vmap_area_lock);
2026 if (!purged) {
2027 purge_vmap_area_lazy();
2028 purged = true;
2029 goto retry;
2030 }
2031 goto err_free;
2032 }
2033
2034 /*
2035 * If next overlaps, move base downwards so that it's
2036 * right below next and then recheck.
2037 */
2038 if (next && next->va_start < base + end) {
2039 base = pvm_determine_end(&next, &prev, align) - end;
2040 term_area = area;
2041 continue;
2042 }
2043
2044 /*
2045 * If prev overlaps, shift down next and prev and move
2046 * base so that it's right below new next and then
2047 * recheck.
2048 */
2049 if (prev && prev->va_end > base + start) {
2050 next = prev;
2051 prev = node_to_va(rb_prev(&next->rb_node));
2052 base = pvm_determine_end(&next, &prev, align) - end;
2053 term_area = area;
2054 continue;
2055 }
2056
2057 /*
2058 * This area fits, move on to the previous one. If
2059 * the previous one is the terminal one, we're done.
2060 */
2061 area = (area + nr_vms - 1) % nr_vms;
2062 if (area == term_area)
2063 break;
2064 start = offsets[area];
2065 end = start + sizes[area];
2066 pvm_find_next_prev(base + end, &next, &prev);
2067 }
2068found:
2069 /* we've found a fitting base, insert all va's */
2070 for (area = 0; area < nr_vms; area++) {
2071 struct vmap_area *va = vas[area];
2072
2073 va->va_start = base + offsets[area];
2074 va->va_end = va->va_start + sizes[area];
2075 __insert_vmap_area(va);
2076 }
2077
2078 vmap_area_pcpu_hole = base + offsets[last_area];
2079
2080 spin_unlock(&vmap_area_lock);
2081
2082 /* insert all vm's */
2083 for (area = 0; area < nr_vms; area++)
2084 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2085 pcpu_get_vm_areas);
2086
2087 kfree(vas);
2088 return vms;
2089
2090err_free:
2091 for (area = 0; area < nr_vms; area++) {
2092 if (vas)
2093 kfree(vas[area]);
2094 if (vms)
2095 kfree(vms[area]);
2096 }
2097 kfree(vas);
2098 kfree(vms);
2099 return NULL;
2100}
2101
2102/**
2103 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2104 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2105 * @nr_vms: the number of allocated areas
2106 *
2107 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2108 */
2109void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2110{
2111 int i;
2112
2113 for (i = 0; i < nr_vms; i++)
2114 free_vm_area(vms[i]);
2115 kfree(vms);
2116}
1824 2117
1825#ifdef CONFIG_PROC_FS 2118#ifdef CONFIG_PROC_FS
1826static void *s_start(struct seq_file *m, loff_t *pos) 2119static void *s_start(struct seq_file *m, loff_t *pos)