author     Nick Piggin <npiggin@suse.de>                    2008-10-18 23:27:03 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2008-10-20 11:52:32 -0400
commit     db64fe02258f1507e13fe5212a989922323685ce
tree       ed24c8d4e3b0bcadfce3a8c3916a6b35d6c891cb
parent     cb8f488c33539f096580e202f5438a809195008f
mm: rewrite vmap layer
Rewrite the vmap allocator to use rbtrees and lazy tlb flushing, and provide a fast, scalable percpu frontend for small vmaps (requires a slightly different API, though).

The biggest problem with vmap is actually vunmap.  Presently this requires a global kernel TLB flush, which on most architectures is a broadcast IPI to all CPUs to flush the cache.  This is all done under a global lock.  As the number of CPUs increases, so will the number of vunmaps a scaled workload will want to perform, and so will the cost of a global TLB flush.  This gives terrible quadratic scalability characteristics.

Another problem is that the entire vmap subsystem works under a single lock.  It is a rwlock, but it is actually taken for write in all the fast paths, and the read locking would likely never be run concurrently anyway, so it's just pointless.

This is a rewrite of the vmap subsystem to solve those problems.  The existing vmalloc API is implemented on top of the rewritten subsystem.

The TLB flushing problem is solved by using lazy TLB unmapping.  vmap addresses do not have to be flushed immediately when they are vunmapped, because the kernel will not reuse them again (that would be a use-after-free) until they are reallocated.  So the addresses aren't allocated again until a subsequent TLB flush.  A single TLB flush can then flush multiple vunmaps from each CPU.

XEN and PAT and such do not like deferred TLB flushing because they can't always handle multiple aliasing virtual addresses to a physical address.  They now call vm_unmap_aliases() in order to flush any deferred mappings.  That call is very expensive (well, actually not a lot more expensive than a single vunmap under the old scheme), however it should be OK if not called too often.

The virtual memory extent information is stored in an rbtree rather than a linked list to improve the algorithmic scalability.

There is a per-CPU allocator for small vmaps, which amortizes or avoids global locking.

To use the per-CPU interface, the vm_map_ram / vm_unmap_ram interfaces must be used in place of vmap and vunmap.  Vmalloc does not use these interfaces at the moment, so it will not be quite so scalable (although it will use lazy TLB flushing).

As a quick test of performance, I ran a test that loops in the kernel, linearly mapping then touching then unmapping 4 pages.  Different numbers of tests were run in parallel on a 4 core, 2 socket Opteron.  Results are in nanoseconds per map+touch+unmap.

threads    vanilla    vmap rewrite
      1      14700            2900
      2      33600            3000
      4      49500            2800
      8      70631            2900

So with 8 cores, the rewritten version is already 25x faster.

In a slightly more realistic test (although with an older and less scalable version of the patch), I ripped the not-very-good vunmap batching code out of XFS, and implemented the large buffer mapping with vm_map_ram and vm_unmap_ram... along with a couple of other tricks, I was able to speed up a large directory workload by 20x on a 64 CPU system.  I believe vmap/vunmap is actually sped up a lot more than 20x on such a system, but I'm running into other locks now.  vmap is pretty well blown off the profiles.
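
A minimal sketch of what a caller of the new per-CPU interface looks like, mirroring the map/touch/unmap loop used in the benchmark above (the helper name and the origin of the page array are illustrative assumptions, not part of this patch):

#include <linux/mm.h>
#include <linux/string.h>
#include <linux/topology.h>
#include <linux/vmalloc.h>

/* Hypothetical helper: map a small batch of pages through the new
 * per-CPU frontend, touch the mapping, then unmap it lazily. */
static void map_touch_unmap(struct page **pages, unsigned int count)
{
	void *addr;

	/* Requests of up to VMAP_MAX_ALLOC pages go through the per-CPU
	 * block allocator; larger ones fall back to the global rbtree
	 * allocator. */
	addr = vm_map_ram(pages, count, numa_node_id(), PAGE_KERNEL);
	if (!addr)
		return;

	memset(addr, 0, count * PAGE_SIZE);	/* touch every mapped page */

	/* The unmap is lazy: the address range is queued and the global
	 * kernel TLB flush is deferred and batched with other vunmaps. */
	vm_unmap_ram(addr, count);
}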
Before:
 1352059 total                          0.1401
  798784 _write_lock                 8320.6667   <- vmlist_lock
  529313 default_idle                1181.5022
   15242 smp_call_function             15.8771   <- vmap tlb flushing
    2472 __get_vm_area_node             1.9312   <- vmap
    1762 remove_vm_area                 4.5885   <- vunmap
     316 map_vm_area                    0.2297   <- vmap
     312 kfree                          0.1950
     300 _spin_lock                     3.1250
     252 sn_send_IPI_phys               0.4375   <- tlb flushing
     238 vmap                           0.8264   <- vmap
     216 find_lock_page                 0.5192
     196 find_next_bit                  0.3603
     136 sn2_send_IPI                   0.2024
     130 pio_phys_write_mmr             2.0312
     118 unmap_kernel_range             0.1229

After:
   78406 total                          0.0081
   40053 default_idle                  89.4040
   33576 ia64_spinlock_contention     349.7500
    1650 _spin_lock                    17.1875
     319 __reg_op                       0.5538
     281 _atomic_dec_and_lock           1.0977
     153 mutex_unlock                   1.5938
     123 iget_locked                    0.1671
     117 xfs_dir_lookup                 0.1662
     117 dput                           0.1406
     114 xfs_iget_core                  0.0268
      92 xfs_da_hashname                0.1917
      75 d_alloc                        0.0670
      68 vmap_page_range                0.0462   <- vmap
      58 kmem_cache_alloc               0.0604
      57 memset                         0.0540
      52 rb_next                        0.1625
      50 __copy_user                    0.0208
      49 bitmap_find_free_region        0.2188   <- vmap
      46 ia64_sn_udelay                 0.1106
      45 find_inode_fast                0.1406
      42 memcmp                         0.2188
      42 finish_task_switch             0.1094
      42 __d_lookup                     0.0410
      40 radix_tree_lookup_slot         0.1250
      37 _spin_unlock_irqrestore        0.3854
      36 xfs_bmapi                      0.0050
      36 kmem_cache_free                0.0256
      35 xfs_vn_getattr                 0.0322
      34 radix_tree_lookup              0.1062
      33 __link_path_walk               0.0035
      31 xfs_da_do_buf                  0.0091
      30 _xfs_buf_find                  0.0204
      28 find_get_page                  0.0875
      27 xfs_iread                      0.0241
      27 __strncpy_from_user            0.2812
      26 _xfs_buf_initialize            0.0406
      24 _xfs_buf_lookup_pages          0.0179
      24 vunmap_page_range              0.0250   <- vunmap
      23 find_lock_page                 0.0799
      22 vm_map_ram                     0.0087   <- vmap
      20 kfree                          0.0125
      19 put_page                       0.0330
      18 __kmalloc                      0.0176
      17 xfs_da_node_lookup_int         0.0086
      17 _read_lock                     0.0885
      17 page_waitqueue                 0.0664

vmap has gone from being the top 5 on the profiles and flushing the crap out of all TLBs, to using less than 1% of kernel time.

[akpm@linux-foundation.org: cleanups, section fix]
[akpm@linux-foundation.org: fix build on alpha]
Signed-off-by: Nick Piggin <npiggin@suse.de>
Cc: Jeremy Fitzhardinge <jeremy@goop.org>
Cc: Krzysztof Helt <krzysztof.h1@poczta.fm>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--  arch/x86/mm/pageattr.c    |   2
-rw-r--r--  arch/x86/xen/enlighten.c  |   1
-rw-r--r--  arch/x86/xen/mmu.c        |   1
-rw-r--r--  include/linux/vmalloc.h   |  15
-rw-r--r--  init/main.c               |   2
-rw-r--r--  mm/vmalloc.c              | 975
6 files changed, 862 insertions(+), 134 deletions(-)
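
The three architecture hunks below all add the same two-call pattern at sites that must not be left with stale kernel virtual aliases of the pages they are about to manipulate. Condensed, the pattern is (comments are editorial, only the calls themselves come from the patch):

	kmap_flush_unused();	/* drop unused highmem kmap mappings */
	vm_unmap_aliases();	/* unmap lazily-freed vmap aliases and flush the TLBs */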
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a9ec89c3fbca..407d8784f669 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -792,6 +792,8 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
792 /* Must avoid aliasing mappings in the highmem code */ 792 /* Must avoid aliasing mappings in the highmem code */
793 kmap_flush_unused(); 793 kmap_flush_unused();
794 794
795 vm_unmap_aliases();
796
795 cpa.vaddr = addr; 797 cpa.vaddr = addr;
796 cpa.numpages = numpages; 798 cpa.numpages = numpages;
797 cpa.mask_set = mask_set; 799 cpa.mask_set = mask_set;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0013a729b41d..b61534c7a4c4 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -871,6 +871,7 @@ static void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, unsigned l
871 /* make sure there are no stray mappings of 871 /* make sure there are no stray mappings of
872 this page */ 872 this page */
873 kmap_flush_unused(); 873 kmap_flush_unused();
874 vm_unmap_aliases();
874 } 875 }
875} 876}
876 877
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index ae173f6edd8b..d4d52f5a1cf7 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -846,6 +846,7 @@ static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd)
846 /* re-enable interrupts for kmap_flush_unused */ 846 /* re-enable interrupts for kmap_flush_unused */
847 xen_mc_issue(0); 847 xen_mc_issue(0);
848 kmap_flush_unused(); 848 kmap_flush_unused();
849 vm_unmap_aliases();
849 xen_mc_batch(); 850 xen_mc_batch();
850 } 851 }
851 852
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 328eb4022727..4c28c4d564e2 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -2,6 +2,7 @@
2#define _LINUX_VMALLOC_H 2#define _LINUX_VMALLOC_H
3 3
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/init.h>
5#include <asm/page.h> /* pgprot_t */ 6#include <asm/page.h> /* pgprot_t */
6 7
7struct vm_area_struct; /* vma defining user mapping in mm_types.h */ 8struct vm_area_struct; /* vma defining user mapping in mm_types.h */
@@ -23,7 +24,6 @@ struct vm_area_struct; /* vma defining user mapping in mm_types.h */
23#endif 24#endif
24 25
25struct vm_struct { 26struct vm_struct {
26 /* keep next,addr,size together to speedup lookups */
27 struct vm_struct *next; 27 struct vm_struct *next;
28 void *addr; 28 void *addr;
29 unsigned long size; 29 unsigned long size;
@@ -37,6 +37,19 @@ struct vm_struct {
37/* 37/*
38 * Highlevel APIs for driver use 38 * Highlevel APIs for driver use
39 */ 39 */
40extern void vm_unmap_ram(const void *mem, unsigned int count);
41extern void *vm_map_ram(struct page **pages, unsigned int count,
42 int node, pgprot_t prot);
43extern void vm_unmap_aliases(void);
44
45#ifdef CONFIG_MMU
46extern void __init vmalloc_init(void);
47#else
48static inline void vmalloc_init(void)
49{
50}
51#endif
52
40extern void *vmalloc(unsigned long size); 53extern void *vmalloc(unsigned long size);
41extern void *vmalloc_user(unsigned long size); 54extern void *vmalloc_user(unsigned long size);
42extern void *vmalloc_node(unsigned long size, int node); 55extern void *vmalloc_node(unsigned long size, int node);
diff --git a/init/main.c b/init/main.c
index 27f6bf6108e9..4371d11721f6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -27,6 +27,7 @@
27#include <linux/gfp.h> 27#include <linux/gfp.h>
28#include <linux/percpu.h> 28#include <linux/percpu.h>
29#include <linux/kmod.h> 29#include <linux/kmod.h>
30#include <linux/vmalloc.h>
30#include <linux/kernel_stat.h> 31#include <linux/kernel_stat.h>
31#include <linux/start_kernel.h> 32#include <linux/start_kernel.h>
32#include <linux/security.h> 33#include <linux/security.h>
@@ -642,6 +643,7 @@ asmlinkage void __init start_kernel(void)
642 initrd_start = 0; 643 initrd_start = 0;
643 } 644 }
644#endif 645#endif
646 vmalloc_init();
645 vfs_caches_init_early(); 647 vfs_caches_init_early();
646 cpuset_init_early(); 648 cpuset_init_early();
647 mem_init(); 649 mem_init();
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bba06c41fc59..712ae47af0bf 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,6 +8,7 @@
8 * Numa awareness, Christoph Lameter, SGI, June 2005 8 * Numa awareness, Christoph Lameter, SGI, June 2005
9 */ 9 */
10 10
11#include <linux/vmalloc.h>
11#include <linux/mm.h> 12#include <linux/mm.h>
12#include <linux/module.h> 13#include <linux/module.h>
13#include <linux/highmem.h> 14#include <linux/highmem.h>
@@ -18,16 +19,17 @@
18#include <linux/debugobjects.h> 19#include <linux/debugobjects.h>
19#include <linux/vmalloc.h> 20#include <linux/vmalloc.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
22#include <linux/list.h>
23#include <linux/rbtree.h>
24#include <linux/radix-tree.h>
25#include <linux/rcupdate.h>
21 26
27#include <asm/atomic.h>
22#include <asm/uaccess.h> 28#include <asm/uaccess.h>
23#include <asm/tlbflush.h> 29#include <asm/tlbflush.h>
24 30
25 31
26DEFINE_RWLOCK(vmlist_lock); 32/*** Page table manipulation functions ***/
27struct vm_struct *vmlist;
28
29static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
30 int node, void *caller);
31 33
32static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) 34static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
33{ 35{
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
40 } while (pte++, addr += PAGE_SIZE, addr != end); 42 } while (pte++, addr += PAGE_SIZE, addr != end);
41} 43}
42 44
43static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, 45static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
44 unsigned long end)
45{ 46{
46 pmd_t *pmd; 47 pmd_t *pmd;
47 unsigned long next; 48 unsigned long next;
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr,
55 } while (pmd++, addr = next, addr != end); 56 } while (pmd++, addr = next, addr != end);
56} 57}
57 58
58static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, 59static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end)
59 unsigned long end)
60{ 60{
61 pud_t *pud; 61 pud_t *pud;
62 unsigned long next; 62 unsigned long next;
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr,
70 } while (pud++, addr = next, addr != end); 70 } while (pud++, addr = next, addr != end);
71} 71}
72 72
73void unmap_kernel_range(unsigned long addr, unsigned long size) 73static void vunmap_page_range(unsigned long addr, unsigned long end)
74{ 74{
75 pgd_t *pgd; 75 pgd_t *pgd;
76 unsigned long next; 76 unsigned long next;
77 unsigned long start = addr;
78 unsigned long end = addr + size;
79 77
80 BUG_ON(addr >= end); 78 BUG_ON(addr >= end);
81 pgd = pgd_offset_k(addr); 79 pgd = pgd_offset_k(addr);
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
86 continue; 84 continue;
87 vunmap_pud_range(pgd, addr, next); 85 vunmap_pud_range(pgd, addr, next);
88 } while (pgd++, addr = next, addr != end); 86 } while (pgd++, addr = next, addr != end);
89 flush_tlb_kernel_range(start, end);
90}
91
92static void unmap_vm_area(struct vm_struct *area)
93{
94 unmap_kernel_range((unsigned long)area->addr, area->size);
95} 87}
96 88
97static int vmap_pte_range(pmd_t *pmd, unsigned long addr, 89static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
98 unsigned long end, pgprot_t prot, struct page ***pages) 90 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
99{ 91{
100 pte_t *pte; 92 pte_t *pte;
101 93
94 /*
95 * nr is a running index into the array which helps higher level
96 * callers keep track of where we're up to.
97 */
98
102 pte = pte_alloc_kernel(pmd, addr); 99 pte = pte_alloc_kernel(pmd, addr);
103 if (!pte) 100 if (!pte)
104 return -ENOMEM; 101 return -ENOMEM;
105 do { 102 do {
106 struct page *page = **pages; 103 struct page *page = pages[*nr];
107 WARN_ON(!pte_none(*pte)); 104
108 if (!page) 105 if (WARN_ON(!pte_none(*pte)))
106 return -EBUSY;
107 if (WARN_ON(!page))
109 return -ENOMEM; 108 return -ENOMEM;
110 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); 109 set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
111 (*pages)++; 110 (*nr)++;
112 } while (pte++, addr += PAGE_SIZE, addr != end); 111 } while (pte++, addr += PAGE_SIZE, addr != end);
113 return 0; 112 return 0;
114} 113}
115 114
116static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, 115static int vmap_pmd_range(pud_t *pud, unsigned long addr,
117 unsigned long end, pgprot_t prot, struct page ***pages) 116 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
118{ 117{
119 pmd_t *pmd; 118 pmd_t *pmd;
120 unsigned long next; 119 unsigned long next;
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr,
124 return -ENOMEM; 123 return -ENOMEM;
125 do { 124 do {
126 next = pmd_addr_end(addr, end); 125 next = pmd_addr_end(addr, end);
127 if (vmap_pte_range(pmd, addr, next, prot, pages)) 126 if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
128 return -ENOMEM; 127 return -ENOMEM;
129 } while (pmd++, addr = next, addr != end); 128 } while (pmd++, addr = next, addr != end);
130 return 0; 129 return 0;
131} 130}
132 131
133static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, 132static int vmap_pud_range(pgd_t *pgd, unsigned long addr,
134 unsigned long end, pgprot_t prot, struct page ***pages) 133 unsigned long end, pgprot_t prot, struct page **pages, int *nr)
135{ 134{
136 pud_t *pud; 135 pud_t *pud;
137 unsigned long next; 136 unsigned long next;
@@ -141,44 +140,49 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr,
141 return -ENOMEM; 140 return -ENOMEM;
142 do { 141 do {
143 next = pud_addr_end(addr, end); 142 next = pud_addr_end(addr, end);
144 if (vmap_pmd_range(pud, addr, next, prot, pages)) 143 if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
145 return -ENOMEM; 144 return -ENOMEM;
146 } while (pud++, addr = next, addr != end); 145 } while (pud++, addr = next, addr != end);
147 return 0; 146 return 0;
148} 147}
149 148
150int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) 149/*
150 * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
151 * will have pfns corresponding to the "pages" array.
152 *
153 * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
154 */
155static int vmap_page_range(unsigned long addr, unsigned long end,
156 pgprot_t prot, struct page **pages)
151{ 157{
152 pgd_t *pgd; 158 pgd_t *pgd;
153 unsigned long next; 159 unsigned long next;
154 unsigned long addr = (unsigned long) area->addr; 160 int err = 0;
155 unsigned long end = addr + area->size - PAGE_SIZE; 161 int nr = 0;
156 int err;
157 162
158 BUG_ON(addr >= end); 163 BUG_ON(addr >= end);
159 pgd = pgd_offset_k(addr); 164 pgd = pgd_offset_k(addr);
160 do { 165 do {
161 next = pgd_addr_end(addr, end); 166 next = pgd_addr_end(addr, end);
162 err = vmap_pud_range(pgd, addr, next, prot, pages); 167 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
163 if (err) 168 if (err)
164 break; 169 break;
165 } while (pgd++, addr = next, addr != end); 170 } while (pgd++, addr = next, addr != end);
166 flush_cache_vmap((unsigned long) area->addr, end); 171 flush_cache_vmap(addr, end);
167 return err; 172
173 if (unlikely(err))
174 return err;
175 return nr;
168} 176}
169EXPORT_SYMBOL_GPL(map_vm_area);
170 177
171/* 178/*
172 * Map a vmalloc()-space virtual address to the physical page. 179 * Walk a vmap address to the struct page it maps.
173 */ 180 */
174struct page *vmalloc_to_page(const void *vmalloc_addr) 181struct page *vmalloc_to_page(const void *vmalloc_addr)
175{ 182{
176 unsigned long addr = (unsigned long) vmalloc_addr; 183 unsigned long addr = (unsigned long) vmalloc_addr;
177 struct page *page = NULL; 184 struct page *page = NULL;
178 pgd_t *pgd = pgd_offset_k(addr); 185 pgd_t *pgd = pgd_offset_k(addr);
179 pud_t *pud;
180 pmd_t *pmd;
181 pte_t *ptep, pte;
182 186
183 /* 187 /*
184 * XXX we might need to change this if we add VIRTUAL_BUG_ON for 188 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
@@ -188,10 +192,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr)
188 !is_module_address(addr)); 192 !is_module_address(addr));
189 193
190 if (!pgd_none(*pgd)) { 194 if (!pgd_none(*pgd)) {
191 pud = pud_offset(pgd, addr); 195 pud_t *pud = pud_offset(pgd, addr);
192 if (!pud_none(*pud)) { 196 if (!pud_none(*pud)) {
193 pmd = pmd_offset(pud, addr); 197 pmd_t *pmd = pmd_offset(pud, addr);
194 if (!pmd_none(*pmd)) { 198 if (!pmd_none(*pmd)) {
199 pte_t *ptep, pte;
200
195 ptep = pte_offset_map(pmd, addr); 201 ptep = pte_offset_map(pmd, addr);
196 pte = *ptep; 202 pte = *ptep;
197 if (pte_present(pte)) 203 if (pte_present(pte))
@@ -213,13 +219,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
213} 219}
214EXPORT_SYMBOL(vmalloc_to_pfn); 220EXPORT_SYMBOL(vmalloc_to_pfn);
215 221
216static struct vm_struct * 222
217__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, 223/*** Global kva allocator ***/
218 unsigned long end, int node, gfp_t gfp_mask, void *caller) 224
225#define VM_LAZY_FREE 0x01
226#define VM_LAZY_FREEING 0x02
227#define VM_VM_AREA 0x04
228
229struct vmap_area {
230 unsigned long va_start;
231 unsigned long va_end;
232 unsigned long flags;
233 struct rb_node rb_node; /* address sorted rbtree */
234 struct list_head list; /* address sorted list */
235 struct list_head purge_list; /* "lazy purge" list */
236 void *private;
237 struct rcu_head rcu_head;
238};
239
240static DEFINE_SPINLOCK(vmap_area_lock);
241static struct rb_root vmap_area_root = RB_ROOT;
242static LIST_HEAD(vmap_area_list);
243
244static struct vmap_area *__find_vmap_area(unsigned long addr)
219{ 245{
220 struct vm_struct **p, *tmp, *area; 246 struct rb_node *n = vmap_area_root.rb_node;
221 unsigned long align = 1; 247
248 while (n) {
249 struct vmap_area *va;
250
251 va = rb_entry(n, struct vmap_area, rb_node);
252 if (addr < va->va_start)
253 n = n->rb_left;
254 else if (addr > va->va_start)
255 n = n->rb_right;
256 else
257 return va;
258 }
259
260 return NULL;
261}
262
263static void __insert_vmap_area(struct vmap_area *va)
264{
265 struct rb_node **p = &vmap_area_root.rb_node;
266 struct rb_node *parent = NULL;
267 struct rb_node *tmp;
268
269 while (*p) {
270 struct vmap_area *tmp;
271
272 parent = *p;
273 tmp = rb_entry(parent, struct vmap_area, rb_node);
274 if (va->va_start < tmp->va_end)
275 p = &(*p)->rb_left;
276 else if (va->va_end > tmp->va_start)
277 p = &(*p)->rb_right;
278 else
279 BUG();
280 }
281
282 rb_link_node(&va->rb_node, parent, p);
283 rb_insert_color(&va->rb_node, &vmap_area_root);
284
285 /* address-sort this list so it is usable like the vmlist */
286 tmp = rb_prev(&va->rb_node);
287 if (tmp) {
288 struct vmap_area *prev;
289 prev = rb_entry(tmp, struct vmap_area, rb_node);
290 list_add_rcu(&va->list, &prev->list);
291 } else
292 list_add_rcu(&va->list, &vmap_area_list);
293}
294
295static void purge_vmap_area_lazy(void);
296
297/*
298 * Allocate a region of KVA of the specified size and alignment, within the
299 * vstart and vend.
300 */
301static struct vmap_area *alloc_vmap_area(unsigned long size,
302 unsigned long align,
303 unsigned long vstart, unsigned long vend,
304 int node, gfp_t gfp_mask)
305{
306 struct vmap_area *va;
307 struct rb_node *n;
308 unsigned long addr;
309 int purged = 0;
310
311 BUG_ON(size & ~PAGE_MASK);
312
313 addr = ALIGN(vstart, align);
314
315 va = kmalloc_node(sizeof(struct vmap_area),
316 gfp_mask & GFP_RECLAIM_MASK, node);
317 if (unlikely(!va))
318 return ERR_PTR(-ENOMEM);
319
320retry:
321 spin_lock(&vmap_area_lock);
322 /* XXX: could have a last_hole cache */
323 n = vmap_area_root.rb_node;
324 if (n) {
325 struct vmap_area *first = NULL;
326
327 do {
328 struct vmap_area *tmp;
329 tmp = rb_entry(n, struct vmap_area, rb_node);
330 if (tmp->va_end >= addr) {
331 if (!first && tmp->va_start < addr + size)
332 first = tmp;
333 n = n->rb_left;
334 } else {
335 first = tmp;
336 n = n->rb_right;
337 }
338 } while (n);
339
340 if (!first)
341 goto found;
342
343 if (first->va_end < addr) {
344 n = rb_next(&first->rb_node);
345 if (n)
346 first = rb_entry(n, struct vmap_area, rb_node);
347 else
348 goto found;
349 }
350
351 while (addr + size >= first->va_start && addr + size <= vend) {
352 addr = ALIGN(first->va_end + PAGE_SIZE, align);
353
354 n = rb_next(&first->rb_node);
355 if (n)
356 first = rb_entry(n, struct vmap_area, rb_node);
357 else
358 goto found;
359 }
360 }
361found:
362 if (addr + size > vend) {
363 spin_unlock(&vmap_area_lock);
364 if (!purged) {
365 purge_vmap_area_lazy();
366 purged = 1;
367 goto retry;
368 }
369 if (printk_ratelimit())
370 printk(KERN_WARNING "vmap allocation failed: "
371 "use vmalloc=<size> to increase size.\n");
372 return ERR_PTR(-EBUSY);
373 }
374
375 BUG_ON(addr & (align-1));
376
377 va->va_start = addr;
378 va->va_end = addr + size;
379 va->flags = 0;
380 __insert_vmap_area(va);
381 spin_unlock(&vmap_area_lock);
382
383 return va;
384}
385
386static void rcu_free_va(struct rcu_head *head)
387{
388 struct vmap_area *va = container_of(head, struct vmap_area, rcu_head);
389
390 kfree(va);
391}
392
393static void __free_vmap_area(struct vmap_area *va)
394{
395 BUG_ON(RB_EMPTY_NODE(&va->rb_node));
396 rb_erase(&va->rb_node, &vmap_area_root);
397 RB_CLEAR_NODE(&va->rb_node);
398 list_del_rcu(&va->list);
399
400 call_rcu(&va->rcu_head, rcu_free_va);
401}
402
403/*
404 * Free a region of KVA allocated by alloc_vmap_area
405 */
406static void free_vmap_area(struct vmap_area *va)
407{
408 spin_lock(&vmap_area_lock);
409 __free_vmap_area(va);
410 spin_unlock(&vmap_area_lock);
411}
412
413/*
414 * Clear the pagetable entries of a given vmap_area
415 */
416static void unmap_vmap_area(struct vmap_area *va)
417{
418 vunmap_page_range(va->va_start, va->va_end);
419}
420
421/*
422 * lazy_max_pages is the maximum amount of virtual address space we gather up
423 * before attempting to purge with a TLB flush.
424 *
425 * There is a tradeoff here: a larger number will cover more kernel page tables
426 * and take slightly longer to purge, but it will linearly reduce the number of
427 * global TLB flushes that must be performed. It would seem natural to scale
428 * this number up linearly with the number of CPUs (because vmapping activity
429 * could also scale linearly with the number of CPUs), however it is likely
430 * that in practice, workloads might be constrained in other ways that mean
431 * vmap activity will not scale linearly with CPUs. Also, I want to be
432 * conservative and not introduce a big latency on huge systems, so go with
433 * a less aggressive log scale. It will still be an improvement over the old
434 * code, and it will be simple to change the scale factor if we find that it
435 * becomes a problem on bigger systems.
436 */
437static unsigned long lazy_max_pages(void)
438{
439 unsigned int log;
440
441 log = fls(num_online_cpus());
442
443 return log * (32UL * 1024 * 1024 / PAGE_SIZE);
444}
445
446static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
447
448/*
449 * Purges all lazily-freed vmap areas.
450 *
451 * If sync is 0 then don't purge if there is already a purge in progress.
452 * If force_flush is 1, then flush kernel TLBs between *start and *end even
453 * if we found no lazy vmap areas to unmap (callers can use this to optimise
454 * their own TLB flushing).
455 * Returns with *start = min(*start, lowest purged address)
456 * *end = max(*end, highest purged address)
457 */
458static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end,
459 int sync, int force_flush)
460{
461 static DEFINE_SPINLOCK(purge_lock);
462 LIST_HEAD(valist);
463 struct vmap_area *va;
464 int nr = 0;
465
466 /*
467 * If sync is 0 but force_flush is 1, we'll go sync anyway but callers
468 * should not expect such behaviour. This just simplifies locking for
469 * the case that isn't actually used at the moment anyway.
470 */
471 if (!sync && !force_flush) {
472 if (!spin_trylock(&purge_lock))
473 return;
474 } else
475 spin_lock(&purge_lock);
476
477 rcu_read_lock();
478 list_for_each_entry_rcu(va, &vmap_area_list, list) {
479 if (va->flags & VM_LAZY_FREE) {
480 if (va->va_start < *start)
481 *start = va->va_start;
482 if (va->va_end > *end)
483 *end = va->va_end;
484 nr += (va->va_end - va->va_start) >> PAGE_SHIFT;
485 unmap_vmap_area(va);
486 list_add_tail(&va->purge_list, &valist);
487 va->flags |= VM_LAZY_FREEING;
488 va->flags &= ~VM_LAZY_FREE;
489 }
490 }
491 rcu_read_unlock();
492
493 if (nr) {
494 BUG_ON(nr > atomic_read(&vmap_lazy_nr));
495 atomic_sub(nr, &vmap_lazy_nr);
496 }
497
498 if (nr || force_flush)
499 flush_tlb_kernel_range(*start, *end);
500
501 if (nr) {
502 spin_lock(&vmap_area_lock);
503 list_for_each_entry(va, &valist, purge_list)
504 __free_vmap_area(va);
505 spin_unlock(&vmap_area_lock);
506 }
507 spin_unlock(&purge_lock);
508}
509
510/*
511 * Kick off a purge of the outstanding lazy areas.
512 */
513static void purge_vmap_area_lazy(void)
514{
515 unsigned long start = ULONG_MAX, end = 0;
516
517 __purge_vmap_area_lazy(&start, &end, 0, 0);
518}
519
520/*
521 * Free and unmap a vmap area
522 */
523static void free_unmap_vmap_area(struct vmap_area *va)
524{
525 va->flags |= VM_LAZY_FREE;
526 atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr);
527 if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages()))
528 purge_vmap_area_lazy();
529}
530
531static struct vmap_area *find_vmap_area(unsigned long addr)
532{
533 struct vmap_area *va;
534
535 spin_lock(&vmap_area_lock);
536 va = __find_vmap_area(addr);
537 spin_unlock(&vmap_area_lock);
538
539 return va;
540}
541
542static void free_unmap_vmap_area_addr(unsigned long addr)
543{
544 struct vmap_area *va;
545
546 va = find_vmap_area(addr);
547 BUG_ON(!va);
548 free_unmap_vmap_area(va);
549}
550
551
552/*** Per cpu kva allocator ***/
553
554/*
555 * vmap space is limited especially on 32 bit architectures. Ensure there is
556 * room for at least 16 percpu vmap blocks per CPU.
557 */
558/*
559 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
560 * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess
561 * instead (we just need a rough idea)
562 */
563#if BITS_PER_LONG == 32
564#define VMALLOC_SPACE (128UL*1024*1024)
565#else
566#define VMALLOC_SPACE (128UL*1024*1024*1024)
567#endif
568
569#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE)
570#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */
571#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */
572#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2)
573#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */
574#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */
575#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \
576 VMAP_MAX(VMAP_BBMAP_BITS_MIN, \
577 VMALLOC_PAGES / NR_CPUS / 16))
578
579#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
580
581struct vmap_block_queue {
582 spinlock_t lock;
583 struct list_head free;
584 struct list_head dirty;
585 unsigned int nr_dirty;
586};
587
588struct vmap_block {
589 spinlock_t lock;
590 struct vmap_area *va;
591 struct vmap_block_queue *vbq;
592 unsigned long free, dirty;
593 DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
594 DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
595 union {
596 struct {
597 struct list_head free_list;
598 struct list_head dirty_list;
599 };
600 struct rcu_head rcu_head;
601 };
602};
603
604/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
605static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
606
607/*
608 * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
609 * in the free path. Could get rid of this if we change the API to return a
610 * "cookie" from alloc, to be passed to free. But no big deal yet.
611 */
612static DEFINE_SPINLOCK(vmap_block_tree_lock);
613static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
614
615/*
616 * We should probably have a fallback mechanism to allocate virtual memory
617 * out of partially filled vmap blocks. However vmap block sizing should be
618 * fairly reasonable according to the vmalloc size, so it shouldn't be a
619 * big problem.
620 */
621
622static unsigned long addr_to_vb_idx(unsigned long addr)
623{
624 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
625 addr /= VMAP_BLOCK_SIZE;
626 return addr;
627}
628
629static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
630{
631 struct vmap_block_queue *vbq;
632 struct vmap_block *vb;
633 struct vmap_area *va;
634 unsigned long vb_idx;
635 int node, err;
636
637 node = numa_node_id();
638
639 vb = kmalloc_node(sizeof(struct vmap_block),
640 gfp_mask & GFP_RECLAIM_MASK, node);
641 if (unlikely(!vb))
642 return ERR_PTR(-ENOMEM);
643
644 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
645 VMALLOC_START, VMALLOC_END,
646 node, gfp_mask);
647 if (unlikely(IS_ERR(va))) {
648 kfree(vb);
649 return ERR_PTR(PTR_ERR(va));
650 }
651
652 err = radix_tree_preload(gfp_mask);
653 if (unlikely(err)) {
654 kfree(vb);
655 free_vmap_area(va);
656 return ERR_PTR(err);
657 }
658
659 spin_lock_init(&vb->lock);
660 vb->va = va;
661 vb->free = VMAP_BBMAP_BITS;
662 vb->dirty = 0;
663 bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
664 bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
665 INIT_LIST_HEAD(&vb->free_list);
666 INIT_LIST_HEAD(&vb->dirty_list);
667
668 vb_idx = addr_to_vb_idx(va->va_start);
669 spin_lock(&vmap_block_tree_lock);
670 err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
671 spin_unlock(&vmap_block_tree_lock);
672 BUG_ON(err);
673 radix_tree_preload_end();
674
675 vbq = &get_cpu_var(vmap_block_queue);
676 vb->vbq = vbq;
677 spin_lock(&vbq->lock);
678 list_add(&vb->free_list, &vbq->free);
679 spin_unlock(&vbq->lock);
680 put_cpu_var(vmap_cpu_blocks);
681
682 return vb;
683}
684
685static void rcu_free_vb(struct rcu_head *head)
686{
687 struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head);
688
689 kfree(vb);
690}
691
692static void free_vmap_block(struct vmap_block *vb)
693{
694 struct vmap_block *tmp;
695 unsigned long vb_idx;
696
697 spin_lock(&vb->vbq->lock);
698 if (!list_empty(&vb->free_list))
699 list_del(&vb->free_list);
700 if (!list_empty(&vb->dirty_list))
701 list_del(&vb->dirty_list);
702 spin_unlock(&vb->vbq->lock);
703
704 vb_idx = addr_to_vb_idx(vb->va->va_start);
705 spin_lock(&vmap_block_tree_lock);
706 tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
707 spin_unlock(&vmap_block_tree_lock);
708 BUG_ON(tmp != vb);
709
710 free_unmap_vmap_area(vb->va);
711 call_rcu(&vb->rcu_head, rcu_free_vb);
712}
713
714static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
715{
716 struct vmap_block_queue *vbq;
717 struct vmap_block *vb;
718 unsigned long addr = 0;
719 unsigned int order;
720
721 BUG_ON(size & ~PAGE_MASK);
722 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
723 order = get_order(size);
724
725again:
726 rcu_read_lock();
727 vbq = &get_cpu_var(vmap_block_queue);
728 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
729 int i;
730
731 spin_lock(&vb->lock);
732 i = bitmap_find_free_region(vb->alloc_map,
733 VMAP_BBMAP_BITS, order);
734
735 if (i >= 0) {
736 addr = vb->va->va_start + (i << PAGE_SHIFT);
737 BUG_ON(addr_to_vb_idx(addr) !=
738 addr_to_vb_idx(vb->va->va_start));
739 vb->free -= 1UL << order;
740 if (vb->free == 0) {
741 spin_lock(&vbq->lock);
742 list_del_init(&vb->free_list);
743 spin_unlock(&vbq->lock);
744 }
745 spin_unlock(&vb->lock);
746 break;
747 }
748 spin_unlock(&vb->lock);
749 }
750 put_cpu_var(vmap_cpu_blocks);
751 rcu_read_unlock();
752
753 if (!addr) {
754 vb = new_vmap_block(gfp_mask);
755 if (IS_ERR(vb))
756 return vb;
757 goto again;
758 }
759
760 return (void *)addr;
761}
762
763static void vb_free(const void *addr, unsigned long size)
764{
765 unsigned long offset;
766 unsigned long vb_idx;
767 unsigned int order;
768 struct vmap_block *vb;
769
770 BUG_ON(size & ~PAGE_MASK);
771 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
772 order = get_order(size);
773
774 offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
775
776 vb_idx = addr_to_vb_idx((unsigned long)addr);
777 rcu_read_lock();
778 vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
779 rcu_read_unlock();
780 BUG_ON(!vb);
781
782 spin_lock(&vb->lock);
783 bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order);
784 if (!vb->dirty) {
785 spin_lock(&vb->vbq->lock);
786 list_add(&vb->dirty_list, &vb->vbq->dirty);
787 spin_unlock(&vb->vbq->lock);
788 }
789 vb->dirty += 1UL << order;
790 if (vb->dirty == VMAP_BBMAP_BITS) {
791 BUG_ON(vb->free || !list_empty(&vb->free_list));
792 spin_unlock(&vb->lock);
793 free_vmap_block(vb);
794 } else
795 spin_unlock(&vb->lock);
796}
797
798/**
799 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
800 *
801 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
802 * to amortize TLB flushing overheads. What this means is that any page you
803 * have now, may, in a former life, have been mapped into kernel virtual
804 * address by the vmap layer and so there might be some CPUs with TLB entries
805 * still referencing that page (additional to the regular 1:1 kernel mapping).
806 *
807 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
808 * be sure that none of the pages we have control over will have any aliases
809 * from the vmap layer.
810 */
811void vm_unmap_aliases(void)
812{
813 unsigned long start = ULONG_MAX, end = 0;
814 int cpu;
815 int flush = 0;
816
817 for_each_possible_cpu(cpu) {
818 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
819 struct vmap_block *vb;
820
821 rcu_read_lock();
822 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
823 int i;
824
825 spin_lock(&vb->lock);
826 i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS);
827 while (i < VMAP_BBMAP_BITS) {
828 unsigned long s, e;
829 int j;
830 j = find_next_zero_bit(vb->dirty_map,
831 VMAP_BBMAP_BITS, i);
832
833 s = vb->va->va_start + (i << PAGE_SHIFT);
834 e = vb->va->va_start + (j << PAGE_SHIFT);
835 vunmap_page_range(s, e);
836 flush = 1;
837
838 if (s < start)
839 start = s;
840 if (e > end)
841 end = e;
842
843 i = j;
844 i = find_next_bit(vb->dirty_map,
845 VMAP_BBMAP_BITS, i);
846 }
847 spin_unlock(&vb->lock);
848 }
849 rcu_read_unlock();
850 }
851
852 __purge_vmap_area_lazy(&start, &end, 1, flush);
853}
854EXPORT_SYMBOL_GPL(vm_unmap_aliases);
855
856/**
857 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
858 * @mem: the pointer returned by vm_map_ram
859 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
860 */
861void vm_unmap_ram(const void *mem, unsigned int count)
862{
863 unsigned long size = count << PAGE_SHIFT;
864 unsigned long addr = (unsigned long)mem;
865
866 BUG_ON(!addr);
867 BUG_ON(addr < VMALLOC_START);
868 BUG_ON(addr > VMALLOC_END);
869 BUG_ON(addr & (PAGE_SIZE-1));
870
871 debug_check_no_locks_freed(mem, size);
872
873 if (likely(count <= VMAP_MAX_ALLOC))
874 vb_free(mem, size);
875 else
876 free_unmap_vmap_area_addr(addr);
877}
878EXPORT_SYMBOL(vm_unmap_ram);
879
880/**
881 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
882 * @pages: an array of pointers to the pages to be mapped
883 * @count: number of pages
884 * @node: prefer to allocate data structures on this node
885 * @prot: memory protection to use. PAGE_KERNEL for regular RAM
886 * @returns: a pointer to the address that has been mapped, or NULL on failure
887 */
888void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
889{
890 unsigned long size = count << PAGE_SHIFT;
222 unsigned long addr; 891 unsigned long addr;
892 void *mem;
893
894 if (likely(count <= VMAP_MAX_ALLOC)) {
895 mem = vb_alloc(size, GFP_KERNEL);
896 if (IS_ERR(mem))
897 return NULL;
898 addr = (unsigned long)mem;
899 } else {
900 struct vmap_area *va;
901 va = alloc_vmap_area(size, PAGE_SIZE,
902 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
903 if (IS_ERR(va))
904 return NULL;
905
906 addr = va->va_start;
907 mem = (void *)addr;
908 }
909 if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
910 vm_unmap_ram(mem, count);
911 return NULL;
912 }
913 return mem;
914}
915EXPORT_SYMBOL(vm_map_ram);
916
917void __init vmalloc_init(void)
918{
919 int i;
920
921 for_each_possible_cpu(i) {
922 struct vmap_block_queue *vbq;
923
924 vbq = &per_cpu(vmap_block_queue, i);
925 spin_lock_init(&vbq->lock);
926 INIT_LIST_HEAD(&vbq->free);
927 INIT_LIST_HEAD(&vbq->dirty);
928 vbq->nr_dirty = 0;
929 }
930}
931
932void unmap_kernel_range(unsigned long addr, unsigned long size)
933{
934 unsigned long end = addr + size;
935 vunmap_page_range(addr, end);
936 flush_tlb_kernel_range(addr, end);
937}
938
939int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
940{
941 unsigned long addr = (unsigned long)area->addr;
942 unsigned long end = addr + area->size - PAGE_SIZE;
943 int err;
944
945 err = vmap_page_range(addr, end, prot, *pages);
946 if (err > 0) {
947 *pages += err;
948 err = 0;
949 }
950
951 return err;
952}
953EXPORT_SYMBOL_GPL(map_vm_area);
954
955/*** Old vmalloc interfaces ***/
956DEFINE_RWLOCK(vmlist_lock);
957struct vm_struct *vmlist;
958
959static struct vm_struct *__get_vm_area_node(unsigned long size,
960 unsigned long flags, unsigned long start, unsigned long end,
961 int node, gfp_t gfp_mask, void *caller)
962{
963 static struct vmap_area *va;
964 struct vm_struct *area;
965 struct vm_struct *tmp, **p;
966 unsigned long align = 1;
223 967
224 BUG_ON(in_interrupt()); 968 BUG_ON(in_interrupt());
225 if (flags & VM_IOREMAP) { 969 if (flags & VM_IOREMAP) {
@@ -232,13 +976,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
232 976
233 align = 1ul << bit; 977 align = 1ul << bit;
234 } 978 }
235 addr = ALIGN(start, align); 979
236 size = PAGE_ALIGN(size); 980 size = PAGE_ALIGN(size);
237 if (unlikely(!size)) 981 if (unlikely(!size))
238 return NULL; 982 return NULL;
239 983
240 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 984 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
241
242 if (unlikely(!area)) 985 if (unlikely(!area))
243 return NULL; 986 return NULL;
244 987
@@ -247,48 +990,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start,
247 */ 990 */
248 size += PAGE_SIZE; 991 size += PAGE_SIZE;
249 992
250 write_lock(&vmlist_lock); 993 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
251 for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { 994 if (IS_ERR(va)) {
252 if ((unsigned long)tmp->addr < addr) { 995 kfree(area);
253 if((unsigned long)tmp->addr + tmp->size >= addr) 996 return NULL;
254 addr = ALIGN(tmp->size +
255 (unsigned long)tmp->addr, align);
256 continue;
257 }
258 if ((size + addr) < addr)
259 goto out;
260 if (size + addr <= (unsigned long)tmp->addr)
261 goto found;
262 addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align);
263 if (addr > end - size)
264 goto out;
265 } 997 }
266 if ((size + addr) < addr)
267 goto out;
268 if (addr > end - size)
269 goto out;
270
271found:
272 area->next = *p;
273 *p = area;
274 998
275 area->flags = flags; 999 area->flags = flags;
276 area->addr = (void *)addr; 1000 area->addr = (void *)va->va_start;
277 area->size = size; 1001 area->size = size;
278 area->pages = NULL; 1002 area->pages = NULL;
279 area->nr_pages = 0; 1003 area->nr_pages = 0;
280 area->phys_addr = 0; 1004 area->phys_addr = 0;
281 area->caller = caller; 1005 area->caller = caller;
1006 va->private = area;
1007 va->flags |= VM_VM_AREA;
1008
1009 write_lock(&vmlist_lock);
1010 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1011 if (tmp->addr >= area->addr)
1012 break;
1013 }
1014 area->next = *p;
1015 *p = area;
282 write_unlock(&vmlist_lock); 1016 write_unlock(&vmlist_lock);
283 1017
284 return area; 1018 return area;
285
286out:
287 write_unlock(&vmlist_lock);
288 kfree(area);
289 if (printk_ratelimit())
290 printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n");
291 return NULL;
292} 1019}
293 1020
294struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1021struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
@@ -328,39 +1055,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
328 gfp_mask, __builtin_return_address(0)); 1055 gfp_mask, __builtin_return_address(0));
329} 1056}
330 1057
331/* Caller must hold vmlist_lock */ 1058static struct vm_struct *find_vm_area(const void *addr)
332static struct vm_struct *__find_vm_area(const void *addr)
333{ 1059{
334 struct vm_struct *tmp; 1060 struct vmap_area *va;
335 1061
336 for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { 1062 va = find_vmap_area((unsigned long)addr);
337 if (tmp->addr == addr) 1063 if (va && va->flags & VM_VM_AREA)
338 break; 1064 return va->private;
339 }
340
341 return tmp;
342}
343
344/* Caller must hold vmlist_lock */
345static struct vm_struct *__remove_vm_area(const void *addr)
346{
347 struct vm_struct **p, *tmp;
348 1065
349 for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) {
350 if (tmp->addr == addr)
351 goto found;
352 }
353 return NULL; 1066 return NULL;
354
355found:
356 unmap_vm_area(tmp);
357 *p = tmp->next;
358
359 /*
360 * Remove the guard page.
361 */
362 tmp->size -= PAGE_SIZE;
363 return tmp;
364} 1067}
365 1068
366/** 1069/**
@@ -373,11 +1076,24 @@ found:
373 */ 1076 */
374struct vm_struct *remove_vm_area(const void *addr) 1077struct vm_struct *remove_vm_area(const void *addr)
375{ 1078{
376 struct vm_struct *v; 1079 struct vmap_area *va;
377 write_lock(&vmlist_lock); 1080
378 v = __remove_vm_area(addr); 1081 va = find_vmap_area((unsigned long)addr);
379 write_unlock(&vmlist_lock); 1082 if (va && va->flags & VM_VM_AREA) {
380 return v; 1083 struct vm_struct *vm = va->private;
1084 struct vm_struct *tmp, **p;
1085 free_unmap_vmap_area(va);
1086 vm->size -= PAGE_SIZE;
1087
1088 write_lock(&vmlist_lock);
1089 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1090 ;
1091 *p = tmp->next;
1092 write_unlock(&vmlist_lock);
1093
1094 return vm;
1095 }
1096 return NULL;
381} 1097}
382 1098
383static void __vunmap(const void *addr, int deallocate_pages) 1099static void __vunmap(const void *addr, int deallocate_pages)
@@ -487,6 +1203,8 @@ void *vmap(struct page **pages, unsigned int count,
487} 1203}
488EXPORT_SYMBOL(vmap); 1204EXPORT_SYMBOL(vmap);
489 1205
1206static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1207 int node, void *caller);
490static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1208static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
491 pgprot_t prot, int node, void *caller) 1209 pgprot_t prot, int node, void *caller)
492{ 1210{
@@ -613,10 +1331,8 @@ void *vmalloc_user(unsigned long size)
613 1331
614 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); 1332 ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL);
615 if (ret) { 1333 if (ret) {
616 write_lock(&vmlist_lock); 1334 area = find_vm_area(ret);
617 area = __find_vm_area(ret);
618 area->flags |= VM_USERMAP; 1335 area->flags |= VM_USERMAP;
619 write_unlock(&vmlist_lock);
620 } 1336 }
621 return ret; 1337 return ret;
622} 1338}
@@ -696,10 +1412,8 @@ void *vmalloc_32_user(unsigned long size)
696 1412
697 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); 1413 ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL);
698 if (ret) { 1414 if (ret) {
699 write_lock(&vmlist_lock); 1415 area = find_vm_area(ret);
700 area = __find_vm_area(ret);
701 area->flags |= VM_USERMAP; 1416 area->flags |= VM_USERMAP;
702 write_unlock(&vmlist_lock);
703 } 1417 }
704 return ret; 1418 return ret;
705} 1419}
@@ -800,26 +1514,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
800 struct vm_struct *area; 1514 struct vm_struct *area;
801 unsigned long uaddr = vma->vm_start; 1515 unsigned long uaddr = vma->vm_start;
802 unsigned long usize = vma->vm_end - vma->vm_start; 1516 unsigned long usize = vma->vm_end - vma->vm_start;
803 int ret;
804 1517
805 if ((PAGE_SIZE-1) & (unsigned long)addr) 1518 if ((PAGE_SIZE-1) & (unsigned long)addr)
806 return -EINVAL; 1519 return -EINVAL;
807 1520
808 read_lock(&vmlist_lock); 1521 area = find_vm_area(addr);
809 area = __find_vm_area(addr);
810 if (!area) 1522 if (!area)
811 goto out_einval_locked; 1523 return -EINVAL;
812 1524
813 if (!(area->flags & VM_USERMAP)) 1525 if (!(area->flags & VM_USERMAP))
814 goto out_einval_locked; 1526 return -EINVAL;
815 1527
816 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) 1528 if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE)
817 goto out_einval_locked; 1529 return -EINVAL;
818 read_unlock(&vmlist_lock);
819 1530
820 addr += pgoff << PAGE_SHIFT; 1531 addr += pgoff << PAGE_SHIFT;
821 do { 1532 do {
822 struct page *page = vmalloc_to_page(addr); 1533 struct page *page = vmalloc_to_page(addr);
1534 int ret;
1535
823 ret = vm_insert_page(vma, uaddr, page); 1536 ret = vm_insert_page(vma, uaddr, page);
824 if (ret) 1537 if (ret)
825 return ret; 1538 return ret;
@@ -832,11 +1545,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
832 /* Prevent "things" like memory migration? VM_flags need a cleanup... */ 1545 /* Prevent "things" like memory migration? VM_flags need a cleanup... */
833 vma->vm_flags |= VM_RESERVED; 1546 vma->vm_flags |= VM_RESERVED;
834 1547
835 return ret; 1548 return 0;
836
837out_einval_locked:
838 read_unlock(&vmlist_lock);
839 return -EINVAL;
840} 1549}
841EXPORT_SYMBOL(remap_vmalloc_range); 1550EXPORT_SYMBOL(remap_vmalloc_range);
842 1551