Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r-- | mm/vmalloc.c | 994
1 file changed, 858 insertions(+), 136 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index bba06c41fc59..65ae576030da 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -8,6 +8,7 @@ | |||
8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/vmalloc.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
13 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
@@ -16,18 +17,18 @@ | |||
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
17 | #include <linux/seq_file.h> | 18 | #include <linux/seq_file.h> |
18 | #include <linux/debugobjects.h> | 19 | #include <linux/debugobjects.h> |
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/kallsyms.h> | 20 | #include <linux/kallsyms.h> |
21 | #include <linux/list.h> | ||
22 | #include <linux/rbtree.h> | ||
23 | #include <linux/radix-tree.h> | ||
24 | #include <linux/rcupdate.h> | ||
21 | 25 | ||
26 | #include <asm/atomic.h> | ||
22 | #include <asm/uaccess.h> | 27 | #include <asm/uaccess.h> |
23 | #include <asm/tlbflush.h> | 28 | #include <asm/tlbflush.h> |
24 | 29 | ||
25 | 30 | ||
26 | DEFINE_RWLOCK(vmlist_lock); | 31 | /*** Page table manipulation functions ***/ |
27 | struct vm_struct *vmlist; | ||
28 | |||
29 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
30 | int node, void *caller); | ||
31 | 32 | ||
32 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 33 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
33 | { | 34 | { |
@@ -40,8 +41,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | |||
40 | } while (pte++, addr += PAGE_SIZE, addr != end); | 41 | } while (pte++, addr += PAGE_SIZE, addr != end); |
41 | } | 42 | } |
42 | 43 | ||
43 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | 44 | static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
44 | unsigned long end) | ||
45 | { | 45 | { |
46 | pmd_t *pmd; | 46 | pmd_t *pmd; |
47 | unsigned long next; | 47 | unsigned long next; |
@@ -55,8 +55,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | |||
55 | } while (pmd++, addr = next, addr != end); | 55 | } while (pmd++, addr = next, addr != end); |
56 | } | 56 | } |
57 | 57 | ||
58 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | 58 | static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
59 | unsigned long end) | ||
60 | { | 59 | { |
61 | pud_t *pud; | 60 | pud_t *pud; |
62 | unsigned long next; | 61 | unsigned long next; |
@@ -70,12 +69,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
70 | } while (pud++, addr = next, addr != end); | 69 | } while (pud++, addr = next, addr != end); |
71 | } | 70 | } |
72 | 71 | ||
73 | void unmap_kernel_range(unsigned long addr, unsigned long size) | 72 | static void vunmap_page_range(unsigned long addr, unsigned long end) |
74 | { | 73 | { |
75 | pgd_t *pgd; | 74 | pgd_t *pgd; |
76 | unsigned long next; | 75 | unsigned long next; |
77 | unsigned long start = addr; | ||
78 | unsigned long end = addr + size; | ||
79 | 76 | ||
80 | BUG_ON(addr >= end); | 77 | BUG_ON(addr >= end); |
81 | pgd = pgd_offset_k(addr); | 78 | pgd = pgd_offset_k(addr); |
@@ -86,35 +83,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
86 | continue; | 83 | continue; |
87 | vunmap_pud_range(pgd, addr, next); | 84 | vunmap_pud_range(pgd, addr, next); |
88 | } while (pgd++, addr = next, addr != end); | 85 | } while (pgd++, addr = next, addr != end); |
89 | flush_tlb_kernel_range(start, end); | ||
90 | } | ||
91 | |||
92 | static void unmap_vm_area(struct vm_struct *area) | ||
93 | { | ||
94 | unmap_kernel_range((unsigned long)area->addr, area->size); | ||
95 | } | 86 | } |
96 | 87 | ||
97 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | 88 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
98 | unsigned long end, pgprot_t prot, struct page ***pages) | 89 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
99 | { | 90 | { |
100 | pte_t *pte; | 91 | pte_t *pte; |
101 | 92 | ||
93 | /* | ||
94 | * nr is a running index into the array which helps higher level | ||
95 | * callers keep track of where we're up to. | ||
96 | */ | ||
97 | |||
102 | pte = pte_alloc_kernel(pmd, addr); | 98 | pte = pte_alloc_kernel(pmd, addr); |
103 | if (!pte) | 99 | if (!pte) |
104 | return -ENOMEM; | 100 | return -ENOMEM; |
105 | do { | 101 | do { |
106 | struct page *page = **pages; | 102 | struct page *page = pages[*nr]; |
107 | WARN_ON(!pte_none(*pte)); | 103 | |
108 | if (!page) | 104 | if (WARN_ON(!pte_none(*pte))) |
105 | return -EBUSY; | ||
106 | if (WARN_ON(!page)) | ||
109 | return -ENOMEM; | 107 | return -ENOMEM; |
110 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | 108 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
111 | (*pages)++; | 109 | (*nr)++; |
112 | } while (pte++, addr += PAGE_SIZE, addr != end); | 110 | } while (pte++, addr += PAGE_SIZE, addr != end); |
113 | return 0; | 111 | return 0; |
114 | } | 112 | } |
115 | 113 | ||
116 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | 114 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
117 | unsigned long end, pgprot_t prot, struct page ***pages) | 115 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
118 | { | 116 | { |
119 | pmd_t *pmd; | 117 | pmd_t *pmd; |
120 | unsigned long next; | 118 | unsigned long next; |
@@ -124,14 +122,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | |||
124 | return -ENOMEM; | 122 | return -ENOMEM; |
125 | do { | 123 | do { |
126 | next = pmd_addr_end(addr, end); | 124 | next = pmd_addr_end(addr, end); |
127 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | 125 | if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
128 | return -ENOMEM; | 126 | return -ENOMEM; |
129 | } while (pmd++, addr = next, addr != end); | 127 | } while (pmd++, addr = next, addr != end); |
130 | return 0; | 128 | return 0; |
131 | } | 129 | } |
132 | 130 | ||
133 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | 131 | static int vmap_pud_range(pgd_t *pgd, unsigned long addr, |
134 | unsigned long end, pgprot_t prot, struct page ***pages) | 132 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
135 | { | 133 | { |
136 | pud_t *pud; | 134 | pud_t *pud; |
137 | unsigned long next; | 135 | unsigned long next; |
@@ -141,57 +139,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
141 | return -ENOMEM; | 139 | return -ENOMEM; |
142 | do { | 140 | do { |
143 | next = pud_addr_end(addr, end); | 141 | next = pud_addr_end(addr, end); |
144 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | 142 | if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
145 | return -ENOMEM; | 143 | return -ENOMEM; |
146 | } while (pud++, addr = next, addr != end); | 144 | } while (pud++, addr = next, addr != end); |
147 | return 0; | 145 | return 0; |
148 | } | 146 | } |
149 | 147 | ||
150 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 148 | /* |
149 | * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and | ||
150 | * will have pfns corresponding to the "pages" array. | ||
151 | * | ||
152 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | ||
153 | */ | ||
154 | static int vmap_page_range(unsigned long addr, unsigned long end, | ||
155 | pgprot_t prot, struct page **pages) | ||
151 | { | 156 | { |
152 | pgd_t *pgd; | 157 | pgd_t *pgd; |
153 | unsigned long next; | 158 | unsigned long next; |
154 | unsigned long addr = (unsigned long) area->addr; | 159 | int err = 0; |
155 | unsigned long end = addr + area->size - PAGE_SIZE; | 160 | int nr = 0; |
156 | int err; | ||
157 | 161 | ||
158 | BUG_ON(addr >= end); | 162 | BUG_ON(addr >= end); |
159 | pgd = pgd_offset_k(addr); | 163 | pgd = pgd_offset_k(addr); |
160 | do { | 164 | do { |
161 | next = pgd_addr_end(addr, end); | 165 | next = pgd_addr_end(addr, end); |
162 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 166 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
163 | if (err) | 167 | if (err) |
164 | break; | 168 | break; |
165 | } while (pgd++, addr = next, addr != end); | 169 | } while (pgd++, addr = next, addr != end); |
166 | flush_cache_vmap((unsigned long) area->addr, end); | 170 | flush_cache_vmap(addr, end); |
167 | return err; | 171 | |
172 | if (unlikely(err)) | ||
173 | return err; | ||
174 | return nr; | ||
175 | } | ||
176 | |||
177 | static inline int is_vmalloc_or_module_addr(const void *x) | ||
178 | { | ||
179 | /* | ||
180 | * x86-64 and sparc64 put modules in a special place, | ||
181 | * and fall back on vmalloc() if that fails. Others | ||
182 | * just put it in the vmalloc space. | ||
183 | */ | ||
184 | #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) | ||
185 | unsigned long addr = (unsigned long)x; | ||
186 | if (addr >= MODULES_VADDR && addr < MODULES_END) | ||
187 | return 1; | ||
188 | #endif | ||
189 | return is_vmalloc_addr(x); | ||
168 | } | 190 | } |
169 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
170 | 191 | ||
171 | /* | 192 | /* |
172 | * Map a vmalloc()-space virtual address to the physical page. | 193 | * Walk a vmap address to the struct page it maps. |
173 | */ | 194 | */ |
174 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 195 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
175 | { | 196 | { |
176 | unsigned long addr = (unsigned long) vmalloc_addr; | 197 | unsigned long addr = (unsigned long) vmalloc_addr; |
177 | struct page *page = NULL; | 198 | struct page *page = NULL; |
178 | pgd_t *pgd = pgd_offset_k(addr); | 199 | pgd_t *pgd = pgd_offset_k(addr); |
179 | pud_t *pud; | ||
180 | pmd_t *pmd; | ||
181 | pte_t *ptep, pte; | ||
182 | 200 | ||
183 | /* | 201 | /* |
184 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for | 202 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for |
185 | * architectures that do not vmalloc module space | 203 | * architectures that do not vmalloc module space |
186 | */ | 204 | */ |
187 | VIRTUAL_BUG_ON(!is_vmalloc_addr(vmalloc_addr) && | 205 | VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); |
188 | !is_module_address(addr)); | ||
189 | 206 | ||
190 | if (!pgd_none(*pgd)) { | 207 | if (!pgd_none(*pgd)) { |
191 | pud = pud_offset(pgd, addr); | 208 | pud_t *pud = pud_offset(pgd, addr); |
192 | if (!pud_none(*pud)) { | 209 | if (!pud_none(*pud)) { |
193 | pmd = pmd_offset(pud, addr); | 210 | pmd_t *pmd = pmd_offset(pud, addr); |
194 | if (!pmd_none(*pmd)) { | 211 | if (!pmd_none(*pmd)) { |
212 | pte_t *ptep, pte; | ||
213 | |||
195 | ptep = pte_offset_map(pmd, addr); | 214 | ptep = pte_offset_map(pmd, addr); |
196 | pte = *ptep; | 215 | pte = *ptep; |
197 | if (pte_present(pte)) | 216 | if (pte_present(pte)) |
@@ -213,13 +232,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
213 | } | 232 | } |
214 | EXPORT_SYMBOL(vmalloc_to_pfn); | 233 | EXPORT_SYMBOL(vmalloc_to_pfn); |
215 | 234 | ||
216 | static struct vm_struct * | 235 | |
217 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | 236 | /*** Global kva allocator ***/ |
218 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 237 | |
238 | #define VM_LAZY_FREE 0x01 | ||
239 | #define VM_LAZY_FREEING 0x02 | ||
240 | #define VM_VM_AREA 0x04 | ||
241 | |||
242 | struct vmap_area { | ||
243 | unsigned long va_start; | ||
244 | unsigned long va_end; | ||
245 | unsigned long flags; | ||
246 | struct rb_node rb_node; /* address sorted rbtree */ | ||
247 | struct list_head list; /* address sorted list */ | ||
248 | struct list_head purge_list; /* "lazy purge" list */ | ||
249 | void *private; | ||
250 | struct rcu_head rcu_head; | ||
251 | }; | ||
252 | |||
253 | static DEFINE_SPINLOCK(vmap_area_lock); | ||
254 | static struct rb_root vmap_area_root = RB_ROOT; | ||
255 | static LIST_HEAD(vmap_area_list); | ||
256 | |||
257 | static struct vmap_area *__find_vmap_area(unsigned long addr) | ||
219 | { | 258 | { |
220 | struct vm_struct **p, *tmp, *area; | 259 | struct rb_node *n = vmap_area_root.rb_node; |
221 | unsigned long align = 1; | 260 | |
261 | while (n) { | ||
262 | struct vmap_area *va; | ||
263 | |||
264 | va = rb_entry(n, struct vmap_area, rb_node); | ||
265 | if (addr < va->va_start) | ||
266 | n = n->rb_left; | ||
267 | else if (addr > va->va_start) | ||
268 | n = n->rb_right; | ||
269 | else | ||
270 | return va; | ||
271 | } | ||
272 | |||
273 | return NULL; | ||
274 | } | ||
275 | |||
276 | static void __insert_vmap_area(struct vmap_area *va) | ||
277 | { | ||
278 | struct rb_node **p = &vmap_area_root.rb_node; | ||
279 | struct rb_node *parent = NULL; | ||
280 | struct rb_node *tmp; | ||
281 | |||
282 | while (*p) { | ||
283 | struct vmap_area *tmp; | ||
284 | |||
285 | parent = *p; | ||
286 | tmp = rb_entry(parent, struct vmap_area, rb_node); | ||
287 | if (va->va_start < tmp->va_end) | ||
288 | p = &(*p)->rb_left; | ||
289 | else if (va->va_end > tmp->va_start) | ||
290 | p = &(*p)->rb_right; | ||
291 | else | ||
292 | BUG(); | ||
293 | } | ||
294 | |||
295 | rb_link_node(&va->rb_node, parent, p); | ||
296 | rb_insert_color(&va->rb_node, &vmap_area_root); | ||
297 | |||
298 | /* address-sort this list so it is usable like the vmlist */ | ||
299 | tmp = rb_prev(&va->rb_node); | ||
300 | if (tmp) { | ||
301 | struct vmap_area *prev; | ||
302 | prev = rb_entry(tmp, struct vmap_area, rb_node); | ||
303 | list_add_rcu(&va->list, &prev->list); | ||
304 | } else | ||
305 | list_add_rcu(&va->list, &vmap_area_list); | ||
306 | } | ||
307 | |||
308 | static void purge_vmap_area_lazy(void); | ||
309 | |||
310 | /* | ||
311 | * Allocate a region of KVA of the specified size and alignment, within the | ||
312 | * vstart and vend. | ||
313 | */ | ||
314 | static struct vmap_area *alloc_vmap_area(unsigned long size, | ||
315 | unsigned long align, | ||
316 | unsigned long vstart, unsigned long vend, | ||
317 | int node, gfp_t gfp_mask) | ||
318 | { | ||
319 | struct vmap_area *va; | ||
320 | struct rb_node *n; | ||
222 | unsigned long addr; | 321 | unsigned long addr; |
322 | int purged = 0; | ||
323 | |||
324 | BUG_ON(size & ~PAGE_MASK); | ||
325 | |||
326 | addr = ALIGN(vstart, align); | ||
327 | |||
328 | va = kmalloc_node(sizeof(struct vmap_area), | ||
329 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
330 | if (unlikely(!va)) | ||
331 | return ERR_PTR(-ENOMEM); | ||
332 | |||
333 | retry: | ||
334 | spin_lock(&vmap_area_lock); | ||
335 | /* XXX: could have a last_hole cache */ | ||
336 | n = vmap_area_root.rb_node; | ||
337 | if (n) { | ||
338 | struct vmap_area *first = NULL; | ||
339 | |||
340 | do { | ||
341 | struct vmap_area *tmp; | ||
342 | tmp = rb_entry(n, struct vmap_area, rb_node); | ||
343 | if (tmp->va_end >= addr) { | ||
344 | if (!first && tmp->va_start < addr + size) | ||
345 | first = tmp; | ||
346 | n = n->rb_left; | ||
347 | } else { | ||
348 | first = tmp; | ||
349 | n = n->rb_right; | ||
350 | } | ||
351 | } while (n); | ||
352 | |||
353 | if (!first) | ||
354 | goto found; | ||
355 | |||
356 | if (first->va_end < addr) { | ||
357 | n = rb_next(&first->rb_node); | ||
358 | if (n) | ||
359 | first = rb_entry(n, struct vmap_area, rb_node); | ||
360 | else | ||
361 | goto found; | ||
362 | } | ||
363 | |||
364 | while (addr + size >= first->va_start && addr + size <= vend) { | ||
365 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
366 | |||
367 | n = rb_next(&first->rb_node); | ||
368 | if (n) | ||
369 | first = rb_entry(n, struct vmap_area, rb_node); | ||
370 | else | ||
371 | goto found; | ||
372 | } | ||
373 | } | ||
374 | found: | ||
375 | if (addr + size > vend) { | ||
376 | spin_unlock(&vmap_area_lock); | ||
377 | if (!purged) { | ||
378 | purge_vmap_area_lazy(); | ||
379 | purged = 1; | ||
380 | goto retry; | ||
381 | } | ||
382 | if (printk_ratelimit()) | ||
383 | printk(KERN_WARNING "vmap allocation failed: " | ||
384 | "use vmalloc=<size> to increase size.\n"); | ||
385 | return ERR_PTR(-EBUSY); | ||
386 | } | ||
387 | |||
388 | BUG_ON(addr & (align-1)); | ||
389 | |||
390 | va->va_start = addr; | ||
391 | va->va_end = addr + size; | ||
392 | va->flags = 0; | ||
393 | __insert_vmap_area(va); | ||
394 | spin_unlock(&vmap_area_lock); | ||
395 | |||
396 | return va; | ||
397 | } | ||
398 | |||
399 | static void rcu_free_va(struct rcu_head *head) | ||
400 | { | ||
401 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
402 | |||
403 | kfree(va); | ||
404 | } | ||
405 | |||
406 | static void __free_vmap_area(struct vmap_area *va) | ||
407 | { | ||
408 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | ||
409 | rb_erase(&va->rb_node, &vmap_area_root); | ||
410 | RB_CLEAR_NODE(&va->rb_node); | ||
411 | list_del_rcu(&va->list); | ||
412 | |||
413 | call_rcu(&va->rcu_head, rcu_free_va); | ||
414 | } | ||
415 | |||
416 | /* | ||
417 | * Free a region of KVA allocated by alloc_vmap_area | ||
418 | */ | ||
419 | static void free_vmap_area(struct vmap_area *va) | ||
420 | { | ||
421 | spin_lock(&vmap_area_lock); | ||
422 | __free_vmap_area(va); | ||
423 | spin_unlock(&vmap_area_lock); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Clear the pagetable entries of a given vmap_area | ||
428 | */ | ||
429 | static void unmap_vmap_area(struct vmap_area *va) | ||
430 | { | ||
431 | vunmap_page_range(va->va_start, va->va_end); | ||
432 | } | ||
433 | |||
434 | /* | ||
435 | * lazy_max_pages is the maximum amount of virtual address space we gather up | ||
436 | * before attempting to purge with a TLB flush. | ||
437 | * | ||
438 | * There is a tradeoff here: a larger number will cover more kernel page tables | ||
439 | * and take slightly longer to purge, but it will linearly reduce the number of | ||
440 | * global TLB flushes that must be performed. It would seem natural to scale | ||
441 | * this number up linearly with the number of CPUs (because vmapping activity | ||
442 | * could also scale linearly with the number of CPUs), however it is likely | ||
443 | * that in practice, workloads might be constrained in other ways that mean | ||
444 | * vmap activity will not scale linearly with CPUs. Also, I want to be | ||
445 | * conservative and not introduce a big latency on huge systems, so go with | ||
446 | * a less aggressive log scale. It will still be an improvement over the old | ||
447 | * code, and it will be simple to change the scale factor if we find that it | ||
448 | * becomes a problem on bigger systems. | ||
449 | */ | ||
450 | static unsigned long lazy_max_pages(void) | ||
451 | { | ||
452 | unsigned int log; | ||
453 | |||
454 | log = fls(num_online_cpus()); | ||
455 | |||
456 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | ||
457 | } | ||
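To make the log scale above concrete, a worked example under assumed values (16 online CPUs, 4 KiB pages); the numbers are illustrative and not part of the patch:

        num_online_cpus() = 16, PAGE_SIZE = 4 KiB        (assumed)
        log = fls(16) = 5
        lazy_max_pages() = 5 * (32 MiB / 4 KiB) = 5 * 8192 = 40960 pages ~= 160 MiB

So roughly 160 MiB of lazily freed kva can accumulate before a purge on such a machine, versus 32 MiB on a single-CPU system (fls(1) = 1).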
458 | |||
459 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | ||
460 | |||
461 | /* | ||
462 | * Purges all lazily-freed vmap areas. | ||
463 | * | ||
464 | * If sync is 0 then don't purge if there is already a purge in progress. | ||
465 | * If force_flush is 1, then flush kernel TLBs between *start and *end even | ||
466 | * if we found no lazy vmap areas to unmap (callers can use this to optimise | ||
467 | * their own TLB flushing). | ||
468 | * Returns with *start = min(*start, lowest purged address) | ||
469 | * *end = max(*end, highest purged address) | ||
470 | */ | ||
471 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | ||
472 | int sync, int force_flush) | ||
473 | { | ||
474 | static DEFINE_SPINLOCK(purge_lock); | ||
475 | LIST_HEAD(valist); | ||
476 | struct vmap_area *va; | ||
477 | int nr = 0; | ||
478 | |||
479 | /* | ||
480 | * If sync is 0 but force_flush is 1, we'll go sync anyway but callers | ||
481 | * should not expect such behaviour. This just simplifies locking for | ||
482 | * the case that isn't actually used at the moment anyway. | ||
483 | */ | ||
484 | if (!sync && !force_flush) { | ||
485 | if (!spin_trylock(&purge_lock)) | ||
486 | return; | ||
487 | } else | ||
488 | spin_lock(&purge_lock); | ||
489 | |||
490 | rcu_read_lock(); | ||
491 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | ||
492 | if (va->flags & VM_LAZY_FREE) { | ||
493 | if (va->va_start < *start) | ||
494 | *start = va->va_start; | ||
495 | if (va->va_end > *end) | ||
496 | *end = va->va_end; | ||
497 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | ||
498 | unmap_vmap_area(va); | ||
499 | list_add_tail(&va->purge_list, &valist); | ||
500 | va->flags |= VM_LAZY_FREEING; | ||
501 | va->flags &= ~VM_LAZY_FREE; | ||
502 | } | ||
503 | } | ||
504 | rcu_read_unlock(); | ||
505 | |||
506 | if (nr) { | ||
507 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
508 | atomic_sub(nr, &vmap_lazy_nr); | ||
509 | } | ||
510 | |||
511 | if (nr || force_flush) | ||
512 | flush_tlb_kernel_range(*start, *end); | ||
513 | |||
514 | if (nr) { | ||
515 | spin_lock(&vmap_area_lock); | ||
516 | list_for_each_entry(va, &valist, purge_list) | ||
517 | __free_vmap_area(va); | ||
518 | spin_unlock(&vmap_area_lock); | ||
519 | } | ||
520 | spin_unlock(&purge_lock); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Kick off a purge of the outstanding lazy areas. | ||
525 | */ | ||
526 | static void purge_vmap_area_lazy(void) | ||
527 | { | ||
528 | unsigned long start = ULONG_MAX, end = 0; | ||
529 | |||
530 | __purge_vmap_area_lazy(&start, &end, 0, 0); | ||
531 | } | ||
532 | |||
533 | /* | ||
534 | * Free and unmap a vmap area | ||
535 | */ | ||
536 | static void free_unmap_vmap_area(struct vmap_area *va) | ||
537 | { | ||
538 | va->flags |= VM_LAZY_FREE; | ||
539 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | ||
540 | if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) | ||
541 | purge_vmap_area_lazy(); | ||
542 | } | ||
543 | |||
544 | static struct vmap_area *find_vmap_area(unsigned long addr) | ||
545 | { | ||
546 | struct vmap_area *va; | ||
547 | |||
548 | spin_lock(&vmap_area_lock); | ||
549 | va = __find_vmap_area(addr); | ||
550 | spin_unlock(&vmap_area_lock); | ||
551 | |||
552 | return va; | ||
553 | } | ||
554 | |||
555 | static void free_unmap_vmap_area_addr(unsigned long addr) | ||
556 | { | ||
557 | struct vmap_area *va; | ||
558 | |||
559 | va = find_vmap_area(addr); | ||
560 | BUG_ON(!va); | ||
561 | free_unmap_vmap_area(va); | ||
562 | } | ||
563 | |||
564 | |||
565 | /*** Per cpu kva allocator ***/ | ||
566 | |||
567 | /* | ||
568 | * vmap space is limited especially on 32 bit architectures. Ensure there is | ||
569 | * room for at least 16 percpu vmap blocks per CPU. | ||
570 | */ | ||
571 | /* | ||
572 | * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able | ||
573 | * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess | ||
574 | * instead (we just need a rough idea) | ||
575 | */ | ||
576 | #if BITS_PER_LONG == 32 | ||
577 | #define VMALLOC_SPACE (128UL*1024*1024) | ||
578 | #else | ||
579 | #define VMALLOC_SPACE (128UL*1024*1024*1024) | ||
580 | #endif | ||
581 | |||
582 | #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) | ||
583 | #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ | ||
584 | #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ | ||
585 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | ||
586 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | ||
587 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | ||
588 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | ||
589 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | ||
590 | VMALLOC_PAGES / NR_CPUS / 16)) | ||
591 | |||
592 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | ||
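A worked example of the sizing clamp above, under assumed configurations (4 KiB pages; the CPU counts are illustrative, not taken from the patch):

        32-bit, NR_CPUS = 4:   VMALLOC_PAGES = 128 MiB / 4 KiB = 32768
                               32768 / 4 / 16 = 512, inside [VMAP_BBMAP_BITS_MIN = 64, VMAP_BBMAP_BITS_MAX = 1024]
                               VMAP_BBMAP_BITS = 512, VMAP_BLOCK_SIZE = 2 MiB
        64-bit, NR_CPUS = 64:  VMALLOC_PAGES = 128 GiB / 4 KiB = 33554432
                               33554432 / 64 / 16 = 32768, clamped to VMAP_BBMAP_BITS_MAX = 1024
                               VMAP_BBMAP_BITS = 1024, VMAP_BLOCK_SIZE = 4 MiB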
593 | |||
594 | struct vmap_block_queue { | ||
595 | spinlock_t lock; | ||
596 | struct list_head free; | ||
597 | struct list_head dirty; | ||
598 | unsigned int nr_dirty; | ||
599 | }; | ||
600 | |||
601 | struct vmap_block { | ||
602 | spinlock_t lock; | ||
603 | struct vmap_area *va; | ||
604 | struct vmap_block_queue *vbq; | ||
605 | unsigned long free, dirty; | ||
606 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
607 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | ||
608 | union { | ||
609 | struct { | ||
610 | struct list_head free_list; | ||
611 | struct list_head dirty_list; | ||
612 | }; | ||
613 | struct rcu_head rcu_head; | ||
614 | }; | ||
615 | }; | ||
616 | |||
617 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | ||
618 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); | ||
619 | |||
620 | /* | ||
621 | * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block | ||
622 | * in the free path. Could get rid of this if we change the API to return a | ||
623 | * "cookie" from alloc, to be passed to free. But no big deal yet. | ||
624 | */ | ||
625 | static DEFINE_SPINLOCK(vmap_block_tree_lock); | ||
626 | static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); | ||
627 | |||
628 | /* | ||
629 | * We should probably have a fallback mechanism to allocate virtual memory | ||
630 | * out of partially filled vmap blocks. However vmap block sizing should be | ||
631 | * fairly reasonable according to the vmalloc size, so it shouldn't be a | ||
632 | * big problem. | ||
633 | */ | ||
634 | |||
635 | static unsigned long addr_to_vb_idx(unsigned long addr) | ||
636 | { | ||
637 | addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); | ||
638 | addr /= VMAP_BLOCK_SIZE; | ||
639 | return addr; | ||
640 | } | ||
641 | |||
642 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | ||
643 | { | ||
644 | struct vmap_block_queue *vbq; | ||
645 | struct vmap_block *vb; | ||
646 | struct vmap_area *va; | ||
647 | unsigned long vb_idx; | ||
648 | int node, err; | ||
649 | |||
650 | node = numa_node_id(); | ||
651 | |||
652 | vb = kmalloc_node(sizeof(struct vmap_block), | ||
653 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
654 | if (unlikely(!vb)) | ||
655 | return ERR_PTR(-ENOMEM); | ||
656 | |||
657 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | ||
658 | VMALLOC_START, VMALLOC_END, | ||
659 | node, gfp_mask); | ||
660 | if (unlikely(IS_ERR(va))) { | ||
661 | kfree(vb); | ||
662 | return ERR_PTR(PTR_ERR(va)); | ||
663 | } | ||
664 | |||
665 | err = radix_tree_preload(gfp_mask); | ||
666 | if (unlikely(err)) { | ||
667 | kfree(vb); | ||
668 | free_vmap_area(va); | ||
669 | return ERR_PTR(err); | ||
670 | } | ||
671 | |||
672 | spin_lock_init(&vb->lock); | ||
673 | vb->va = va; | ||
674 | vb->free = VMAP_BBMAP_BITS; | ||
675 | vb->dirty = 0; | ||
676 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
677 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | ||
678 | INIT_LIST_HEAD(&vb->free_list); | ||
679 | INIT_LIST_HEAD(&vb->dirty_list); | ||
680 | |||
681 | vb_idx = addr_to_vb_idx(va->va_start); | ||
682 | spin_lock(&vmap_block_tree_lock); | ||
683 | err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); | ||
684 | spin_unlock(&vmap_block_tree_lock); | ||
685 | BUG_ON(err); | ||
686 | radix_tree_preload_end(); | ||
687 | |||
688 | vbq = &get_cpu_var(vmap_block_queue); | ||
689 | vb->vbq = vbq; | ||
690 | spin_lock(&vbq->lock); | ||
691 | list_add(&vb->free_list, &vbq->free); | ||
692 | spin_unlock(&vbq->lock); | ||
693 | put_cpu_var(vmap_cpu_blocks); | ||
694 | |||
695 | return vb; | ||
696 | } | ||
697 | |||
698 | static void rcu_free_vb(struct rcu_head *head) | ||
699 | { | ||
700 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
701 | |||
702 | kfree(vb); | ||
703 | } | ||
704 | |||
705 | static void free_vmap_block(struct vmap_block *vb) | ||
706 | { | ||
707 | struct vmap_block *tmp; | ||
708 | unsigned long vb_idx; | ||
709 | |||
710 | spin_lock(&vb->vbq->lock); | ||
711 | if (!list_empty(&vb->free_list)) | ||
712 | list_del(&vb->free_list); | ||
713 | if (!list_empty(&vb->dirty_list)) | ||
714 | list_del(&vb->dirty_list); | ||
715 | spin_unlock(&vb->vbq->lock); | ||
716 | |||
717 | vb_idx = addr_to_vb_idx(vb->va->va_start); | ||
718 | spin_lock(&vmap_block_tree_lock); | ||
719 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | ||
720 | spin_unlock(&vmap_block_tree_lock); | ||
721 | BUG_ON(tmp != vb); | ||
722 | |||
723 | free_unmap_vmap_area(vb->va); | ||
724 | call_rcu(&vb->rcu_head, rcu_free_vb); | ||
725 | } | ||
726 | |||
727 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | ||
728 | { | ||
729 | struct vmap_block_queue *vbq; | ||
730 | struct vmap_block *vb; | ||
731 | unsigned long addr = 0; | ||
732 | unsigned int order; | ||
733 | |||
734 | BUG_ON(size & ~PAGE_MASK); | ||
735 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
736 | order = get_order(size); | ||
737 | |||
738 | again: | ||
739 | rcu_read_lock(); | ||
740 | vbq = &get_cpu_var(vmap_block_queue); | ||
741 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
742 | int i; | ||
743 | |||
744 | spin_lock(&vb->lock); | ||
745 | i = bitmap_find_free_region(vb->alloc_map, | ||
746 | VMAP_BBMAP_BITS, order); | ||
747 | |||
748 | if (i >= 0) { | ||
749 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
750 | BUG_ON(addr_to_vb_idx(addr) != | ||
751 | addr_to_vb_idx(vb->va->va_start)); | ||
752 | vb->free -= 1UL << order; | ||
753 | if (vb->free == 0) { | ||
754 | spin_lock(&vbq->lock); | ||
755 | list_del_init(&vb->free_list); | ||
756 | spin_unlock(&vbq->lock); | ||
757 | } | ||
758 | spin_unlock(&vb->lock); | ||
759 | break; | ||
760 | } | ||
761 | spin_unlock(&vb->lock); | ||
762 | } | ||
763 | put_cpu_var(vmap_cpu_blocks); | ||
764 | rcu_read_unlock(); | ||
765 | |||
766 | if (!addr) { | ||
767 | vb = new_vmap_block(gfp_mask); | ||
768 | if (IS_ERR(vb)) | ||
769 | return vb; | ||
770 | goto again; | ||
771 | } | ||
772 | |||
773 | return (void *)addr; | ||
774 | } | ||
775 | |||
776 | static void vb_free(const void *addr, unsigned long size) | ||
777 | { | ||
778 | unsigned long offset; | ||
779 | unsigned long vb_idx; | ||
780 | unsigned int order; | ||
781 | struct vmap_block *vb; | ||
782 | |||
783 | BUG_ON(size & ~PAGE_MASK); | ||
784 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
785 | order = get_order(size); | ||
786 | |||
787 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | ||
788 | |||
789 | vb_idx = addr_to_vb_idx((unsigned long)addr); | ||
790 | rcu_read_lock(); | ||
791 | vb = radix_tree_lookup(&vmap_block_tree, vb_idx); | ||
792 | rcu_read_unlock(); | ||
793 | BUG_ON(!vb); | ||
794 | |||
795 | spin_lock(&vb->lock); | ||
796 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | ||
797 | if (!vb->dirty) { | ||
798 | spin_lock(&vb->vbq->lock); | ||
799 | list_add(&vb->dirty_list, &vb->vbq->dirty); | ||
800 | spin_unlock(&vb->vbq->lock); | ||
801 | } | ||
802 | vb->dirty += 1UL << order; | ||
803 | if (vb->dirty == VMAP_BBMAP_BITS) { | ||
804 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | ||
805 | spin_unlock(&vb->lock); | ||
806 | free_vmap_block(vb); | ||
807 | } else | ||
808 | spin_unlock(&vb->lock); | ||
809 | } | ||
810 | |||
811 | /** | ||
812 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
813 | * | ||
814 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
815 | * to amortize TLB flushing overheads. What this means is that any page you | ||
816 | * have now, may, in a former life, have been mapped into kernel virtual | ||
817 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
818 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
819 | * | ||
820 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
821 | * be sure that none of the pages we have control over will have any aliases | ||
822 | * from the vmap layer. | ||
823 | */ | ||
824 | void vm_unmap_aliases(void) | ||
825 | { | ||
826 | unsigned long start = ULONG_MAX, end = 0; | ||
827 | int cpu; | ||
828 | int flush = 0; | ||
829 | |||
830 | for_each_possible_cpu(cpu) { | ||
831 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
832 | struct vmap_block *vb; | ||
833 | |||
834 | rcu_read_lock(); | ||
835 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
836 | int i; | ||
837 | |||
838 | spin_lock(&vb->lock); | ||
839 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | ||
840 | while (i < VMAP_BBMAP_BITS) { | ||
841 | unsigned long s, e; | ||
842 | int j; | ||
843 | j = find_next_zero_bit(vb->dirty_map, | ||
844 | VMAP_BBMAP_BITS, i); | ||
845 | |||
846 | s = vb->va->va_start + (i << PAGE_SHIFT); | ||
847 | e = vb->va->va_start + (j << PAGE_SHIFT); | ||
848 | vunmap_page_range(s, e); | ||
849 | flush = 1; | ||
850 | |||
851 | if (s < start) | ||
852 | start = s; | ||
853 | if (e > end) | ||
854 | end = e; | ||
855 | |||
856 | i = j; | ||
857 | i = find_next_bit(vb->dirty_map, | ||
858 | VMAP_BBMAP_BITS, i); | ||
859 | } | ||
860 | spin_unlock(&vb->lock); | ||
861 | } | ||
862 | rcu_read_unlock(); | ||
863 | } | ||
864 | |||
865 | __purge_vmap_area_lazy(&start, &end, 1, flush); | ||
866 | } | ||
867 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
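As a usage illustration of the guarantee described above; the caller below is hypothetical and not part of the patch:

/*
 * Hypothetical caller: make sure no lazily kept kernel aliases of 'page'
 * are still mapped before the page's physical address is handed to a
 * device that must not see stale mappings.
 */
static void hand_page_to_device(struct page *page)
{
        vm_unmap_aliases();
        /* ... now program the device with page_to_phys(page) ... */
}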
868 | |||
869 | /** | ||
870 | * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram | ||
871 | * @mem: the pointer returned by vm_map_ram | ||
872 | * @count: the count passed to that vm_map_ram call (cannot unmap partial) | ||
873 | */ | ||
874 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
875 | { | ||
876 | unsigned long size = count << PAGE_SHIFT; | ||
877 | unsigned long addr = (unsigned long)mem; | ||
878 | |||
879 | BUG_ON(!addr); | ||
880 | BUG_ON(addr < VMALLOC_START); | ||
881 | BUG_ON(addr > VMALLOC_END); | ||
882 | BUG_ON(addr & (PAGE_SIZE-1)); | ||
883 | |||
884 | debug_check_no_locks_freed(mem, size); | ||
885 | |||
886 | if (likely(count <= VMAP_MAX_ALLOC)) | ||
887 | vb_free(mem, size); | ||
888 | else | ||
889 | free_unmap_vmap_area_addr(addr); | ||
890 | } | ||
891 | EXPORT_SYMBOL(vm_unmap_ram); | ||
892 | |||
893 | /** | ||
894 | * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) | ||
895 | * @pages: an array of pointers to the pages to be mapped | ||
896 | * @count: number of pages | ||
897 | * @node: prefer to allocate data structures on this node | ||
898 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | ||
899 | * @returns: a pointer to the address that has been mapped, or NULL on failure | ||
900 | */ | ||
901 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
902 | { | ||
903 | unsigned long size = count << PAGE_SHIFT; | ||
904 | unsigned long addr; | ||
905 | void *mem; | ||
906 | |||
907 | if (likely(count <= VMAP_MAX_ALLOC)) { | ||
908 | mem = vb_alloc(size, GFP_KERNEL); | ||
909 | if (IS_ERR(mem)) | ||
910 | return NULL; | ||
911 | addr = (unsigned long)mem; | ||
912 | } else { | ||
913 | struct vmap_area *va; | ||
914 | va = alloc_vmap_area(size, PAGE_SIZE, | ||
915 | VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); | ||
916 | if (IS_ERR(va)) | ||
917 | return NULL; | ||
918 | |||
919 | addr = va->va_start; | ||
920 | mem = (void *)addr; | ||
921 | } | ||
922 | if (vmap_page_range(addr, addr + size, prot, pages) < 0) { | ||
923 | vm_unmap_ram(mem, count); | ||
924 | return NULL; | ||
925 | } | ||
926 | return mem; | ||
927 | } | ||
928 | EXPORT_SYMBOL(vm_map_ram); | ||
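A minimal usage sketch of the vm_map_ram()/vm_unmap_ram() pairing documented above; the caller is hypothetical and error handling is trimmed:

/*
 * Hypothetical example: map two pages contiguously in vmalloc space,
 * copy page 1 into page 0 through the linear mapping, then unmap with
 * the same count that was passed to vm_map_ram().
 */
static void copy_page1_to_page0(struct page *pages[2])
{
        void *va = vm_map_ram(pages, 2, numa_node_id(), PAGE_KERNEL);

        if (!va)
                return;
        memcpy(va, va + PAGE_SIZE, PAGE_SIZE);
        vm_unmap_ram(va, 2);
}

Small mappings like this (count <= VMAP_MAX_ALLOC) take the per-CPU vmap-block path; larger ones fall back to alloc_vmap_area() directly.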
929 | |||
930 | void __init vmalloc_init(void) | ||
931 | { | ||
932 | int i; | ||
933 | |||
934 | for_each_possible_cpu(i) { | ||
935 | struct vmap_block_queue *vbq; | ||
936 | |||
937 | vbq = &per_cpu(vmap_block_queue, i); | ||
938 | spin_lock_init(&vbq->lock); | ||
939 | INIT_LIST_HEAD(&vbq->free); | ||
940 | INIT_LIST_HEAD(&vbq->dirty); | ||
941 | vbq->nr_dirty = 0; | ||
942 | } | ||
943 | } | ||
944 | |||
945 | void unmap_kernel_range(unsigned long addr, unsigned long size) | ||
946 | { | ||
947 | unsigned long end = addr + size; | ||
948 | vunmap_page_range(addr, end); | ||
949 | flush_tlb_kernel_range(addr, end); | ||
950 | } | ||
951 | |||
952 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
953 | { | ||
954 | unsigned long addr = (unsigned long)area->addr; | ||
955 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
956 | int err; | ||
957 | |||
958 | err = vmap_page_range(addr, end, prot, *pages); | ||
959 | if (err > 0) { | ||
960 | *pages += err; | ||
961 | err = 0; | ||
962 | } | ||
963 | |||
964 | return err; | ||
965 | } | ||
966 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
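For reference, a sketch of how a caller drives the exported map_vm_area() interface; the helper below is hypothetical (it roughly mirrors what vmap() does) and assumes the pages are already allocated:

/*
 * Hypothetical helper: map 'count' pages into a fresh vm area.
 * map_vm_area() advances the page cursor as it maps, so pass a local
 * copy of the array pointer.
 */
static void *map_pages_example(struct page **pages, unsigned int count)
{
        struct vm_struct *area;
        struct page **cursor = pages;

        area = get_vm_area((unsigned long)count << PAGE_SHIFT, VM_MAP);
        if (!area)
                return NULL;
        if (map_vm_area(area, PAGE_KERNEL, &cursor)) {
                vunmap(area->addr);
                return NULL;
        }
        return area->addr;
}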
967 | |||
968 | /*** Old vmalloc interfaces ***/ | ||
969 | DEFINE_RWLOCK(vmlist_lock); | ||
970 | struct vm_struct *vmlist; | ||
971 | |||
972 | static struct vm_struct *__get_vm_area_node(unsigned long size, | ||
973 | unsigned long flags, unsigned long start, unsigned long end, | ||
974 | int node, gfp_t gfp_mask, void *caller) | ||
975 | { | ||
976 | static struct vmap_area *va; | ||
977 | struct vm_struct *area; | ||
978 | struct vm_struct *tmp, **p; | ||
979 | unsigned long align = 1; | ||
223 | 980 | ||
224 | BUG_ON(in_interrupt()); | 981 | BUG_ON(in_interrupt()); |
225 | if (flags & VM_IOREMAP) { | 982 | if (flags & VM_IOREMAP) { |
@@ -232,13 +989,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
232 | 989 | ||
233 | align = 1ul << bit; | 990 | align = 1ul << bit; |
234 | } | 991 | } |
235 | addr = ALIGN(start, align); | 992 | |
236 | size = PAGE_ALIGN(size); | 993 | size = PAGE_ALIGN(size); |
237 | if (unlikely(!size)) | 994 | if (unlikely(!size)) |
238 | return NULL; | 995 | return NULL; |
239 | 996 | ||
240 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 997 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
241 | |||
242 | if (unlikely(!area)) | 998 | if (unlikely(!area)) |
243 | return NULL; | 999 | return NULL; |
244 | 1000 | ||
@@ -247,48 +1003,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
247 | */ | 1003 | */ |
248 | size += PAGE_SIZE; | 1004 | size += PAGE_SIZE; |
249 | 1005 | ||
250 | write_lock(&vmlist_lock); | 1006 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
251 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | 1007 | if (IS_ERR(va)) { |
252 | if ((unsigned long)tmp->addr < addr) { | 1008 | kfree(area); |
253 | if((unsigned long)tmp->addr + tmp->size >= addr) | 1009 | return NULL; |
254 | addr = ALIGN(tmp->size + | ||
255 | (unsigned long)tmp->addr, align); | ||
256 | continue; | ||
257 | } | ||
258 | if ((size + addr) < addr) | ||
259 | goto out; | ||
260 | if (size + addr <= (unsigned long)tmp->addr) | ||
261 | goto found; | ||
262 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
263 | if (addr > end - size) | ||
264 | goto out; | ||
265 | } | 1010 | } |
266 | if ((size + addr) < addr) | ||
267 | goto out; | ||
268 | if (addr > end - size) | ||
269 | goto out; | ||
270 | |||
271 | found: | ||
272 | area->next = *p; | ||
273 | *p = area; | ||
274 | 1011 | ||
275 | area->flags = flags; | 1012 | area->flags = flags; |
276 | area->addr = (void *)addr; | 1013 | area->addr = (void *)va->va_start; |
277 | area->size = size; | 1014 | area->size = size; |
278 | area->pages = NULL; | 1015 | area->pages = NULL; |
279 | area->nr_pages = 0; | 1016 | area->nr_pages = 0; |
280 | area->phys_addr = 0; | 1017 | area->phys_addr = 0; |
281 | area->caller = caller; | 1018 | area->caller = caller; |
1019 | va->private = area; | ||
1020 | va->flags |= VM_VM_AREA; | ||
1021 | |||
1022 | write_lock(&vmlist_lock); | ||
1023 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
1024 | if (tmp->addr >= area->addr) | ||
1025 | break; | ||
1026 | } | ||
1027 | area->next = *p; | ||
1028 | *p = area; | ||
282 | write_unlock(&vmlist_lock); | 1029 | write_unlock(&vmlist_lock); |
283 | 1030 | ||
284 | return area; | 1031 | return area; |
285 | |||
286 | out: | ||
287 | write_unlock(&vmlist_lock); | ||
288 | kfree(area); | ||
289 | if (printk_ratelimit()) | ||
290 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
291 | return NULL; | ||
292 | } | 1032 | } |
293 | 1033 | ||
294 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1034 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
@@ -328,39 +1068,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
328 | gfp_mask, __builtin_return_address(0)); | 1068 | gfp_mask, __builtin_return_address(0)); |
329 | } | 1069 | } |
330 | 1070 | ||
331 | /* Caller must hold vmlist_lock */ | 1071 | static struct vm_struct *find_vm_area(const void *addr) |
332 | static struct vm_struct *__find_vm_area(const void *addr) | ||
333 | { | 1072 | { |
334 | struct vm_struct *tmp; | 1073 | struct vmap_area *va; |
335 | |||
336 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
337 | if (tmp->addr == addr) | ||
338 | break; | ||
339 | } | ||
340 | |||
341 | return tmp; | ||
342 | } | ||
343 | 1074 | ||
344 | /* Caller must hold vmlist_lock */ | 1075 | va = find_vmap_area((unsigned long)addr); |
345 | static struct vm_struct *__remove_vm_area(const void *addr) | 1076 | if (va && va->flags & VM_VM_AREA) |
346 | { | 1077 | return va->private; |
347 | struct vm_struct **p, *tmp; | ||
348 | 1078 | ||
349 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
350 | if (tmp->addr == addr) | ||
351 | goto found; | ||
352 | } | ||
353 | return NULL; | 1079 | return NULL; |
354 | |||
355 | found: | ||
356 | unmap_vm_area(tmp); | ||
357 | *p = tmp->next; | ||
358 | |||
359 | /* | ||
360 | * Remove the guard page. | ||
361 | */ | ||
362 | tmp->size -= PAGE_SIZE; | ||
363 | return tmp; | ||
364 | } | 1080 | } |
365 | 1081 | ||
366 | /** | 1082 | /** |
@@ -373,11 +1089,24 @@ found: | |||
373 | */ | 1089 | */ |
374 | struct vm_struct *remove_vm_area(const void *addr) | 1090 | struct vm_struct *remove_vm_area(const void *addr) |
375 | { | 1091 | { |
376 | struct vm_struct *v; | 1092 | struct vmap_area *va; |
377 | write_lock(&vmlist_lock); | 1093 | |
378 | v = __remove_vm_area(addr); | 1094 | va = find_vmap_area((unsigned long)addr); |
379 | write_unlock(&vmlist_lock); | 1095 | if (va && va->flags & VM_VM_AREA) { |
380 | return v; | 1096 | struct vm_struct *vm = va->private; |
1097 | struct vm_struct *tmp, **p; | ||
1098 | free_unmap_vmap_area(va); | ||
1099 | vm->size -= PAGE_SIZE; | ||
1100 | |||
1101 | write_lock(&vmlist_lock); | ||
1102 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
1103 | ; | ||
1104 | *p = tmp->next; | ||
1105 | write_unlock(&vmlist_lock); | ||
1106 | |||
1107 | return vm; | ||
1108 | } | ||
1109 | return NULL; | ||
381 | } | 1110 | } |
382 | 1111 | ||
383 | static void __vunmap(const void *addr, int deallocate_pages) | 1112 | static void __vunmap(const void *addr, int deallocate_pages) |
@@ -487,6 +1216,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
487 | } | 1216 | } |
488 | EXPORT_SYMBOL(vmap); | 1217 | EXPORT_SYMBOL(vmap); |
489 | 1218 | ||
1219 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
1220 | int node, void *caller); | ||
490 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1221 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
491 | pgprot_t prot, int node, void *caller) | 1222 | pgprot_t prot, int node, void *caller) |
492 | { | 1223 | { |
@@ -613,10 +1344,8 @@ void *vmalloc_user(unsigned long size) | |||
613 | 1344 | ||
614 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1345 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
615 | if (ret) { | 1346 | if (ret) { |
616 | write_lock(&vmlist_lock); | 1347 | area = find_vm_area(ret); |
617 | area = __find_vm_area(ret); | ||
618 | area->flags |= VM_USERMAP; | 1348 | area->flags |= VM_USERMAP; |
619 | write_unlock(&vmlist_lock); | ||
620 | } | 1349 | } |
621 | return ret; | 1350 | return ret; |
622 | } | 1351 | } |
@@ -696,10 +1425,8 @@ void *vmalloc_32_user(unsigned long size) | |||
696 | 1425 | ||
697 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1426 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); |
698 | if (ret) { | 1427 | if (ret) { |
699 | write_lock(&vmlist_lock); | 1428 | area = find_vm_area(ret); |
700 | area = __find_vm_area(ret); | ||
701 | area->flags |= VM_USERMAP; | 1429 | area->flags |= VM_USERMAP; |
702 | write_unlock(&vmlist_lock); | ||
703 | } | 1430 | } |
704 | return ret; | 1431 | return ret; |
705 | } | 1432 | } |
@@ -800,26 +1527,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
800 | struct vm_struct *area; | 1527 | struct vm_struct *area; |
801 | unsigned long uaddr = vma->vm_start; | 1528 | unsigned long uaddr = vma->vm_start; |
802 | unsigned long usize = vma->vm_end - vma->vm_start; | 1529 | unsigned long usize = vma->vm_end - vma->vm_start; |
803 | int ret; | ||
804 | 1530 | ||
805 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 1531 | if ((PAGE_SIZE-1) & (unsigned long)addr) |
806 | return -EINVAL; | 1532 | return -EINVAL; |
807 | 1533 | ||
808 | read_lock(&vmlist_lock); | 1534 | area = find_vm_area(addr); |
809 | area = __find_vm_area(addr); | ||
810 | if (!area) | 1535 | if (!area) |
811 | goto out_einval_locked; | 1536 | return -EINVAL; |
812 | 1537 | ||
813 | if (!(area->flags & VM_USERMAP)) | 1538 | if (!(area->flags & VM_USERMAP)) |
814 | goto out_einval_locked; | 1539 | return -EINVAL; |
815 | 1540 | ||
816 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 1541 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) |
817 | goto out_einval_locked; | 1542 | return -EINVAL; |
818 | read_unlock(&vmlist_lock); | ||
819 | 1543 | ||
820 | addr += pgoff << PAGE_SHIFT; | 1544 | addr += pgoff << PAGE_SHIFT; |
821 | do { | 1545 | do { |
822 | struct page *page = vmalloc_to_page(addr); | 1546 | struct page *page = vmalloc_to_page(addr); |
1547 | int ret; | ||
1548 | |||
823 | ret = vm_insert_page(vma, uaddr, page); | 1549 | ret = vm_insert_page(vma, uaddr, page); |
824 | if (ret) | 1550 | if (ret) |
825 | return ret; | 1551 | return ret; |
@@ -832,11 +1558,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
832 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 1558 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ |
833 | vma->vm_flags |= VM_RESERVED; | 1559 | vma->vm_flags |= VM_RESERVED; |
834 | 1560 | ||
835 | return ret; | 1561 | return 0; |
836 | |||
837 | out_einval_locked: | ||
838 | read_unlock(&vmlist_lock); | ||
839 | return -EINVAL; | ||
840 | } | 1562 | } |
841 | EXPORT_SYMBOL(remap_vmalloc_range); | 1563 | EXPORT_SYMBOL(remap_vmalloc_range); |
842 | 1564 | ||