diff options
author | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400 |
---|---|---|
committer | Ingo Molnar <mingo@elte.hu> | 2008-10-28 11:26:12 -0400 |
commit | 7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch) | |
tree | e730a4565e0318140d2fbd2f0415d18a339d7336 /mm/vmalloc.c | |
parent | 41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff) | |
parent | 0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff) |
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r-- | mm/vmalloc.c | 1056 |
1 files changed, 917 insertions, 139 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 6e45b0f3d125..036536945dd9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c | |||
@@ -8,26 +8,28 @@ | |||
8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/vmalloc.h> | ||
11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
13 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
15 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
18 | #include <linux/proc_fs.h> | ||
17 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
18 | #include <linux/debugobjects.h> | 20 | #include <linux/debugobjects.h> |
19 | #include <linux/vmalloc.h> | ||
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
22 | #include <linux/list.h> | ||
23 | #include <linux/rbtree.h> | ||
24 | #include <linux/radix-tree.h> | ||
25 | #include <linux/rcupdate.h> | ||
21 | 26 | ||
27 | #include <asm/atomic.h> | ||
22 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
23 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
24 | 30 | ||
25 | 31 | ||
26 | DEFINE_RWLOCK(vmlist_lock); | 32 | /*** Page table manipulation functions ***/ |
27 | struct vm_struct *vmlist; | ||
28 | |||
29 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
30 | int node, void *caller); | ||
31 | 33 | ||
32 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 34 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
33 | { | 35 | { |
@@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | |||
40 | } while (pte++, addr += PAGE_SIZE, addr != end); | 42 | } while (pte++, addr += PAGE_SIZE, addr != end); |
41 | } | 43 | } |
42 | 44 | ||
43 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | 45 | static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
44 | unsigned long end) | ||
45 | { | 46 | { |
46 | pmd_t *pmd; | 47 | pmd_t *pmd; |
47 | unsigned long next; | 48 | unsigned long next; |
@@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | |||
55 | } while (pmd++, addr = next, addr != end); | 56 | } while (pmd++, addr = next, addr != end); |
56 | } | 57 | } |
57 | 58 | ||
58 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | 59 | static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
59 | unsigned long end) | ||
60 | { | 60 | { |
61 | pud_t *pud; | 61 | pud_t *pud; |
62 | unsigned long next; | 62 | unsigned long next; |
@@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
70 | } while (pud++, addr = next, addr != end); | 70 | } while (pud++, addr = next, addr != end); |
71 | } | 71 | } |
72 | 72 | ||
73 | void unmap_kernel_range(unsigned long addr, unsigned long size) | 73 | static void vunmap_page_range(unsigned long addr, unsigned long end) |
74 | { | 74 | { |
75 | pgd_t *pgd; | 75 | pgd_t *pgd; |
76 | unsigned long next; | 76 | unsigned long next; |
77 | unsigned long start = addr; | ||
78 | unsigned long end = addr + size; | ||
79 | 77 | ||
80 | BUG_ON(addr >= end); | 78 | BUG_ON(addr >= end); |
81 | pgd = pgd_offset_k(addr); | 79 | pgd = pgd_offset_k(addr); |
@@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
86 | continue; | 84 | continue; |
87 | vunmap_pud_range(pgd, addr, next); | 85 | vunmap_pud_range(pgd, addr, next); |
88 | } while (pgd++, addr = next, addr != end); | 86 | } while (pgd++, addr = next, addr != end); |
89 | flush_tlb_kernel_range(start, end); | ||
90 | } | ||
91 | |||
92 | static void unmap_vm_area(struct vm_struct *area) | ||
93 | { | ||
94 | unmap_kernel_range((unsigned long)area->addr, area->size); | ||
95 | } | 87 | } |
96 | 88 | ||
97 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | 89 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
98 | unsigned long end, pgprot_t prot, struct page ***pages) | 90 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
99 | { | 91 | { |
100 | pte_t *pte; | 92 | pte_t *pte; |
101 | 93 | ||
94 | /* | ||
95 | * nr is a running index into the array which helps higher level | ||
96 | * callers keep track of where we're up to. | ||
97 | */ | ||
98 | |||
102 | pte = pte_alloc_kernel(pmd, addr); | 99 | pte = pte_alloc_kernel(pmd, addr); |
103 | if (!pte) | 100 | if (!pte) |
104 | return -ENOMEM; | 101 | return -ENOMEM; |
105 | do { | 102 | do { |
106 | struct page *page = **pages; | 103 | struct page *page = pages[*nr]; |
107 | WARN_ON(!pte_none(*pte)); | 104 | |
108 | if (!page) | 105 | if (WARN_ON(!pte_none(*pte))) |
106 | return -EBUSY; | ||
107 | if (WARN_ON(!page)) | ||
109 | return -ENOMEM; | 108 | return -ENOMEM; |
110 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | 109 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
111 | (*pages)++; | 110 | (*nr)++; |
112 | } while (pte++, addr += PAGE_SIZE, addr != end); | 111 | } while (pte++, addr += PAGE_SIZE, addr != end); |
113 | return 0; | 112 | return 0; |
114 | } | 113 | } |
115 | 114 | ||
116 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | 115 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
117 | unsigned long end, pgprot_t prot, struct page ***pages) | 116 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
118 | { | 117 | { |
119 | pmd_t *pmd; | 118 | pmd_t *pmd; |
120 | unsigned long next; | 119 | unsigned long next; |
@@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | |||
124 | return -ENOMEM; | 123 | return -ENOMEM; |
125 | do { | 124 | do { |
126 | next = pmd_addr_end(addr, end); | 125 | next = pmd_addr_end(addr, end); |
127 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | 126 | if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
128 | return -ENOMEM; | 127 | return -ENOMEM; |
129 | } while (pmd++, addr = next, addr != end); | 128 | } while (pmd++, addr = next, addr != end); |
130 | return 0; | 129 | return 0; |
131 | } | 130 | } |
132 | 131 | ||
133 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | 132 | static int vmap_pud_range(pgd_t *pgd, unsigned long addr, |
134 | unsigned long end, pgprot_t prot, struct page ***pages) | 133 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
135 | { | 134 | { |
136 | pud_t *pud; | 135 | pud_t *pud; |
137 | unsigned long next; | 136 | unsigned long next; |
@@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
141 | return -ENOMEM; | 140 | return -ENOMEM; |
142 | do { | 141 | do { |
143 | next = pud_addr_end(addr, end); | 142 | next = pud_addr_end(addr, end); |
144 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | 143 | if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
145 | return -ENOMEM; | 144 | return -ENOMEM; |
146 | } while (pud++, addr = next, addr != end); | 145 | } while (pud++, addr = next, addr != end); |
147 | return 0; | 146 | return 0; |
148 | } | 147 | } |
149 | 148 | ||
150 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 149 | /* |
150 | * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and | ||
151 | * will have pfns corresponding to the "pages" array. | ||
152 | * | ||
153 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | ||
154 | */ | ||
155 | static int vmap_page_range(unsigned long addr, unsigned long end, | ||
156 | pgprot_t prot, struct page **pages) | ||
151 | { | 157 | { |
152 | pgd_t *pgd; | 158 | pgd_t *pgd; |
153 | unsigned long next; | 159 | unsigned long next; |
154 | unsigned long addr = (unsigned long) area->addr; | 160 | int err = 0; |
155 | unsigned long end = addr + area->size - PAGE_SIZE; | 161 | int nr = 0; |
156 | int err; | ||
157 | 162 | ||
158 | BUG_ON(addr >= end); | 163 | BUG_ON(addr >= end); |
159 | pgd = pgd_offset_k(addr); | 164 | pgd = pgd_offset_k(addr); |
160 | do { | 165 | do { |
161 | next = pgd_addr_end(addr, end); | 166 | next = pgd_addr_end(addr, end); |
162 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 167 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
163 | if (err) | 168 | if (err) |
164 | break; | 169 | break; |
165 | } while (pgd++, addr = next, addr != end); | 170 | } while (pgd++, addr = next, addr != end); |
166 | flush_cache_vmap((unsigned long) area->addr, end); | 171 | flush_cache_vmap(addr, end); |
167 | return err; | 172 | |
173 | if (unlikely(err)) | ||
174 | return err; | ||
175 | return nr; | ||
176 | } | ||
177 | |||
178 | static inline int is_vmalloc_or_module_addr(const void *x) | ||
179 | { | ||
180 | /* | ||
181 | * x86-64 and sparc64 put modules in a special place, | ||
182 | * and fall back on vmalloc() if that fails. Others | ||
183 | * just put it in the vmalloc space. | ||
184 | */ | ||
185 | #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) | ||
186 | unsigned long addr = (unsigned long)x; | ||
187 | if (addr >= MODULES_VADDR && addr < MODULES_END) | ||
188 | return 1; | ||
189 | #endif | ||
190 | return is_vmalloc_addr(x); | ||
168 | } | 191 | } |
169 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
170 | 192 | ||
171 | /* | 193 | /* |
172 | * Map a vmalloc()-space virtual address to the physical page. | 194 | * Walk a vmap address to the struct page it maps. |
173 | */ | 195 | */ |
174 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 196 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
175 | { | 197 | { |
176 | unsigned long addr = (unsigned long) vmalloc_addr; | 198 | unsigned long addr = (unsigned long) vmalloc_addr; |
177 | struct page *page = NULL; | 199 | struct page *page = NULL; |
178 | pgd_t *pgd = pgd_offset_k(addr); | 200 | pgd_t *pgd = pgd_offset_k(addr); |
179 | pud_t *pud; | 201 | |
180 | pmd_t *pmd; | 202 | /* |
181 | pte_t *ptep, pte; | 203 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for |
204 | * architectures that do not vmalloc module space | ||
205 | */ | ||
206 | VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); | ||
182 | 207 | ||
183 | if (!pgd_none(*pgd)) { | 208 | if (!pgd_none(*pgd)) { |
184 | pud = pud_offset(pgd, addr); | 209 | pud_t *pud = pud_offset(pgd, addr); |
185 | if (!pud_none(*pud)) { | 210 | if (!pud_none(*pud)) { |
186 | pmd = pmd_offset(pud, addr); | 211 | pmd_t *pmd = pmd_offset(pud, addr); |
187 | if (!pmd_none(*pmd)) { | 212 | if (!pmd_none(*pmd)) { |
213 | pte_t *ptep, pte; | ||
214 | |||
188 | ptep = pte_offset_map(pmd, addr); | 215 | ptep = pte_offset_map(pmd, addr); |
189 | pte = *ptep; | 216 | pte = *ptep; |
190 | if (pte_present(pte)) | 217 | if (pte_present(pte)) |
@@ -206,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
206 | } | 233 | } |
207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 234 | EXPORT_SYMBOL(vmalloc_to_pfn); |
208 | 235 | ||
209 | static struct vm_struct * | 236 | |
210 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | 237 | /*** Global kva allocator ***/ |
211 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 238 | |
239 | #define VM_LAZY_FREE 0x01 | ||
240 | #define VM_LAZY_FREEING 0x02 | ||
241 | #define VM_VM_AREA 0x04 | ||
242 | |||
243 | struct vmap_area { | ||
244 | unsigned long va_start; | ||
245 | unsigned long va_end; | ||
246 | unsigned long flags; | ||
247 | struct rb_node rb_node; /* address sorted rbtree */ | ||
248 | struct list_head list; /* address sorted list */ | ||
249 | struct list_head purge_list; /* "lazy purge" list */ | ||
250 | void *private; | ||
251 | struct rcu_head rcu_head; | ||
252 | }; | ||
253 | |||
254 | static DEFINE_SPINLOCK(vmap_area_lock); | ||
255 | static struct rb_root vmap_area_root = RB_ROOT; | ||
256 | static LIST_HEAD(vmap_area_list); | ||
257 | |||
258 | static struct vmap_area *__find_vmap_area(unsigned long addr) | ||
212 | { | 259 | { |
213 | struct vm_struct **p, *tmp, *area; | 260 | struct rb_node *n = vmap_area_root.rb_node; |
214 | unsigned long align = 1; | 261 | |
262 | while (n) { | ||
263 | struct vmap_area *va; | ||
264 | |||
265 | va = rb_entry(n, struct vmap_area, rb_node); | ||
266 | if (addr < va->va_start) | ||
267 | n = n->rb_left; | ||
268 | else if (addr > va->va_start) | ||
269 | n = n->rb_right; | ||
270 | else | ||
271 | return va; | ||
272 | } | ||
273 | |||
274 | return NULL; | ||
275 | } | ||
276 | |||
277 | static void __insert_vmap_area(struct vmap_area *va) | ||
278 | { | ||
279 | struct rb_node **p = &vmap_area_root.rb_node; | ||
280 | struct rb_node *parent = NULL; | ||
281 | struct rb_node *tmp; | ||
282 | |||
283 | while (*p) { | ||
284 | struct vmap_area *tmp; | ||
285 | |||
286 | parent = *p; | ||
287 | tmp = rb_entry(parent, struct vmap_area, rb_node); | ||
288 | if (va->va_start < tmp->va_end) | ||
289 | p = &(*p)->rb_left; | ||
290 | else if (va->va_end > tmp->va_start) | ||
291 | p = &(*p)->rb_right; | ||
292 | else | ||
293 | BUG(); | ||
294 | } | ||
295 | |||
296 | rb_link_node(&va->rb_node, parent, p); | ||
297 | rb_insert_color(&va->rb_node, &vmap_area_root); | ||
298 | |||
299 | /* address-sort this list so it is usable like the vmlist */ | ||
300 | tmp = rb_prev(&va->rb_node); | ||
301 | if (tmp) { | ||
302 | struct vmap_area *prev; | ||
303 | prev = rb_entry(tmp, struct vmap_area, rb_node); | ||
304 | list_add_rcu(&va->list, &prev->list); | ||
305 | } else | ||
306 | list_add_rcu(&va->list, &vmap_area_list); | ||
307 | } | ||
308 | |||
309 | static void purge_vmap_area_lazy(void); | ||
310 | |||
311 | /* | ||
312 | * Allocate a region of KVA of the specified size and alignment, within the | ||
313 | * vstart and vend. | ||
314 | */ | ||
315 | static struct vmap_area *alloc_vmap_area(unsigned long size, | ||
316 | unsigned long align, | ||
317 | unsigned long vstart, unsigned long vend, | ||
318 | int node, gfp_t gfp_mask) | ||
319 | { | ||
320 | struct vmap_area *va; | ||
321 | struct rb_node *n; | ||
322 | unsigned long addr; | ||
323 | int purged = 0; | ||
324 | |||
325 | BUG_ON(size & ~PAGE_MASK); | ||
326 | |||
327 | addr = ALIGN(vstart, align); | ||
328 | |||
329 | va = kmalloc_node(sizeof(struct vmap_area), | ||
330 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
331 | if (unlikely(!va)) | ||
332 | return ERR_PTR(-ENOMEM); | ||
333 | |||
334 | retry: | ||
335 | spin_lock(&vmap_area_lock); | ||
336 | /* XXX: could have a last_hole cache */ | ||
337 | n = vmap_area_root.rb_node; | ||
338 | if (n) { | ||
339 | struct vmap_area *first = NULL; | ||
340 | |||
341 | do { | ||
342 | struct vmap_area *tmp; | ||
343 | tmp = rb_entry(n, struct vmap_area, rb_node); | ||
344 | if (tmp->va_end >= addr) { | ||
345 | if (!first && tmp->va_start < addr + size) | ||
346 | first = tmp; | ||
347 | n = n->rb_left; | ||
348 | } else { | ||
349 | first = tmp; | ||
350 | n = n->rb_right; | ||
351 | } | ||
352 | } while (n); | ||
353 | |||
354 | if (!first) | ||
355 | goto found; | ||
356 | |||
357 | if (first->va_end < addr) { | ||
358 | n = rb_next(&first->rb_node); | ||
359 | if (n) | ||
360 | first = rb_entry(n, struct vmap_area, rb_node); | ||
361 | else | ||
362 | goto found; | ||
363 | } | ||
364 | |||
365 | while (addr + size >= first->va_start && addr + size <= vend) { | ||
366 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
367 | |||
368 | n = rb_next(&first->rb_node); | ||
369 | if (n) | ||
370 | first = rb_entry(n, struct vmap_area, rb_node); | ||
371 | else | ||
372 | goto found; | ||
373 | } | ||
374 | } | ||
375 | found: | ||
376 | if (addr + size > vend) { | ||
377 | spin_unlock(&vmap_area_lock); | ||
378 | if (!purged) { | ||
379 | purge_vmap_area_lazy(); | ||
380 | purged = 1; | ||
381 | goto retry; | ||
382 | } | ||
383 | if (printk_ratelimit()) | ||
384 | printk(KERN_WARNING "vmap allocation failed: " | ||
385 | "use vmalloc=<size> to increase size.\n"); | ||
386 | return ERR_PTR(-EBUSY); | ||
387 | } | ||
388 | |||
389 | BUG_ON(addr & (align-1)); | ||
390 | |||
391 | va->va_start = addr; | ||
392 | va->va_end = addr + size; | ||
393 | va->flags = 0; | ||
394 | __insert_vmap_area(va); | ||
395 | spin_unlock(&vmap_area_lock); | ||
396 | |||
397 | return va; | ||
398 | } | ||
399 | |||
400 | static void rcu_free_va(struct rcu_head *head) | ||
401 | { | ||
402 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
403 | |||
404 | kfree(va); | ||
405 | } | ||
406 | |||
407 | static void __free_vmap_area(struct vmap_area *va) | ||
408 | { | ||
409 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | ||
410 | rb_erase(&va->rb_node, &vmap_area_root); | ||
411 | RB_CLEAR_NODE(&va->rb_node); | ||
412 | list_del_rcu(&va->list); | ||
413 | |||
414 | call_rcu(&va->rcu_head, rcu_free_va); | ||
415 | } | ||
416 | |||
417 | /* | ||
418 | * Free a region of KVA allocated by alloc_vmap_area | ||
419 | */ | ||
420 | static void free_vmap_area(struct vmap_area *va) | ||
421 | { | ||
422 | spin_lock(&vmap_area_lock); | ||
423 | __free_vmap_area(va); | ||
424 | spin_unlock(&vmap_area_lock); | ||
425 | } | ||
426 | |||
427 | /* | ||
428 | * Clear the pagetable entries of a given vmap_area | ||
429 | */ | ||
430 | static void unmap_vmap_area(struct vmap_area *va) | ||
431 | { | ||
432 | vunmap_page_range(va->va_start, va->va_end); | ||
433 | } | ||
434 | |||
435 | /* | ||
436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | ||
437 | * before attempting to purge with a TLB flush. | ||
438 | * | ||
439 | * There is a tradeoff here: a larger number will cover more kernel page tables | ||
440 | * and take slightly longer to purge, but it will linearly reduce the number of | ||
441 | * global TLB flushes that must be performed. It would seem natural to scale | ||
442 | * this number up linearly with the number of CPUs (because vmapping activity | ||
443 | * could also scale linearly with the number of CPUs), however it is likely | ||
444 | * that in practice, workloads might be constrained in other ways that mean | ||
445 | * vmap activity will not scale linearly with CPUs. Also, I want to be | ||
446 | * conservative and not introduce a big latency on huge systems, so go with | ||
447 | * a less aggressive log scale. It will still be an improvement over the old | ||
448 | * code, and it will be simple to change the scale factor if we find that it | ||
449 | * becomes a problem on bigger systems. | ||
450 | */ | ||
451 | static unsigned long lazy_max_pages(void) | ||
452 | { | ||
453 | unsigned int log; | ||
454 | |||
455 | log = fls(num_online_cpus()); | ||
456 | |||
457 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | ||
458 | } | ||
459 | |||
460 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | ||
461 | |||
462 | /* | ||
463 | * Purges all lazily-freed vmap areas. | ||
464 | * | ||
465 | * If sync is 0 then don't purge if there is already a purge in progress. | ||
466 | * If force_flush is 1, then flush kernel TLBs between *start and *end even | ||
467 | * if we found no lazy vmap areas to unmap (callers can use this to optimise | ||
468 | * their own TLB flushing). | ||
469 | * Returns with *start = min(*start, lowest purged address) | ||
470 | * *end = max(*end, highest purged address) | ||
471 | */ | ||
472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | ||
473 | int sync, int force_flush) | ||
474 | { | ||
475 | static DEFINE_SPINLOCK(purge_lock); | ||
476 | LIST_HEAD(valist); | ||
477 | struct vmap_area *va; | ||
478 | int nr = 0; | ||
479 | |||
480 | /* | ||
481 | * If sync is 0 but force_flush is 1, we'll go sync anyway but callers | ||
482 | * should not expect such behaviour. This just simplifies locking for | ||
483 | * the case that isn't actually used at the moment anyway. | ||
484 | */ | ||
485 | if (!sync && !force_flush) { | ||
486 | if (!spin_trylock(&purge_lock)) | ||
487 | return; | ||
488 | } else | ||
489 | spin_lock(&purge_lock); | ||
490 | |||
491 | rcu_read_lock(); | ||
492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | ||
493 | if (va->flags & VM_LAZY_FREE) { | ||
494 | if (va->va_start < *start) | ||
495 | *start = va->va_start; | ||
496 | if (va->va_end > *end) | ||
497 | *end = va->va_end; | ||
498 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | ||
499 | unmap_vmap_area(va); | ||
500 | list_add_tail(&va->purge_list, &valist); | ||
501 | va->flags |= VM_LAZY_FREEING; | ||
502 | va->flags &= ~VM_LAZY_FREE; | ||
503 | } | ||
504 | } | ||
505 | rcu_read_unlock(); | ||
506 | |||
507 | if (nr) { | ||
508 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
509 | atomic_sub(nr, &vmap_lazy_nr); | ||
510 | } | ||
511 | |||
512 | if (nr || force_flush) | ||
513 | flush_tlb_kernel_range(*start, *end); | ||
514 | |||
515 | if (nr) { | ||
516 | spin_lock(&vmap_area_lock); | ||
517 | list_for_each_entry(va, &valist, purge_list) | ||
518 | __free_vmap_area(va); | ||
519 | spin_unlock(&vmap_area_lock); | ||
520 | } | ||
521 | spin_unlock(&purge_lock); | ||
522 | } | ||
523 | |||
524 | /* | ||
525 | * Kick off a purge of the outstanding lazy areas. | ||
526 | */ | ||
527 | static void purge_vmap_area_lazy(void) | ||
528 | { | ||
529 | unsigned long start = ULONG_MAX, end = 0; | ||
530 | |||
531 | __purge_vmap_area_lazy(&start, &end, 0, 0); | ||
532 | } | ||
533 | |||
534 | /* | ||
535 | * Free and unmap a vmap area | ||
536 | */ | ||
537 | static void free_unmap_vmap_area(struct vmap_area *va) | ||
538 | { | ||
539 | va->flags |= VM_LAZY_FREE; | ||
540 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | ||
541 | if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) | ||
542 | purge_vmap_area_lazy(); | ||
543 | } | ||
544 | |||
545 | static struct vmap_area *find_vmap_area(unsigned long addr) | ||
546 | { | ||
547 | struct vmap_area *va; | ||
548 | |||
549 | spin_lock(&vmap_area_lock); | ||
550 | va = __find_vmap_area(addr); | ||
551 | spin_unlock(&vmap_area_lock); | ||
552 | |||
553 | return va; | ||
554 | } | ||
555 | |||
/*
 * Look up the vmap area for @addr and queue it for lazy freeing.  It is a
 * BUG if no area is found.
 */
static void free_unmap_vmap_area_addr(unsigned long addr)
{
	struct vmap_area *area = find_vmap_area(addr);

	BUG_ON(!area);
	free_unmap_vmap_area(area);
}
564 | |||
565 | |||
566 | /*** Per cpu kva allocator ***/ | ||
567 | |||
568 | /* | ||
569 | * vmap space is limited especially on 32 bit architectures. Ensure there is | ||
570 | * room for at least 16 percpu vmap blocks per CPU. | ||
571 | */ | ||
572 | /* | ||
573 | * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able | ||
574 | * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess | ||
575 | * instead (we just need a rough idea) | ||
576 | */ | ||
577 | #if BITS_PER_LONG == 32 | ||
578 | #define VMALLOC_SPACE (128UL*1024*1024) | ||
579 | #else | ||
580 | #define VMALLOC_SPACE (128UL*1024*1024*1024) | ||
581 | #endif | ||
582 | |||
583 | #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) | ||
584 | #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ | ||
585 | #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ | ||
586 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | ||
587 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | ||
588 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | ||
589 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | ||
590 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | ||
591 | VMALLOC_PAGES / NR_CPUS / 16)) | ||
592 | |||
593 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | ||
594 | |||
595 | struct vmap_block_queue { | ||
596 | spinlock_t lock; | ||
597 | struct list_head free; | ||
598 | struct list_head dirty; | ||
599 | unsigned int nr_dirty; | ||
600 | }; | ||
601 | |||
602 | struct vmap_block { | ||
603 | spinlock_t lock; | ||
604 | struct vmap_area *va; | ||
605 | struct vmap_block_queue *vbq; | ||
606 | unsigned long free, dirty; | ||
607 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
608 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | ||
609 | union { | ||
610 | struct { | ||
611 | struct list_head free_list; | ||
612 | struct list_head dirty_list; | ||
613 | }; | ||
614 | struct rcu_head rcu_head; | ||
615 | }; | ||
616 | }; | ||
617 | |||
618 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | ||
619 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); | ||
620 | |||
621 | /* | ||
622 | * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block | ||
623 | * in the free path. Could get rid of this if we change the API to return a | ||
624 | * "cookie" from alloc, to be passed to free. But no big deal yet. | ||
625 | */ | ||
626 | static DEFINE_SPINLOCK(vmap_block_tree_lock); | ||
627 | static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); | ||
628 | |||
629 | /* | ||
630 | * We should probably have a fallback mechanism to allocate virtual memory | ||
631 | * out of partially filled vmap blocks. However vmap block sizing should be | ||
632 | * fairly reasonable according to the vmalloc size, so it shouldn't be a | ||
633 | * big problem. | ||
634 | */ | ||
635 | |||
636 | static unsigned long addr_to_vb_idx(unsigned long addr) | ||
637 | { | ||
638 | addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); | ||
639 | addr /= VMAP_BLOCK_SIZE; | ||
640 | return addr; | ||
641 | } | ||
642 | |||
643 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | ||
644 | { | ||
645 | struct vmap_block_queue *vbq; | ||
646 | struct vmap_block *vb; | ||
647 | struct vmap_area *va; | ||
648 | unsigned long vb_idx; | ||
649 | int node, err; | ||
650 | |||
651 | node = numa_node_id(); | ||
652 | |||
653 | vb = kmalloc_node(sizeof(struct vmap_block), | ||
654 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
655 | if (unlikely(!vb)) | ||
656 | return ERR_PTR(-ENOMEM); | ||
657 | |||
658 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | ||
659 | VMALLOC_START, VMALLOC_END, | ||
660 | node, gfp_mask); | ||
661 | if (unlikely(IS_ERR(va))) { | ||
662 | kfree(vb); | ||
663 | return ERR_PTR(PTR_ERR(va)); | ||
664 | } | ||
665 | |||
666 | err = radix_tree_preload(gfp_mask); | ||
667 | if (unlikely(err)) { | ||
668 | kfree(vb); | ||
669 | free_vmap_area(va); | ||
670 | return ERR_PTR(err); | ||
671 | } | ||
672 | |||
673 | spin_lock_init(&vb->lock); | ||
674 | vb->va = va; | ||
675 | vb->free = VMAP_BBMAP_BITS; | ||
676 | vb->dirty = 0; | ||
677 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
678 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | ||
679 | INIT_LIST_HEAD(&vb->free_list); | ||
680 | INIT_LIST_HEAD(&vb->dirty_list); | ||
681 | |||
682 | vb_idx = addr_to_vb_idx(va->va_start); | ||
683 | spin_lock(&vmap_block_tree_lock); | ||
684 | err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); | ||
685 | spin_unlock(&vmap_block_tree_lock); | ||
686 | BUG_ON(err); | ||
687 | radix_tree_preload_end(); | ||
688 | |||
689 | vbq = &get_cpu_var(vmap_block_queue); | ||
690 | vb->vbq = vbq; | ||
691 | spin_lock(&vbq->lock); | ||
692 | list_add(&vb->free_list, &vbq->free); | ||
693 | spin_unlock(&vbq->lock); | ||
694 | put_cpu_var(vmap_cpu_blocks); | ||
695 | |||
696 | return vb; | ||
697 | } | ||
698 | |||
699 | static void rcu_free_vb(struct rcu_head *head) | ||
700 | { | ||
701 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
702 | |||
703 | kfree(vb); | ||
704 | } | ||
705 | |||
706 | static void free_vmap_block(struct vmap_block *vb) | ||
707 | { | ||
708 | struct vmap_block *tmp; | ||
709 | unsigned long vb_idx; | ||
710 | |||
711 | spin_lock(&vb->vbq->lock); | ||
712 | if (!list_empty(&vb->free_list)) | ||
713 | list_del(&vb->free_list); | ||
714 | if (!list_empty(&vb->dirty_list)) | ||
715 | list_del(&vb->dirty_list); | ||
716 | spin_unlock(&vb->vbq->lock); | ||
717 | |||
718 | vb_idx = addr_to_vb_idx(vb->va->va_start); | ||
719 | spin_lock(&vmap_block_tree_lock); | ||
720 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | ||
721 | spin_unlock(&vmap_block_tree_lock); | ||
722 | BUG_ON(tmp != vb); | ||
723 | |||
724 | free_unmap_vmap_area(vb->va); | ||
725 | call_rcu(&vb->rcu_head, rcu_free_vb); | ||
726 | } | ||
727 | |||
728 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | ||
729 | { | ||
730 | struct vmap_block_queue *vbq; | ||
731 | struct vmap_block *vb; | ||
732 | unsigned long addr = 0; | ||
733 | unsigned int order; | ||
734 | |||
735 | BUG_ON(size & ~PAGE_MASK); | ||
736 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
737 | order = get_order(size); | ||
738 | |||
739 | again: | ||
740 | rcu_read_lock(); | ||
741 | vbq = &get_cpu_var(vmap_block_queue); | ||
742 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
743 | int i; | ||
744 | |||
745 | spin_lock(&vb->lock); | ||
746 | i = bitmap_find_free_region(vb->alloc_map, | ||
747 | VMAP_BBMAP_BITS, order); | ||
748 | |||
749 | if (i >= 0) { | ||
750 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
751 | BUG_ON(addr_to_vb_idx(addr) != | ||
752 | addr_to_vb_idx(vb->va->va_start)); | ||
753 | vb->free -= 1UL << order; | ||
754 | if (vb->free == 0) { | ||
755 | spin_lock(&vbq->lock); | ||
756 | list_del_init(&vb->free_list); | ||
757 | spin_unlock(&vbq->lock); | ||
758 | } | ||
759 | spin_unlock(&vb->lock); | ||
760 | break; | ||
761 | } | ||
762 | spin_unlock(&vb->lock); | ||
763 | } | ||
764 | put_cpu_var(vmap_cpu_blocks); | ||
765 | rcu_read_unlock(); | ||
766 | |||
767 | if (!addr) { | ||
768 | vb = new_vmap_block(gfp_mask); | ||
769 | if (IS_ERR(vb)) | ||
770 | return vb; | ||
771 | goto again; | ||
772 | } | ||
773 | |||
774 | return (void *)addr; | ||
775 | } | ||
776 | |||
777 | static void vb_free(const void *addr, unsigned long size) | ||
778 | { | ||
779 | unsigned long offset; | ||
780 | unsigned long vb_idx; | ||
781 | unsigned int order; | ||
782 | struct vmap_block *vb; | ||
783 | |||
784 | BUG_ON(size & ~PAGE_MASK); | ||
785 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
786 | order = get_order(size); | ||
787 | |||
788 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | ||
789 | |||
790 | vb_idx = addr_to_vb_idx((unsigned long)addr); | ||
791 | rcu_read_lock(); | ||
792 | vb = radix_tree_lookup(&vmap_block_tree, vb_idx); | ||
793 | rcu_read_unlock(); | ||
794 | BUG_ON(!vb); | ||
795 | |||
796 | spin_lock(&vb->lock); | ||
797 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | ||
798 | if (!vb->dirty) { | ||
799 | spin_lock(&vb->vbq->lock); | ||
800 | list_add(&vb->dirty_list, &vb->vbq->dirty); | ||
801 | spin_unlock(&vb->vbq->lock); | ||
802 | } | ||
803 | vb->dirty += 1UL << order; | ||
804 | if (vb->dirty == VMAP_BBMAP_BITS) { | ||
805 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | ||
806 | spin_unlock(&vb->lock); | ||
807 | free_vmap_block(vb); | ||
808 | } else | ||
809 | spin_unlock(&vb->lock); | ||
810 | } | ||
811 | |||
812 | /** | ||
813 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
814 | * | ||
815 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
816 | * to amortize TLB flushing overheads. What this means is that any page you | ||
817 | * have now, may, in a former life, have been mapped into kernel virtual | ||
818 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
819 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
820 | * | ||
821 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
822 | * be sure that none of the pages we have control over will have any aliases | ||
823 | * from the vmap layer. | ||
824 | */ | ||
825 | void vm_unmap_aliases(void) | ||
826 | { | ||
827 | unsigned long start = ULONG_MAX, end = 0; | ||
828 | int cpu; | ||
829 | int flush = 0; | ||
830 | |||
831 | for_each_possible_cpu(cpu) { | ||
832 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
833 | struct vmap_block *vb; | ||
834 | |||
835 | rcu_read_lock(); | ||
836 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
837 | int i; | ||
838 | |||
839 | spin_lock(&vb->lock); | ||
840 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | ||
841 | while (i < VMAP_BBMAP_BITS) { | ||
842 | unsigned long s, e; | ||
843 | int j; | ||
844 | j = find_next_zero_bit(vb->dirty_map, | ||
845 | VMAP_BBMAP_BITS, i); | ||
846 | |||
847 | s = vb->va->va_start + (i << PAGE_SHIFT); | ||
848 | e = vb->va->va_start + (j << PAGE_SHIFT); | ||
849 | vunmap_page_range(s, e); | ||
850 | flush = 1; | ||
851 | |||
852 | if (s < start) | ||
853 | start = s; | ||
854 | if (e > end) | ||
855 | end = e; | ||
856 | |||
857 | i = j; | ||
858 | i = find_next_bit(vb->dirty_map, | ||
859 | VMAP_BBMAP_BITS, i); | ||
860 | } | ||
861 | spin_unlock(&vb->lock); | ||
862 | } | ||
863 | rcu_read_unlock(); | ||
864 | } | ||
865 | |||
866 | __purge_vmap_area_lazy(&start, &end, 1, flush); | ||
867 | } | ||
868 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
869 | |||
870 | /** | ||
871 | * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram | ||
872 | * @mem: the pointer returned by vm_map_ram | ||
873 | * @count: the count passed to that vm_map_ram call (cannot unmap partial) | ||
874 | */ | ||
875 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
876 | { | ||
877 | unsigned long size = count << PAGE_SHIFT; | ||
878 | unsigned long addr = (unsigned long)mem; | ||
879 | |||
880 | BUG_ON(!addr); | ||
881 | BUG_ON(addr < VMALLOC_START); | ||
882 | BUG_ON(addr > VMALLOC_END); | ||
883 | BUG_ON(addr & (PAGE_SIZE-1)); | ||
884 | |||
885 | debug_check_no_locks_freed(mem, size); | ||
886 | |||
887 | if (likely(count <= VMAP_MAX_ALLOC)) | ||
888 | vb_free(mem, size); | ||
889 | else | ||
890 | free_unmap_vmap_area_addr(addr); | ||
891 | } | ||
892 | EXPORT_SYMBOL(vm_unmap_ram); | ||
893 | |||
894 | /** | ||
895 | * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) | ||
896 | * @pages: an array of pointers to the pages to be mapped | ||
897 | * @count: number of pages | ||
898 | * @node: prefer to allocate data structures on this node | ||
899 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | ||
900 | * @returns: a pointer to the address that has been mapped, or NULL on failure | ||
901 | */ | ||
902 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
903 | { | ||
904 | unsigned long size = count << PAGE_SHIFT; | ||
215 | unsigned long addr; | 905 | unsigned long addr; |
906 | void *mem; | ||
907 | |||
908 | if (likely(count <= VMAP_MAX_ALLOC)) { | ||
909 | mem = vb_alloc(size, GFP_KERNEL); | ||
910 | if (IS_ERR(mem)) | ||
911 | return NULL; | ||
912 | addr = (unsigned long)mem; | ||
913 | } else { | ||
914 | struct vmap_area *va; | ||
915 | va = alloc_vmap_area(size, PAGE_SIZE, | ||
916 | VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); | ||
917 | if (IS_ERR(va)) | ||
918 | return NULL; | ||
919 | |||
920 | addr = va->va_start; | ||
921 | mem = (void *)addr; | ||
922 | } | ||
923 | if (vmap_page_range(addr, addr + size, prot, pages) < 0) { | ||
924 | vm_unmap_ram(mem, count); | ||
925 | return NULL; | ||
926 | } | ||
927 | return mem; | ||
928 | } | ||
929 | EXPORT_SYMBOL(vm_map_ram); | ||
930 | |||
931 | void __init vmalloc_init(void) | ||
932 | { | ||
933 | int i; | ||
934 | |||
935 | for_each_possible_cpu(i) { | ||
936 | struct vmap_block_queue *vbq; | ||
937 | |||
938 | vbq = &per_cpu(vmap_block_queue, i); | ||
939 | spin_lock_init(&vbq->lock); | ||
940 | INIT_LIST_HEAD(&vbq->free); | ||
941 | INIT_LIST_HEAD(&vbq->dirty); | ||
942 | vbq->nr_dirty = 0; | ||
943 | } | ||
944 | } | ||
945 | |||
/*
 * unmap_kernel_range - unmap a kernel virtual range and flush its TLB entries
 * @addr: first address of the range
 * @size: length of the range in bytes
 *
 * The page table entries for [addr, addr + size) are cleared first; the
 * kernel TLB flush for the same range follows immediately.
 */
void unmap_kernel_range(unsigned long addr, unsigned long size)
{
	const unsigned long stop = addr + size;

	vunmap_page_range(addr, stop);
	flush_tlb_kernel_range(addr, stop);
}
952 | |||
953 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
954 | { | ||
955 | unsigned long addr = (unsigned long)area->addr; | ||
956 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
957 | int err; | ||
958 | |||
959 | err = vmap_page_range(addr, end, prot, *pages); | ||
960 | if (err > 0) { | ||
961 | *pages += err; | ||
962 | err = 0; | ||
963 | } | ||
964 | |||
965 | return err; | ||
966 | } | ||
967 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
968 | |||
969 | /*** Old vmalloc interfaces ***/ | ||
970 | DEFINE_RWLOCK(vmlist_lock); | ||
971 | struct vm_struct *vmlist; | ||
972 | |||
973 | static struct vm_struct *__get_vm_area_node(unsigned long size, | ||
974 | unsigned long flags, unsigned long start, unsigned long end, | ||
975 | int node, gfp_t gfp_mask, void *caller) | ||
976 | { | ||
977 | static struct vmap_area *va; | ||
978 | struct vm_struct *area; | ||
979 | struct vm_struct *tmp, **p; | ||
980 | unsigned long align = 1; | ||
216 | 981 | ||
217 | BUG_ON(in_interrupt()); | 982 | BUG_ON(in_interrupt()); |
218 | if (flags & VM_IOREMAP) { | 983 | if (flags & VM_IOREMAP) { |
@@ -225,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
225 | 990 | ||
226 | align = 1ul << bit; | 991 | align = 1ul << bit; |
227 | } | 992 | } |
228 | addr = ALIGN(start, align); | 993 | |
229 | size = PAGE_ALIGN(size); | 994 | size = PAGE_ALIGN(size); |
230 | if (unlikely(!size)) | 995 | if (unlikely(!size)) |
231 | return NULL; | 996 | return NULL; |
232 | 997 | ||
233 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 998 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
234 | |||
235 | if (unlikely(!area)) | 999 | if (unlikely(!area)) |
236 | return NULL; | 1000 | return NULL; |
237 | 1001 | ||
@@ -240,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
240 | */ | 1004 | */ |
241 | size += PAGE_SIZE; | 1005 | size += PAGE_SIZE; |
242 | 1006 | ||
243 | write_lock(&vmlist_lock); | 1007 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
244 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | 1008 | if (IS_ERR(va)) { |
245 | if ((unsigned long)tmp->addr < addr) { | 1009 | kfree(area); |
246 | if((unsigned long)tmp->addr + tmp->size >= addr) | 1010 | return NULL; |
247 | addr = ALIGN(tmp->size + | ||
248 | (unsigned long)tmp->addr, align); | ||
249 | continue; | ||
250 | } | ||
251 | if ((size + addr) < addr) | ||
252 | goto out; | ||
253 | if (size + addr <= (unsigned long)tmp->addr) | ||
254 | goto found; | ||
255 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
256 | if (addr > end - size) | ||
257 | goto out; | ||
258 | } | 1011 | } |
259 | if ((size + addr) < addr) | ||
260 | goto out; | ||
261 | if (addr > end - size) | ||
262 | goto out; | ||
263 | |||
264 | found: | ||
265 | area->next = *p; | ||
266 | *p = area; | ||
267 | 1012 | ||
268 | area->flags = flags; | 1013 | area->flags = flags; |
269 | area->addr = (void *)addr; | 1014 | area->addr = (void *)va->va_start; |
270 | area->size = size; | 1015 | area->size = size; |
271 | area->pages = NULL; | 1016 | area->pages = NULL; |
272 | area->nr_pages = 0; | 1017 | area->nr_pages = 0; |
273 | area->phys_addr = 0; | 1018 | area->phys_addr = 0; |
274 | area->caller = caller; | 1019 | area->caller = caller; |
1020 | va->private = area; | ||
1021 | va->flags |= VM_VM_AREA; | ||
1022 | |||
1023 | write_lock(&vmlist_lock); | ||
1024 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
1025 | if (tmp->addr >= area->addr) | ||
1026 | break; | ||
1027 | } | ||
1028 | area->next = *p; | ||
1029 | *p = area; | ||
275 | write_unlock(&vmlist_lock); | 1030 | write_unlock(&vmlist_lock); |
276 | 1031 | ||
277 | return area; | 1032 | return area; |
278 | |||
279 | out: | ||
280 | write_unlock(&vmlist_lock); | ||
281 | kfree(area); | ||
282 | if (printk_ratelimit()) | ||
283 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
284 | return NULL; | ||
285 | } | 1033 | } |
286 | 1034 | ||
287 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1035 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
@@ -321,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
321 | gfp_mask, __builtin_return_address(0)); | 1069 | gfp_mask, __builtin_return_address(0)); |
322 | } | 1070 | } |
323 | 1071 | ||
324 | /* Caller must hold vmlist_lock */ | 1072 | static struct vm_struct *find_vm_area(const void *addr) |
325 | static struct vm_struct *__find_vm_area(const void *addr) | ||
326 | { | 1073 | { |
327 | struct vm_struct *tmp; | 1074 | struct vmap_area *va; |
328 | 1075 | ||
329 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | 1076 | va = find_vmap_area((unsigned long)addr); |
330 | if (tmp->addr == addr) | 1077 | if (va && va->flags & VM_VM_AREA) |
331 | break; | 1078 | return va->private; |
332 | } | ||
333 | 1079 | ||
334 | return tmp; | ||
335 | } | ||
336 | |||
337 | /* Caller must hold vmlist_lock */ | ||
338 | static struct vm_struct *__remove_vm_area(const void *addr) | ||
339 | { | ||
340 | struct vm_struct **p, *tmp; | ||
341 | |||
342 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
343 | if (tmp->addr == addr) | ||
344 | goto found; | ||
345 | } | ||
346 | return NULL; | 1080 | return NULL; |
347 | |||
348 | found: | ||
349 | unmap_vm_area(tmp); | ||
350 | *p = tmp->next; | ||
351 | |||
352 | /* | ||
353 | * Remove the guard page. | ||
354 | */ | ||
355 | tmp->size -= PAGE_SIZE; | ||
356 | return tmp; | ||
357 | } | 1081 | } |
358 | 1082 | ||
359 | /** | 1083 | /** |
@@ -366,11 +1090,24 @@ found: | |||
366 | */ | 1090 | */ |
367 | struct vm_struct *remove_vm_area(const void *addr) | 1091 | struct vm_struct *remove_vm_area(const void *addr) |
368 | { | 1092 | { |
369 | struct vm_struct *v; | 1093 | struct vmap_area *va; |
370 | write_lock(&vmlist_lock); | 1094 | |
371 | v = __remove_vm_area(addr); | 1095 | va = find_vmap_area((unsigned long)addr); |
372 | write_unlock(&vmlist_lock); | 1096 | if (va && va->flags & VM_VM_AREA) { |
373 | return v; | 1097 | struct vm_struct *vm = va->private; |
1098 | struct vm_struct *tmp, **p; | ||
1099 | free_unmap_vmap_area(va); | ||
1100 | vm->size -= PAGE_SIZE; | ||
1101 | |||
1102 | write_lock(&vmlist_lock); | ||
1103 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
1104 | ; | ||
1105 | *p = tmp->next; | ||
1106 | write_unlock(&vmlist_lock); | ||
1107 | |||
1108 | return vm; | ||
1109 | } | ||
1110 | return NULL; | ||
374 | } | 1111 | } |
375 | 1112 | ||
376 | static void __vunmap(const void *addr, int deallocate_pages) | 1113 | static void __vunmap(const void *addr, int deallocate_pages) |
@@ -381,16 +1118,14 @@ static void __vunmap(const void *addr, int deallocate_pages) | |||
381 | return; | 1118 | return; |
382 | 1119 | ||
383 | if ((PAGE_SIZE-1) & (unsigned long)addr) { | 1120 | if ((PAGE_SIZE-1) & (unsigned long)addr) { |
384 | printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); | 1121 | WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); |
385 | WARN_ON(1); | ||
386 | return; | 1122 | return; |
387 | } | 1123 | } |
388 | 1124 | ||
389 | area = remove_vm_area(addr); | 1125 | area = remove_vm_area(addr); |
390 | if (unlikely(!area)) { | 1126 | if (unlikely(!area)) { |
391 | printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", | 1127 | WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", |
392 | addr); | 1128 | addr); |
393 | WARN_ON(1); | ||
394 | return; | 1129 | return; |
395 | } | 1130 | } |
396 | 1131 | ||
@@ -482,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
482 | } | 1217 | } |
483 | EXPORT_SYMBOL(vmap); | 1218 | EXPORT_SYMBOL(vmap); |
484 | 1219 | ||
1220 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
1221 | int node, void *caller); | ||
485 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1222 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
486 | pgprot_t prot, int node, void *caller) | 1223 | pgprot_t prot, int node, void *caller) |
487 | { | 1224 | { |
@@ -608,10 +1345,8 @@ void *vmalloc_user(unsigned long size) | |||
608 | 1345 | ||
609 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1346 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
610 | if (ret) { | 1347 | if (ret) { |
611 | write_lock(&vmlist_lock); | 1348 | area = find_vm_area(ret); |
612 | area = __find_vm_area(ret); | ||
613 | area->flags |= VM_USERMAP; | 1349 | area->flags |= VM_USERMAP; |
614 | write_unlock(&vmlist_lock); | ||
615 | } | 1350 | } |
616 | return ret; | 1351 | return ret; |
617 | } | 1352 | } |
@@ -691,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size) | |||
691 | 1426 | ||
692 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1427 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); |
693 | if (ret) { | 1428 | if (ret) { |
694 | write_lock(&vmlist_lock); | 1429 | area = find_vm_area(ret); |
695 | area = __find_vm_area(ret); | ||
696 | area->flags |= VM_USERMAP; | 1430 | area->flags |= VM_USERMAP; |
697 | write_unlock(&vmlist_lock); | ||
698 | } | 1431 | } |
699 | return ret; | 1432 | return ret; |
700 | } | 1433 | } |
@@ -795,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
795 | struct vm_struct *area; | 1528 | struct vm_struct *area; |
796 | unsigned long uaddr = vma->vm_start; | 1529 | unsigned long uaddr = vma->vm_start; |
797 | unsigned long usize = vma->vm_end - vma->vm_start; | 1530 | unsigned long usize = vma->vm_end - vma->vm_start; |
798 | int ret; | ||
799 | 1531 | ||
800 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 1532 | if ((PAGE_SIZE-1) & (unsigned long)addr) |
801 | return -EINVAL; | 1533 | return -EINVAL; |
802 | 1534 | ||
803 | read_lock(&vmlist_lock); | 1535 | area = find_vm_area(addr); |
804 | area = __find_vm_area(addr); | ||
805 | if (!area) | 1536 | if (!area) |
806 | goto out_einval_locked; | 1537 | return -EINVAL; |
807 | 1538 | ||
808 | if (!(area->flags & VM_USERMAP)) | 1539 | if (!(area->flags & VM_USERMAP)) |
809 | goto out_einval_locked; | 1540 | return -EINVAL; |
810 | 1541 | ||
811 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 1542 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) |
812 | goto out_einval_locked; | 1543 | return -EINVAL; |
813 | read_unlock(&vmlist_lock); | ||
814 | 1544 | ||
815 | addr += pgoff << PAGE_SHIFT; | 1545 | addr += pgoff << PAGE_SHIFT; |
816 | do { | 1546 | do { |
817 | struct page *page = vmalloc_to_page(addr); | 1547 | struct page *page = vmalloc_to_page(addr); |
1548 | int ret; | ||
1549 | |||
818 | ret = vm_insert_page(vma, uaddr, page); | 1550 | ret = vm_insert_page(vma, uaddr, page); |
819 | if (ret) | 1551 | if (ret) |
820 | return ret; | 1552 | return ret; |
@@ -827,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
827 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 1559 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ |
828 | vma->vm_flags |= VM_RESERVED; | 1560 | vma->vm_flags |= VM_RESERVED; |
829 | 1561 | ||
830 | return ret; | 1562 | return 0; |
831 | |||
832 | out_einval_locked: | ||
833 | read_unlock(&vmlist_lock); | ||
834 | return -EINVAL; | ||
835 | } | 1563 | } |
836 | EXPORT_SYMBOL(remap_vmalloc_range); | 1564 | EXPORT_SYMBOL(remap_vmalloc_range); |
837 | 1565 | ||
@@ -931,6 +1659,25 @@ static void s_stop(struct seq_file *m, void *p) | |||
931 | read_unlock(&vmlist_lock); | 1659 | read_unlock(&vmlist_lock); |
932 | } | 1660 | } |
933 | 1661 | ||
1662 | static void show_numa_info(struct seq_file *m, struct vm_struct *v) | ||
1663 | { | ||
1664 | if (NUMA_BUILD) { | ||
1665 | unsigned int nr, *counters = m->private; | ||
1666 | |||
1667 | if (!counters) | ||
1668 | return; | ||
1669 | |||
1670 | memset(counters, 0, nr_node_ids * sizeof(unsigned int)); | ||
1671 | |||
1672 | for (nr = 0; nr < v->nr_pages; nr++) | ||
1673 | counters[page_to_nid(v->pages[nr])]++; | ||
1674 | |||
1675 | for_each_node_state(nr, N_HIGH_MEMORY) | ||
1676 | if (counters[nr]) | ||
1677 | seq_printf(m, " N%u=%u", nr, counters[nr]); | ||
1678 | } | ||
1679 | } | ||
1680 | |||
934 | static int s_show(struct seq_file *m, void *p) | 1681 | static int s_show(struct seq_file *m, void *p) |
935 | { | 1682 | { |
936 | struct vm_struct *v = p; | 1683 | struct vm_struct *v = p; |
@@ -967,15 +1714,46 @@ static int s_show(struct seq_file *m, void *p) | |||
967 | if (v->flags & VM_VPAGES) | 1714 | if (v->flags & VM_VPAGES) |
968 | seq_printf(m, " vpages"); | 1715 | seq_printf(m, " vpages"); |
969 | 1716 | ||
1717 | show_numa_info(m, v); | ||
970 | seq_putc(m, '\n'); | 1718 | seq_putc(m, '\n'); |
971 | return 0; | 1719 | return 0; |
972 | } | 1720 | } |
973 | 1721 | ||
974 | const struct seq_operations vmalloc_op = { | 1722 | static const struct seq_operations vmalloc_op = { |
975 | .start = s_start, | 1723 | .start = s_start, |
976 | .next = s_next, | 1724 | .next = s_next, |
977 | .stop = s_stop, | 1725 | .stop = s_stop, |
978 | .show = s_show, | 1726 | .show = s_show, |
979 | }; | 1727 | }; |
1728 | |||
1729 | static int vmalloc_open(struct inode *inode, struct file *file) | ||
1730 | { | ||
1731 | unsigned int *ptr = NULL; | ||
1732 | int ret; | ||
1733 | |||
1734 | if (NUMA_BUILD) | ||
1735 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | ||
1736 | ret = seq_open(file, &vmalloc_op); | ||
1737 | if (!ret) { | ||
1738 | struct seq_file *m = file->private_data; | ||
1739 | m->private = ptr; | ||
1740 | } else | ||
1741 | kfree(ptr); | ||
1742 | return ret; | ||
1743 | } | ||
1744 | |||
1745 | static const struct file_operations proc_vmalloc_operations = { | ||
1746 | .open = vmalloc_open, | ||
1747 | .read = seq_read, | ||
1748 | .llseek = seq_lseek, | ||
1749 | .release = seq_release_private, | ||
1750 | }; | ||
1751 | |||
1752 | static int __init proc_vmalloc_init(void) | ||
1753 | { | ||
1754 | proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); | ||
1755 | return 0; | ||
1756 | } | ||
1757 | module_init(proc_vmalloc_init); | ||
980 | #endif | 1758 | #endif |
981 | 1759 | ||