Diffstat (limited to 'mm/vmalloc.c')
 -rw-r--r--  mm/vmalloc.c | 1038
 1 file changed, 903 insertions(+), 135 deletions(-)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 85b9a0d2c877..ba6b0f5f7fac 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
| @@ -8,26 +8,28 @@ | |||
| 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 | 8 | * Numa awareness, Christoph Lameter, SGI, June 2005 |
| 9 | */ | 9 | */ |
| 10 | 10 | ||
| 11 | #include <linux/vmalloc.h> | ||
| 11 | #include <linux/mm.h> | 12 | #include <linux/mm.h> |
| 12 | #include <linux/module.h> | 13 | #include <linux/module.h> |
| 13 | #include <linux/highmem.h> | 14 | #include <linux/highmem.h> |
| 14 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 15 | #include <linux/spinlock.h> | 16 | #include <linux/spinlock.h> |
| 16 | #include <linux/interrupt.h> | 17 | #include <linux/interrupt.h> |
| 18 | #include <linux/proc_fs.h> | ||
| 17 | #include <linux/seq_file.h> | 19 | #include <linux/seq_file.h> |
| 18 | #include <linux/debugobjects.h> | 20 | #include <linux/debugobjects.h> |
| 19 | #include <linux/vmalloc.h> | ||
| 20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
| 22 | #include <linux/list.h> | ||
| 23 | #include <linux/rbtree.h> | ||
| 24 | #include <linux/radix-tree.h> | ||
| 25 | #include <linux/rcupdate.h> | ||
| 21 | 26 | ||
| 27 | #include <asm/atomic.h> | ||
| 22 | #include <asm/uaccess.h> | 28 | #include <asm/uaccess.h> |
| 23 | #include <asm/tlbflush.h> | 29 | #include <asm/tlbflush.h> |
| 24 | 30 | ||
| 25 | 31 | ||
| 26 | DEFINE_RWLOCK(vmlist_lock); | 32 | /*** Page table manipulation functions ***/ |
| 27 | struct vm_struct *vmlist; | ||
| 28 | |||
| 29 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
| 30 | int node, void *caller); | ||
| 31 | 33 | ||
| 32 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | 34 | static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) |
| 33 | { | 35 | { |
| @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) | |||
| 40 | } while (pte++, addr += PAGE_SIZE, addr != end); | 42 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 41 | } | 43 | } |
| 42 | 44 | ||
| 43 | static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | 45 | static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) |
| 44 | unsigned long end) | ||
| 45 | { | 46 | { |
| 46 | pmd_t *pmd; | 47 | pmd_t *pmd; |
| 47 | unsigned long next; | 48 | unsigned long next; |
| @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, | |||
| 55 | } while (pmd++, addr = next, addr != end); | 56 | } while (pmd++, addr = next, addr != end); |
| 56 | } | 57 | } |
| 57 | 58 | ||
| 58 | static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | 59 | static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) |
| 59 | unsigned long end) | ||
| 60 | { | 60 | { |
| 61 | pud_t *pud; | 61 | pud_t *pud; |
| 62 | unsigned long next; | 62 | unsigned long next; |
| @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
| 70 | } while (pud++, addr = next, addr != end); | 70 | } while (pud++, addr = next, addr != end); |
| 71 | } | 71 | } |
| 72 | 72 | ||
| 73 | void unmap_kernel_range(unsigned long addr, unsigned long size) | 73 | static void vunmap_page_range(unsigned long addr, unsigned long end) |
| 74 | { | 74 | { |
| 75 | pgd_t *pgd; | 75 | pgd_t *pgd; |
| 76 | unsigned long next; | 76 | unsigned long next; |
| 77 | unsigned long start = addr; | ||
| 78 | unsigned long end = addr + size; | ||
| 79 | 77 | ||
| 80 | BUG_ON(addr >= end); | 78 | BUG_ON(addr >= end); |
| 81 | pgd = pgd_offset_k(addr); | 79 | pgd = pgd_offset_k(addr); |
| @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) | |||
| 86 | continue; | 84 | continue; |
| 87 | vunmap_pud_range(pgd, addr, next); | 85 | vunmap_pud_range(pgd, addr, next); |
| 88 | } while (pgd++, addr = next, addr != end); | 86 | } while (pgd++, addr = next, addr != end); |
| 89 | flush_tlb_kernel_range(start, end); | ||
| 90 | } | ||
| 91 | |||
| 92 | static void unmap_vm_area(struct vm_struct *area) | ||
| 93 | { | ||
| 94 | unmap_kernel_range((unsigned long)area->addr, area->size); | ||
| 95 | } | 87 | } |
| 96 | 88 | ||
| 97 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, | 89 | static int vmap_pte_range(pmd_t *pmd, unsigned long addr, |
| 98 | unsigned long end, pgprot_t prot, struct page ***pages) | 90 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 99 | { | 91 | { |
| 100 | pte_t *pte; | 92 | pte_t *pte; |
| 101 | 93 | ||
| 94 | /* | ||
| 95 | * nr is a running index into the array which helps higher level | ||
| 96 | * callers keep track of where we're up to. | ||
| 97 | */ | ||
| 98 | |||
| 102 | pte = pte_alloc_kernel(pmd, addr); | 99 | pte = pte_alloc_kernel(pmd, addr); |
| 103 | if (!pte) | 100 | if (!pte) |
| 104 | return -ENOMEM; | 101 | return -ENOMEM; |
| 105 | do { | 102 | do { |
| 106 | struct page *page = **pages; | 103 | struct page *page = pages[*nr]; |
| 107 | WARN_ON(!pte_none(*pte)); | 104 | |
| 108 | if (!page) | 105 | if (WARN_ON(!pte_none(*pte))) |
| 106 | return -EBUSY; | ||
| 107 | if (WARN_ON(!page)) | ||
| 109 | return -ENOMEM; | 108 | return -ENOMEM; |
| 110 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); | 109 | set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); |
| 111 | (*pages)++; | 110 | (*nr)++; |
| 112 | } while (pte++, addr += PAGE_SIZE, addr != end); | 111 | } while (pte++, addr += PAGE_SIZE, addr != end); |
| 113 | return 0; | 112 | return 0; |
| 114 | } | 113 | } |
| 115 | 114 | ||
| 116 | static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | 115 | static int vmap_pmd_range(pud_t *pud, unsigned long addr, |
| 117 | unsigned long end, pgprot_t prot, struct page ***pages) | 116 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 118 | { | 117 | { |
| 119 | pmd_t *pmd; | 118 | pmd_t *pmd; |
| 120 | unsigned long next; | 119 | unsigned long next; |
| @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, | |||
| 124 | return -ENOMEM; | 123 | return -ENOMEM; |
| 125 | do { | 124 | do { |
| 126 | next = pmd_addr_end(addr, end); | 125 | next = pmd_addr_end(addr, end); |
| 127 | if (vmap_pte_range(pmd, addr, next, prot, pages)) | 126 | if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) |
| 128 | return -ENOMEM; | 127 | return -ENOMEM; |
| 129 | } while (pmd++, addr = next, addr != end); | 128 | } while (pmd++, addr = next, addr != end); |
| 130 | return 0; | 129 | return 0; |
| 131 | } | 130 | } |
| 132 | 131 | ||
| 133 | static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | 132 | static int vmap_pud_range(pgd_t *pgd, unsigned long addr, |
| 134 | unsigned long end, pgprot_t prot, struct page ***pages) | 133 | unsigned long end, pgprot_t prot, struct page **pages, int *nr) |
| 135 | { | 134 | { |
| 136 | pud_t *pud; | 135 | pud_t *pud; |
| 137 | unsigned long next; | 136 | unsigned long next; |
| @@ -141,50 +140,78 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, | |||
| 141 | return -ENOMEM; | 140 | return -ENOMEM; |
| 142 | do { | 141 | do { |
| 143 | next = pud_addr_end(addr, end); | 142 | next = pud_addr_end(addr, end); |
| 144 | if (vmap_pmd_range(pud, addr, next, prot, pages)) | 143 | if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) |
| 145 | return -ENOMEM; | 144 | return -ENOMEM; |
| 146 | } while (pud++, addr = next, addr != end); | 145 | } while (pud++, addr = next, addr != end); |
| 147 | return 0; | 146 | return 0; |
| 148 | } | 147 | } |
| 149 | 148 | ||
| 150 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | 149 | /* |
| 150 | * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and | ||
| 151 | * will have pfns corresponding to the "pages" array. | ||
| 152 | * | ||
| 153 | * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] | ||
| 154 | */ | ||
| 155 | static int vmap_page_range(unsigned long addr, unsigned long end, | ||
| 156 | pgprot_t prot, struct page **pages) | ||
| 151 | { | 157 | { |
| 152 | pgd_t *pgd; | 158 | pgd_t *pgd; |
| 153 | unsigned long next; | 159 | unsigned long next; |
| 154 | unsigned long addr = (unsigned long) area->addr; | 160 | int err = 0; |
| 155 | unsigned long end = addr + area->size - PAGE_SIZE; | 161 | int nr = 0; |
| 156 | int err; | ||
| 157 | 162 | ||
| 158 | BUG_ON(addr >= end); | 163 | BUG_ON(addr >= end); |
| 159 | pgd = pgd_offset_k(addr); | 164 | pgd = pgd_offset_k(addr); |
| 160 | do { | 165 | do { |
| 161 | next = pgd_addr_end(addr, end); | 166 | next = pgd_addr_end(addr, end); |
| 162 | err = vmap_pud_range(pgd, addr, next, prot, pages); | 167 | err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); |
| 163 | if (err) | 168 | if (err) |
| 164 | break; | 169 | break; |
| 165 | } while (pgd++, addr = next, addr != end); | 170 | } while (pgd++, addr = next, addr != end); |
| 166 | flush_cache_vmap((unsigned long) area->addr, end); | 171 | flush_cache_vmap(addr, end); |
| 167 | return err; | 172 | |
| 173 | if (unlikely(err)) | ||
| 174 | return err; | ||
| 175 | return nr; | ||
| 176 | } | ||
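The signature change in this hunk (struct page ***pages becomes struct page **pages plus int *nr) is what lets vmap_page_range() return the number of pages it installed. A minimal userspace sketch of that running-index convention, with made-up names purely for illustration:

/* Toy model of the *nr convention used by vmap_pte_range() and friends:
 * each level consumes pages[*nr] and bumps *nr, so the top level can
 * report how many pages were actually mapped.  Illustrative only. */
#include <stdio.h>

static int toy_map_pte_range(int ptes_per_pmd, int npages, int *nr)
{
	int i;

	for (i = 0; i < ptes_per_pmd && *nr < npages; i++)
		(*nr)++;		/* set_pte_at() would go here */
	return 0;			/* 0 on success, -errno on failure */
}

static int toy_map_range(int npages)
{
	int nr = 0;

	while (nr < npages)
		if (toy_map_pte_range(4, npages, &nr))
			return -1;
	return nr;	/* like vmap_page_range(): pages actually mapped */
}

int main(void)
{
	printf("mapped %d pages\n", toy_map_range(10));	/* prints 10 */
	return 0;
}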
| 177 | |||
| 178 | static inline int is_vmalloc_or_module_addr(const void *x) | ||
| 179 | { | ||
| 180 | /* | ||
| 181 | * ARM, x86-64 and sparc64 put modules in a special place, | ||
| 182 | * and fall back on vmalloc() if that fails. Others | ||
| 183 | * just put it in the vmalloc space. | ||
| 184 | */ | ||
| 185 | #if defined(CONFIG_MODULES) && defined(MODULES_VADDR) | ||
| 186 | unsigned long addr = (unsigned long)x; | ||
| 187 | if (addr >= MODULES_VADDR && addr < MODULES_END) | ||
| 188 | return 1; | ||
| 189 | #endif | ||
| 190 | return is_vmalloc_addr(x); | ||
| 168 | } | 191 | } |
| 169 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
| 170 | 192 | ||
| 171 | /* | 193 | /* |
| 172 | * Map a vmalloc()-space virtual address to the physical page. | 194 | * Walk a vmap address to the struct page it maps. |
| 173 | */ | 195 | */ |
| 174 | struct page *vmalloc_to_page(const void *vmalloc_addr) | 196 | struct page *vmalloc_to_page(const void *vmalloc_addr) |
| 175 | { | 197 | { |
| 176 | unsigned long addr = (unsigned long) vmalloc_addr; | 198 | unsigned long addr = (unsigned long) vmalloc_addr; |
| 177 | struct page *page = NULL; | 199 | struct page *page = NULL; |
| 178 | pgd_t *pgd = pgd_offset_k(addr); | 200 | pgd_t *pgd = pgd_offset_k(addr); |
| 179 | pud_t *pud; | 201 | |
| 180 | pmd_t *pmd; | 202 | /* |
| 181 | pte_t *ptep, pte; | 203 | * XXX we might need to change this if we add VIRTUAL_BUG_ON for |
| 204 | * architectures that do not vmalloc module space | ||
| 205 | */ | ||
| 206 | VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); | ||
| 182 | 207 | ||
| 183 | if (!pgd_none(*pgd)) { | 208 | if (!pgd_none(*pgd)) { |
| 184 | pud = pud_offset(pgd, addr); | 209 | pud_t *pud = pud_offset(pgd, addr); |
| 185 | if (!pud_none(*pud)) { | 210 | if (!pud_none(*pud)) { |
| 186 | pmd = pmd_offset(pud, addr); | 211 | pmd_t *pmd = pmd_offset(pud, addr); |
| 187 | if (!pmd_none(*pmd)) { | 212 | if (!pmd_none(*pmd)) { |
| 213 | pte_t *ptep, pte; | ||
| 214 | |||
| 188 | ptep = pte_offset_map(pmd, addr); | 215 | ptep = pte_offset_map(pmd, addr); |
| 189 | pte = *ptep; | 216 | pte = *ptep; |
| 190 | if (pte_present(pte)) | 217 | if (pte_present(pte)) |
| @@ -206,13 +233,759 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) | |||
| 206 | } | 233 | } |
| 207 | EXPORT_SYMBOL(vmalloc_to_pfn); | 234 | EXPORT_SYMBOL(vmalloc_to_pfn); |
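As a quick illustration of the lookup helper above, here is a hypothetical kernel-style caller (not part of this patch; the helper name and scenario are my own) that resolves the struct page backing an offset inside a vmalloc() buffer:

/* Hypothetical caller of vmalloc_to_page(); illustrative only. */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static struct page *page_at_vmalloc_offset(void *buf, unsigned long offset)
{
	void *addr = (char *)buf + offset;

	if (!is_vmalloc_addr(addr))	/* lookup only valid for vmap space */
		return NULL;

	return vmalloc_to_page(addr);	/* walks the init_mm page tables */
}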
| 208 | 235 | ||
| 209 | static struct vm_struct * | 236 | |
| 210 | __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | 237 | /*** Global kva allocator ***/ |
| 211 | unsigned long end, int node, gfp_t gfp_mask, void *caller) | 238 | |
| 239 | #define VM_LAZY_FREE 0x01 | ||
| 240 | #define VM_LAZY_FREEING 0x02 | ||
| 241 | #define VM_VM_AREA 0x04 | ||
| 242 | |||
| 243 | struct vmap_area { | ||
| 244 | unsigned long va_start; | ||
| 245 | unsigned long va_end; | ||
| 246 | unsigned long flags; | ||
| 247 | struct rb_node rb_node; /* address sorted rbtree */ | ||
| 248 | struct list_head list; /* address sorted list */ | ||
| 249 | struct list_head purge_list; /* "lazy purge" list */ | ||
| 250 | void *private; | ||
| 251 | struct rcu_head rcu_head; | ||
| 252 | }; | ||
| 253 | |||
| 254 | static DEFINE_SPINLOCK(vmap_area_lock); | ||
| 255 | static struct rb_root vmap_area_root = RB_ROOT; | ||
| 256 | static LIST_HEAD(vmap_area_list); | ||
| 257 | |||
| 258 | static struct vmap_area *__find_vmap_area(unsigned long addr) | ||
| 212 | { | 259 | { |
| 213 | struct vm_struct **p, *tmp, *area; | 260 | struct rb_node *n = vmap_area_root.rb_node; |
| 214 | unsigned long align = 1; | 261 | |
| 262 | while (n) { | ||
| 263 | struct vmap_area *va; | ||
| 264 | |||
| 265 | va = rb_entry(n, struct vmap_area, rb_node); | ||
| 266 | if (addr < va->va_start) | ||
| 267 | n = n->rb_left; | ||
| 268 | else if (addr > va->va_start) | ||
| 269 | n = n->rb_right; | ||
| 270 | else | ||
| 271 | return va; | ||
| 272 | } | ||
| 273 | |||
| 274 | return NULL; | ||
| 275 | } | ||
| 276 | |||
| 277 | static void __insert_vmap_area(struct vmap_area *va) | ||
| 278 | { | ||
| 279 | struct rb_node **p = &vmap_area_root.rb_node; | ||
| 280 | struct rb_node *parent = NULL; | ||
| 281 | struct rb_node *tmp; | ||
| 282 | |||
| 283 | while (*p) { | ||
| 284 | struct vmap_area *tmp; | ||
| 285 | |||
| 286 | parent = *p; | ||
| 287 | tmp = rb_entry(parent, struct vmap_area, rb_node); | ||
| 288 | if (va->va_start < tmp->va_end) | ||
| 289 | p = &(*p)->rb_left; | ||
| 290 | else if (va->va_end > tmp->va_start) | ||
| 291 | p = &(*p)->rb_right; | ||
| 292 | else | ||
| 293 | BUG(); | ||
| 294 | } | ||
| 295 | |||
| 296 | rb_link_node(&va->rb_node, parent, p); | ||
| 297 | rb_insert_color(&va->rb_node, &vmap_area_root); | ||
| 298 | |||
| 299 | /* address-sort this list so it is usable like the vmlist */ | ||
| 300 | tmp = rb_prev(&va->rb_node); | ||
| 301 | if (tmp) { | ||
| 302 | struct vmap_area *prev; | ||
| 303 | prev = rb_entry(tmp, struct vmap_area, rb_node); | ||
| 304 | list_add_rcu(&va->list, &prev->list); | ||
| 305 | } else | ||
| 306 | list_add_rcu(&va->list, &vmap_area_list); | ||
| 307 | } | ||
| 308 | |||
| 309 | static void purge_vmap_area_lazy(void); | ||
| 310 | |||
| 311 | /* | ||
| 312 | * Allocate a region of KVA of the specified size and alignment, within the | ||
| 313 | * vstart and vend. | ||
| 314 | */ | ||
| 315 | static struct vmap_area *alloc_vmap_area(unsigned long size, | ||
| 316 | unsigned long align, | ||
| 317 | unsigned long vstart, unsigned long vend, | ||
| 318 | int node, gfp_t gfp_mask) | ||
| 319 | { | ||
| 320 | struct vmap_area *va; | ||
| 321 | struct rb_node *n; | ||
| 322 | unsigned long addr; | ||
| 323 | int purged = 0; | ||
| 324 | |||
| 325 | BUG_ON(size & ~PAGE_MASK); | ||
| 326 | |||
| 327 | addr = ALIGN(vstart, align); | ||
| 328 | |||
| 329 | va = kmalloc_node(sizeof(struct vmap_area), | ||
| 330 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
| 331 | if (unlikely(!va)) | ||
| 332 | return ERR_PTR(-ENOMEM); | ||
| 333 | |||
| 334 | retry: | ||
| 335 | spin_lock(&vmap_area_lock); | ||
| 336 | /* XXX: could have a last_hole cache */ | ||
| 337 | n = vmap_area_root.rb_node; | ||
| 338 | if (n) { | ||
| 339 | struct vmap_area *first = NULL; | ||
| 340 | |||
| 341 | do { | ||
| 342 | struct vmap_area *tmp; | ||
| 343 | tmp = rb_entry(n, struct vmap_area, rb_node); | ||
| 344 | if (tmp->va_end >= addr) { | ||
| 345 | if (!first && tmp->va_start < addr + size) | ||
| 346 | first = tmp; | ||
| 347 | n = n->rb_left; | ||
| 348 | } else { | ||
| 349 | first = tmp; | ||
| 350 | n = n->rb_right; | ||
| 351 | } | ||
| 352 | } while (n); | ||
| 353 | |||
| 354 | if (!first) | ||
| 355 | goto found; | ||
| 356 | |||
| 357 | if (first->va_end < addr) { | ||
| 358 | n = rb_next(&first->rb_node); | ||
| 359 | if (n) | ||
| 360 | first = rb_entry(n, struct vmap_area, rb_node); | ||
| 361 | else | ||
| 362 | goto found; | ||
| 363 | } | ||
| 364 | |||
| 365 | while (addr + size >= first->va_start && addr + size <= vend) { | ||
| 366 | addr = ALIGN(first->va_end + PAGE_SIZE, align); | ||
| 367 | |||
| 368 | n = rb_next(&first->rb_node); | ||
| 369 | if (n) | ||
| 370 | first = rb_entry(n, struct vmap_area, rb_node); | ||
| 371 | else | ||
| 372 | goto found; | ||
| 373 | } | ||
| 374 | } | ||
| 375 | found: | ||
| 376 | if (addr + size > vend) { | ||
| 377 | spin_unlock(&vmap_area_lock); | ||
| 378 | if (!purged) { | ||
| 379 | purge_vmap_area_lazy(); | ||
| 380 | purged = 1; | ||
| 381 | goto retry; | ||
| 382 | } | ||
| 383 | if (printk_ratelimit()) | ||
| 384 | printk(KERN_WARNING "vmap allocation failed: " | ||
| 385 | "use vmalloc=<size> to increase size.\n"); | ||
| 386 | return ERR_PTR(-EBUSY); | ||
| 387 | } | ||
| 388 | |||
| 389 | BUG_ON(addr & (align-1)); | ||
| 390 | |||
| 391 | va->va_start = addr; | ||
| 392 | va->va_end = addr + size; | ||
| 393 | va->flags = 0; | ||
| 394 | __insert_vmap_area(va); | ||
| 395 | spin_unlock(&vmap_area_lock); | ||
| 396 | |||
| 397 | return va; | ||
| 398 | } | ||
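The loop above is a first-fit search: starting from ALIGN(vstart, align), it walks the address-sorted tree, skipping past existing areas until a hole large enough is found, and retries once after a lazy purge before giving up. A simplified userspace sketch of the same first-fit idea over an address-sorted array (no rbtree, no alignment, no locking; all names are illustrative):

/* First-fit hole search over address-sorted busy ranges (illustration). */
#include <stdio.h>

struct range { unsigned long start, end; };	/* [start, end) is busy */

static unsigned long first_fit(struct range *busy, int n,
			       unsigned long vstart, unsigned long vend,
			       unsigned long size)
{
	unsigned long addr = vstart;
	int i;

	for (i = 0; i < n; i++) {
		if (addr + size <= busy[i].start)
			break;			/* hole before busy[i] fits */
		if (addr < busy[i].end)
			addr = busy[i].end;	/* skip past this allocation */
	}
	return (addr + size <= vend) ? addr : 0;	/* 0 == no space */
}

int main(void)
{
	struct range busy[] = { { 0x1000, 0x3000 }, { 0x4000, 0x6000 } };

	/* finds the hole at 0x3000 between the two busy ranges */
	printf("hole at %#lx\n", first_fit(busy, 2, 0x1000, 0x10000, 0x1000));
	return 0;
}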
| 399 | |||
| 400 | static void rcu_free_va(struct rcu_head *head) | ||
| 401 | { | ||
| 402 | struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); | ||
| 403 | |||
| 404 | kfree(va); | ||
| 405 | } | ||
| 406 | |||
| 407 | static void __free_vmap_area(struct vmap_area *va) | ||
| 408 | { | ||
| 409 | BUG_ON(RB_EMPTY_NODE(&va->rb_node)); | ||
| 410 | rb_erase(&va->rb_node, &vmap_area_root); | ||
| 411 | RB_CLEAR_NODE(&va->rb_node); | ||
| 412 | list_del_rcu(&va->list); | ||
| 413 | |||
| 414 | call_rcu(&va->rcu_head, rcu_free_va); | ||
| 415 | } | ||
| 416 | |||
| 417 | /* | ||
| 418 | * Free a region of KVA allocated by alloc_vmap_area | ||
| 419 | */ | ||
| 420 | static void free_vmap_area(struct vmap_area *va) | ||
| 421 | { | ||
| 422 | spin_lock(&vmap_area_lock); | ||
| 423 | __free_vmap_area(va); | ||
| 424 | spin_unlock(&vmap_area_lock); | ||
| 425 | } | ||
| 426 | |||
| 427 | /* | ||
| 428 | * Clear the pagetable entries of a given vmap_area | ||
| 429 | */ | ||
| 430 | static void unmap_vmap_area(struct vmap_area *va) | ||
| 431 | { | ||
| 432 | vunmap_page_range(va->va_start, va->va_end); | ||
| 433 | } | ||
| 434 | |||
| 435 | /* | ||
| 436 | * lazy_max_pages is the maximum amount of virtual address space we gather up | ||
| 437 | * before attempting to purge with a TLB flush. | ||
| 438 | * | ||
| 439 | * There is a tradeoff here: a larger number will cover more kernel page tables | ||
| 440 | * and take slightly longer to purge, but it will linearly reduce the number of | ||
| 441 | * global TLB flushes that must be performed. It would seem natural to scale | ||
| 442 | * this number up linearly with the number of CPUs (because vmapping activity | ||
| 443 | * could also scale linearly with the number of CPUs), however it is likely | ||
| 444 | * that in practice, workloads might be constrained in other ways that mean | ||
| 445 | * vmap activity will not scale linearly with CPUs. Also, I want to be | ||
| 446 | * conservative and not introduce a big latency on huge systems, so go with | ||
| 447 | * a less aggressive log scale. It will still be an improvement over the old | ||
| 448 | * code, and it will be simple to change the scale factor if we find that it | ||
| 449 | * becomes a problem on bigger systems. | ||
| 450 | */ | ||
| 451 | static unsigned long lazy_max_pages(void) | ||
| 452 | { | ||
| 453 | unsigned int log; | ||
| 454 | |||
| 455 | log = fls(num_online_cpus()); | ||
| 456 | |||
| 457 | return log * (32UL * 1024 * 1024 / PAGE_SIZE); | ||
| 458 | } | ||
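To make the log scaling concrete (assuming 4 KB pages, so 32 MB is 8192 pages): fls(1) = 1 allows 32 MB of lazily freed space before a purge, fls(4) = 3 allows 96 MB, and fls(64) = 7 allows 224 MB, i.e. the threshold grows with the logarithm of the online CPU count rather than linearly. A trivial userspace check of that arithmetic:

/* Reproduces lazy_max_pages() arithmetic in userspace (4 KB pages assumed). */
#include <stdio.h>

static unsigned int fls_u(unsigned int x)	/* most significant set bit, 1-based */
{
	unsigned int r = 0;

	while (x) {
		r++;
		x >>= 1;
	}
	return r;
}

int main(void)
{
	unsigned long page_size = 4096;
	int cpus[] = { 1, 2, 4, 16, 64 };
	int i;

	for (i = 0; i < 5; i++) {
		unsigned long pages = fls_u(cpus[i]) * (32UL * 1024 * 1024 / page_size);

		printf("%2d cpus -> %lu pages (%lu MB) lazily freed before purge\n",
		       cpus[i], pages, pages * page_size >> 20);
	}
	return 0;
}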
| 459 | |||
| 460 | static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); | ||
| 461 | |||
| 462 | /* | ||
| 463 | * Purges all lazily-freed vmap areas. | ||
| 464 | * | ||
| 465 | * If sync is 0 then don't purge if there is already a purge in progress. | ||
| 466 | * If force_flush is 1, then flush kernel TLBs between *start and *end even | ||
| 467 | * if we found no lazy vmap areas to unmap (callers can use this to optimise | ||
| 468 | * their own TLB flushing). | ||
| 469 | * Returns with *start = min(*start, lowest purged address) | ||
| 470 | * *end = max(*end, highest purged address) | ||
| 471 | */ | ||
| 472 | static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, | ||
| 473 | int sync, int force_flush) | ||
| 474 | { | ||
| 475 | static DEFINE_SPINLOCK(purge_lock); | ||
| 476 | LIST_HEAD(valist); | ||
| 477 | struct vmap_area *va; | ||
| 478 | int nr = 0; | ||
| 479 | |||
| 480 | /* | ||
| 481 | * If sync is 0 but force_flush is 1, we'll go sync anyway but callers | ||
| 482 | * should not expect such behaviour. This just simplifies locking for | ||
| 483 | * the case that isn't actually used at the moment anyway. | ||
| 484 | */ | ||
| 485 | if (!sync && !force_flush) { | ||
| 486 | if (!spin_trylock(&purge_lock)) | ||
| 487 | return; | ||
| 488 | } else | ||
| 489 | spin_lock(&purge_lock); | ||
| 490 | |||
| 491 | rcu_read_lock(); | ||
| 492 | list_for_each_entry_rcu(va, &vmap_area_list, list) { | ||
| 493 | if (va->flags & VM_LAZY_FREE) { | ||
| 494 | if (va->va_start < *start) | ||
| 495 | *start = va->va_start; | ||
| 496 | if (va->va_end > *end) | ||
| 497 | *end = va->va_end; | ||
| 498 | nr += (va->va_end - va->va_start) >> PAGE_SHIFT; | ||
| 499 | unmap_vmap_area(va); | ||
| 500 | list_add_tail(&va->purge_list, &valist); | ||
| 501 | va->flags |= VM_LAZY_FREEING; | ||
| 502 | va->flags &= ~VM_LAZY_FREE; | ||
| 503 | } | ||
| 504 | } | ||
| 505 | rcu_read_unlock(); | ||
| 506 | |||
| 507 | if (nr) { | ||
| 508 | BUG_ON(nr > atomic_read(&vmap_lazy_nr)); | ||
| 509 | atomic_sub(nr, &vmap_lazy_nr); | ||
| 510 | } | ||
| 511 | |||
| 512 | if (nr || force_flush) | ||
| 513 | flush_tlb_kernel_range(*start, *end); | ||
| 514 | |||
| 515 | if (nr) { | ||
| 516 | spin_lock(&vmap_area_lock); | ||
| 517 | list_for_each_entry(va, &valist, purge_list) | ||
| 518 | __free_vmap_area(va); | ||
| 519 | spin_unlock(&vmap_area_lock); | ||
| 520 | } | ||
| 521 | spin_unlock(&purge_lock); | ||
| 522 | } | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Kick off a purge of the outstanding lazy areas. | ||
| 526 | */ | ||
| 527 | static void purge_vmap_area_lazy(void) | ||
| 528 | { | ||
| 529 | unsigned long start = ULONG_MAX, end = 0; | ||
| 530 | |||
| 531 | __purge_vmap_area_lazy(&start, &end, 0, 0); | ||
| 532 | } | ||
| 533 | |||
| 534 | /* | ||
| 535 | * Free and unmap a vmap area | ||
| 536 | */ | ||
| 537 | static void free_unmap_vmap_area(struct vmap_area *va) | ||
| 538 | { | ||
| 539 | va->flags |= VM_LAZY_FREE; | ||
| 540 | atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); | ||
| 541 | if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) | ||
| 542 | purge_vmap_area_lazy(); | ||
| 543 | } | ||
| 544 | |||
| 545 | static struct vmap_area *find_vmap_area(unsigned long addr) | ||
| 546 | { | ||
| 547 | struct vmap_area *va; | ||
| 548 | |||
| 549 | spin_lock(&vmap_area_lock); | ||
| 550 | va = __find_vmap_area(addr); | ||
| 551 | spin_unlock(&vmap_area_lock); | ||
| 552 | |||
| 553 | return va; | ||
| 554 | } | ||
| 555 | |||
| 556 | static void free_unmap_vmap_area_addr(unsigned long addr) | ||
| 557 | { | ||
| 558 | struct vmap_area *va; | ||
| 559 | |||
| 560 | va = find_vmap_area(addr); | ||
| 561 | BUG_ON(!va); | ||
| 562 | free_unmap_vmap_area(va); | ||
| 563 | } | ||
| 564 | |||
| 565 | |||
| 566 | /*** Per cpu kva allocator ***/ | ||
| 567 | |||
| 568 | /* | ||
| 569 | * vmap space is limited especially on 32 bit architectures. Ensure there is | ||
| 570 | * room for at least 16 percpu vmap blocks per CPU. | ||
| 571 | */ | ||
| 572 | /* | ||
| 573 | * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able | ||
| 574 | * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess | ||
| 575 | * instead (we just need a rough idea) | ||
| 576 | */ | ||
| 577 | #if BITS_PER_LONG == 32 | ||
| 578 | #define VMALLOC_SPACE (128UL*1024*1024) | ||
| 579 | #else | ||
| 580 | #define VMALLOC_SPACE (128UL*1024*1024*1024) | ||
| 581 | #endif | ||
| 582 | |||
| 583 | #define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) | ||
| 584 | #define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ | ||
| 585 | #define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ | ||
| 586 | #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) | ||
| 587 | #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ | ||
| 588 | #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ | ||
| 589 | #define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ | ||
| 590 | VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ | ||
| 591 | VMALLOC_PAGES / NR_CPUS / 16)) | ||
| 592 | |||
| 593 | #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) | ||
| 594 | |||
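Plugging representative numbers into the macros above (my assumptions: 32-bit, 4 KB pages, NR_CPUS = 4): VMALLOC_PAGES = 128 MB / 4 KB = 32768, and 32768 / 4 / 16 = 512, which falls between VMAP_BBMAP_BITS_MIN (64) and VMAP_BBMAP_BITS_MAX (1024), giving VMAP_BBMAP_BITS = 512 and a 2 MB VMAP_BLOCK_SIZE. A standalone re-derivation of that arithmetic:

/* Userspace re-derivation of the vmap block sizing, under the stated
 * assumptions (32-bit, 4 KB pages, NR_CPUS == 4). */
#include <stdio.h>

#define PAGE_SIZE		4096UL
#define NR_CPUS			4
#define BITS_PER_LONG		32

#define VMALLOC_SPACE		(128UL * 1024 * 1024)	/* 32-bit guess */
#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG
#define VMAP_BBMAP_BITS_MAX	1024
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC * 2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y))
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y))
#define VMAP_BBMAP_BITS		VMAP_MIN(VMAP_BBMAP_BITS_MAX,		\
				VMAP_MAX(VMAP_BBMAP_BITS_MIN,		\
					VMALLOC_PAGES / NR_CPUS / 16))
#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

int main(void)
{
	/* prints: VMAP_BBMAP_BITS = 512, VMAP_BLOCK_SIZE = 2048 KB */
	printf("VMAP_BBMAP_BITS = %lu, VMAP_BLOCK_SIZE = %lu KB\n",
	       (unsigned long)VMAP_BBMAP_BITS, VMAP_BLOCK_SIZE / 1024);
	return 0;
}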
| 595 | static bool vmap_initialized __read_mostly = false; | ||
| 596 | |||
| 597 | struct vmap_block_queue { | ||
| 598 | spinlock_t lock; | ||
| 599 | struct list_head free; | ||
| 600 | struct list_head dirty; | ||
| 601 | unsigned int nr_dirty; | ||
| 602 | }; | ||
| 603 | |||
| 604 | struct vmap_block { | ||
| 605 | spinlock_t lock; | ||
| 606 | struct vmap_area *va; | ||
| 607 | struct vmap_block_queue *vbq; | ||
| 608 | unsigned long free, dirty; | ||
| 609 | DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); | ||
| 610 | DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); | ||
| 611 | union { | ||
| 612 | struct { | ||
| 613 | struct list_head free_list; | ||
| 614 | struct list_head dirty_list; | ||
| 615 | }; | ||
| 616 | struct rcu_head rcu_head; | ||
| 617 | }; | ||
| 618 | }; | ||
| 619 | |||
| 620 | /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ | ||
| 621 | static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); | ||
| 622 | |||
| 623 | /* | ||
| 624 | * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block | ||
| 625 | * in the free path. Could get rid of this if we change the API to return a | ||
| 626 | * "cookie" from alloc, to be passed to free. But no big deal yet. | ||
| 627 | */ | ||
| 628 | static DEFINE_SPINLOCK(vmap_block_tree_lock); | ||
| 629 | static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); | ||
| 630 | |||
| 631 | /* | ||
| 632 | * We should probably have a fallback mechanism to allocate virtual memory | ||
| 633 | * out of partially filled vmap blocks. However vmap block sizing should be | ||
| 634 | * fairly reasonable according to the vmalloc size, so it shouldn't be a | ||
| 635 | * big problem. | ||
| 636 | */ | ||
| 637 | |||
| 638 | static unsigned long addr_to_vb_idx(unsigned long addr) | ||
| 639 | { | ||
| 640 | addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); | ||
| 641 | addr /= VMAP_BLOCK_SIZE; | ||
| 642 | return addr; | ||
| 643 | } | ||
| 644 | |||
| 645 | static struct vmap_block *new_vmap_block(gfp_t gfp_mask) | ||
| 646 | { | ||
| 647 | struct vmap_block_queue *vbq; | ||
| 648 | struct vmap_block *vb; | ||
| 649 | struct vmap_area *va; | ||
| 650 | unsigned long vb_idx; | ||
| 651 | int node, err; | ||
| 652 | |||
| 653 | node = numa_node_id(); | ||
| 654 | |||
| 655 | vb = kmalloc_node(sizeof(struct vmap_block), | ||
| 656 | gfp_mask & GFP_RECLAIM_MASK, node); | ||
| 657 | if (unlikely(!vb)) | ||
| 658 | return ERR_PTR(-ENOMEM); | ||
| 659 | |||
| 660 | va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, | ||
| 661 | VMALLOC_START, VMALLOC_END, | ||
| 662 | node, gfp_mask); | ||
| 663 | if (unlikely(IS_ERR(va))) { | ||
| 664 | kfree(vb); | ||
| 665 | return ERR_PTR(PTR_ERR(va)); | ||
| 666 | } | ||
| 667 | |||
| 668 | err = radix_tree_preload(gfp_mask); | ||
| 669 | if (unlikely(err)) { | ||
| 670 | kfree(vb); | ||
| 671 | free_vmap_area(va); | ||
| 672 | return ERR_PTR(err); | ||
| 673 | } | ||
| 674 | |||
| 675 | spin_lock_init(&vb->lock); | ||
| 676 | vb->va = va; | ||
| 677 | vb->free = VMAP_BBMAP_BITS; | ||
| 678 | vb->dirty = 0; | ||
| 679 | bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); | ||
| 680 | bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); | ||
| 681 | INIT_LIST_HEAD(&vb->free_list); | ||
| 682 | INIT_LIST_HEAD(&vb->dirty_list); | ||
| 683 | |||
| 684 | vb_idx = addr_to_vb_idx(va->va_start); | ||
| 685 | spin_lock(&vmap_block_tree_lock); | ||
| 686 | err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); | ||
| 687 | spin_unlock(&vmap_block_tree_lock); | ||
| 688 | BUG_ON(err); | ||
| 689 | radix_tree_preload_end(); | ||
| 690 | |||
| 691 | vbq = &get_cpu_var(vmap_block_queue); | ||
| 692 | vb->vbq = vbq; | ||
| 693 | spin_lock(&vbq->lock); | ||
| 694 | list_add(&vb->free_list, &vbq->free); | ||
| 695 | spin_unlock(&vbq->lock); | ||
| 696 | put_cpu_var(vmap_cpu_blocks); | ||
| 697 | |||
| 698 | return vb; | ||
| 699 | } | ||
| 700 | |||
| 701 | static void rcu_free_vb(struct rcu_head *head) | ||
| 702 | { | ||
| 703 | struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); | ||
| 704 | |||
| 705 | kfree(vb); | ||
| 706 | } | ||
| 707 | |||
| 708 | static void free_vmap_block(struct vmap_block *vb) | ||
| 709 | { | ||
| 710 | struct vmap_block *tmp; | ||
| 711 | unsigned long vb_idx; | ||
| 712 | |||
| 713 | spin_lock(&vb->vbq->lock); | ||
| 714 | if (!list_empty(&vb->free_list)) | ||
| 715 | list_del(&vb->free_list); | ||
| 716 | if (!list_empty(&vb->dirty_list)) | ||
| 717 | list_del(&vb->dirty_list); | ||
| 718 | spin_unlock(&vb->vbq->lock); | ||
| 719 | |||
| 720 | vb_idx = addr_to_vb_idx(vb->va->va_start); | ||
| 721 | spin_lock(&vmap_block_tree_lock); | ||
| 722 | tmp = radix_tree_delete(&vmap_block_tree, vb_idx); | ||
| 723 | spin_unlock(&vmap_block_tree_lock); | ||
| 724 | BUG_ON(tmp != vb); | ||
| 725 | |||
| 726 | free_unmap_vmap_area(vb->va); | ||
| 727 | call_rcu(&vb->rcu_head, rcu_free_vb); | ||
| 728 | } | ||
| 729 | |||
| 730 | static void *vb_alloc(unsigned long size, gfp_t gfp_mask) | ||
| 731 | { | ||
| 732 | struct vmap_block_queue *vbq; | ||
| 733 | struct vmap_block *vb; | ||
| 734 | unsigned long addr = 0; | ||
| 735 | unsigned int order; | ||
| 736 | |||
| 737 | BUG_ON(size & ~PAGE_MASK); | ||
| 738 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
| 739 | order = get_order(size); | ||
| 740 | |||
| 741 | again: | ||
| 742 | rcu_read_lock(); | ||
| 743 | vbq = &get_cpu_var(vmap_block_queue); | ||
| 744 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
| 745 | int i; | ||
| 746 | |||
| 747 | spin_lock(&vb->lock); | ||
| 748 | i = bitmap_find_free_region(vb->alloc_map, | ||
| 749 | VMAP_BBMAP_BITS, order); | ||
| 750 | |||
| 751 | if (i >= 0) { | ||
| 752 | addr = vb->va->va_start + (i << PAGE_SHIFT); | ||
| 753 | BUG_ON(addr_to_vb_idx(addr) != | ||
| 754 | addr_to_vb_idx(vb->va->va_start)); | ||
| 755 | vb->free -= 1UL << order; | ||
| 756 | if (vb->free == 0) { | ||
| 757 | spin_lock(&vbq->lock); | ||
| 758 | list_del_init(&vb->free_list); | ||
| 759 | spin_unlock(&vbq->lock); | ||
| 760 | } | ||
| 761 | spin_unlock(&vb->lock); | ||
| 762 | break; | ||
| 763 | } | ||
| 764 | spin_unlock(&vb->lock); | ||
| 765 | } | ||
| 766 | put_cpu_var(vmap_cpu_blocks); | ||
| 767 | rcu_read_unlock(); | ||
| 768 | |||
| 769 | if (!addr) { | ||
| 770 | vb = new_vmap_block(gfp_mask); | ||
| 771 | if (IS_ERR(vb)) | ||
| 772 | return vb; | ||
| 773 | goto again; | ||
| 774 | } | ||
| 775 | |||
| 776 | return (void *)addr; | ||
| 777 | } | ||
| 778 | |||
| 779 | static void vb_free(const void *addr, unsigned long size) | ||
| 780 | { | ||
| 781 | unsigned long offset; | ||
| 782 | unsigned long vb_idx; | ||
| 783 | unsigned int order; | ||
| 784 | struct vmap_block *vb; | ||
| 785 | |||
| 786 | BUG_ON(size & ~PAGE_MASK); | ||
| 787 | BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); | ||
| 788 | order = get_order(size); | ||
| 789 | |||
| 790 | offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); | ||
| 791 | |||
| 792 | vb_idx = addr_to_vb_idx((unsigned long)addr); | ||
| 793 | rcu_read_lock(); | ||
| 794 | vb = radix_tree_lookup(&vmap_block_tree, vb_idx); | ||
| 795 | rcu_read_unlock(); | ||
| 796 | BUG_ON(!vb); | ||
| 797 | |||
| 798 | spin_lock(&vb->lock); | ||
| 799 | bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); | ||
| 800 | if (!vb->dirty) { | ||
| 801 | spin_lock(&vb->vbq->lock); | ||
| 802 | list_add(&vb->dirty_list, &vb->vbq->dirty); | ||
| 803 | spin_unlock(&vb->vbq->lock); | ||
| 804 | } | ||
| 805 | vb->dirty += 1UL << order; | ||
| 806 | if (vb->dirty == VMAP_BBMAP_BITS) { | ||
| 807 | BUG_ON(vb->free || !list_empty(&vb->free_list)); | ||
| 808 | spin_unlock(&vb->lock); | ||
| 809 | free_vmap_block(vb); | ||
| 810 | } else | ||
| 811 | spin_unlock(&vb->lock); | ||
| 812 | } | ||
| 813 | |||
| 814 | /** | ||
| 815 | * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer | ||
| 816 | * | ||
| 817 | * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily | ||
| 818 | * to amortize TLB flushing overheads. What this means is that any page you | ||
| 819 | * have now, may, in a former life, have been mapped into kernel virtual | ||
| 820 | * address by the vmap layer and so there might be some CPUs with TLB entries | ||
| 821 | * still referencing that page (additional to the regular 1:1 kernel mapping). | ||
| 822 | * | ||
| 823 | * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can | ||
| 824 | * be sure that none of the pages we have control over will have any aliases | ||
| 825 | * from the vmap layer. | ||
| 826 | */ | ||
| 827 | void vm_unmap_aliases(void) | ||
| 828 | { | ||
| 829 | unsigned long start = ULONG_MAX, end = 0; | ||
| 830 | int cpu; | ||
| 831 | int flush = 0; | ||
| 832 | |||
| 833 | if (unlikely(!vmap_initialized)) | ||
| 834 | return; | ||
| 835 | |||
| 836 | for_each_possible_cpu(cpu) { | ||
| 837 | struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); | ||
| 838 | struct vmap_block *vb; | ||
| 839 | |||
| 840 | rcu_read_lock(); | ||
| 841 | list_for_each_entry_rcu(vb, &vbq->free, free_list) { | ||
| 842 | int i; | ||
| 843 | |||
| 844 | spin_lock(&vb->lock); | ||
| 845 | i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); | ||
| 846 | while (i < VMAP_BBMAP_BITS) { | ||
| 847 | unsigned long s, e; | ||
| 848 | int j; | ||
| 849 | j = find_next_zero_bit(vb->dirty_map, | ||
| 850 | VMAP_BBMAP_BITS, i); | ||
| 851 | |||
| 852 | s = vb->va->va_start + (i << PAGE_SHIFT); | ||
| 853 | e = vb->va->va_start + (j << PAGE_SHIFT); | ||
| 854 | vunmap_page_range(s, e); | ||
| 855 | flush = 1; | ||
| 856 | |||
| 857 | if (s < start) | ||
| 858 | start = s; | ||
| 859 | if (e > end) | ||
| 860 | end = e; | ||
| 861 | |||
| 862 | i = j; | ||
| 863 | i = find_next_bit(vb->dirty_map, | ||
| 864 | VMAP_BBMAP_BITS, i); | ||
| 865 | } | ||
| 866 | spin_unlock(&vb->lock); | ||
| 867 | } | ||
| 868 | rcu_read_unlock(); | ||
| 869 | } | ||
| 870 | |||
| 871 | __purge_vmap_area_lazy(&start, &end, 1, flush); | ||
| 872 | } | ||
| 873 | EXPORT_SYMBOL_GPL(vm_unmap_aliases); | ||
| 874 | |||
| 875 | /** | ||
| 876 | * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram | ||
| 877 | * @mem: the pointer returned by vm_map_ram | ||
| 878 | * @count: the count passed to that vm_map_ram call (cannot unmap partial) | ||
| 879 | */ | ||
| 880 | void vm_unmap_ram(const void *mem, unsigned int count) | ||
| 881 | { | ||
| 882 | unsigned long size = count << PAGE_SHIFT; | ||
| 883 | unsigned long addr = (unsigned long)mem; | ||
| 884 | |||
| 885 | BUG_ON(!addr); | ||
| 886 | BUG_ON(addr < VMALLOC_START); | ||
| 887 | BUG_ON(addr > VMALLOC_END); | ||
| 888 | BUG_ON(addr & (PAGE_SIZE-1)); | ||
| 889 | |||
| 890 | debug_check_no_locks_freed(mem, size); | ||
| 891 | |||
| 892 | if (likely(count <= VMAP_MAX_ALLOC)) | ||
| 893 | vb_free(mem, size); | ||
| 894 | else | ||
| 895 | free_unmap_vmap_area_addr(addr); | ||
| 896 | } | ||
| 897 | EXPORT_SYMBOL(vm_unmap_ram); | ||
| 898 | |||
| 899 | /** | ||
| 900 | * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) | ||
| 901 | * @pages: an array of pointers to the pages to be mapped | ||
| 902 | * @count: number of pages | ||
| 903 | * @node: prefer to allocate data structures on this node | ||
| 904 | * @prot: memory protection to use. PAGE_KERNEL for regular RAM | ||
| 905 | * | ||
| 906 | * Returns: a pointer to the address that has been mapped, or %NULL on failure | ||
| 907 | */ | ||
| 908 | void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) | ||
| 909 | { | ||
| 910 | unsigned long size = count << PAGE_SHIFT; | ||
| 215 | unsigned long addr; | 911 | unsigned long addr; |
| 912 | void *mem; | ||
| 913 | |||
| 914 | if (likely(count <= VMAP_MAX_ALLOC)) { | ||
| 915 | mem = vb_alloc(size, GFP_KERNEL); | ||
| 916 | if (IS_ERR(mem)) | ||
| 917 | return NULL; | ||
| 918 | addr = (unsigned long)mem; | ||
| 919 | } else { | ||
| 920 | struct vmap_area *va; | ||
| 921 | va = alloc_vmap_area(size, PAGE_SIZE, | ||
| 922 | VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); | ||
| 923 | if (IS_ERR(va)) | ||
| 924 | return NULL; | ||
| 925 | |||
| 926 | addr = va->va_start; | ||
| 927 | mem = (void *)addr; | ||
| 928 | } | ||
| 929 | if (vmap_page_range(addr, addr + size, prot, pages) < 0) { | ||
| 930 | vm_unmap_ram(mem, count); | ||
| 931 | return NULL; | ||
| 932 | } | ||
| 933 | return mem; | ||
| 934 | } | ||
| 935 | EXPORT_SYMBOL(vm_map_ram); | ||
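A hedged usage sketch of the new pair (hypothetical driver-style code, not taken from this patch): allocate a handful of pages, map them contiguously with vm_map_ram(), touch the mapping, then tear it down with vm_unmap_ram() and free the pages. PAGE_KERNEL is used per the kernel-doc above; passing -1 for node means no NUMA preference.

/* Hypothetical illustration of vm_map_ram()/vm_unmap_ram(). */
#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

#define DEMO_NPAGES 8

static int demo_vm_map_ram(void)
{
	struct page *pages[DEMO_NPAGES];
	void *va = NULL;
	int i;

	for (i = 0; i < DEMO_NPAGES; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto out_free;
	}

	/* one contiguous kernel mapping of the (possibly scattered) pages */
	va = vm_map_ram(pages, DEMO_NPAGES, -1, PAGE_KERNEL);
	if (!va)
		goto out_free;

	memset(va, 0, DEMO_NPAGES * PAGE_SIZE);

	/* count must match the count passed to vm_map_ram() */
	vm_unmap_ram(va, DEMO_NPAGES);
	i = DEMO_NPAGES;
out_free:
	while (i--)
		__free_page(pages[i]);
	return va ? 0 : -ENOMEM;
}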
| 936 | |||
| 937 | void __init vmalloc_init(void) | ||
| 938 | { | ||
| 939 | int i; | ||
| 940 | |||
| 941 | for_each_possible_cpu(i) { | ||
| 942 | struct vmap_block_queue *vbq; | ||
| 943 | |||
| 944 | vbq = &per_cpu(vmap_block_queue, i); | ||
| 945 | spin_lock_init(&vbq->lock); | ||
| 946 | INIT_LIST_HEAD(&vbq->free); | ||
| 947 | INIT_LIST_HEAD(&vbq->dirty); | ||
| 948 | vbq->nr_dirty = 0; | ||
| 949 | } | ||
| 950 | |||
| 951 | vmap_initialized = true; | ||
| 952 | } | ||
| 953 | |||
| 954 | void unmap_kernel_range(unsigned long addr, unsigned long size) | ||
| 955 | { | ||
| 956 | unsigned long end = addr + size; | ||
| 957 | vunmap_page_range(addr, end); | ||
| 958 | flush_tlb_kernel_range(addr, end); | ||
| 959 | } | ||
| 960 | |||
| 961 | int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) | ||
| 962 | { | ||
| 963 | unsigned long addr = (unsigned long)area->addr; | ||
| 964 | unsigned long end = addr + area->size - PAGE_SIZE; | ||
| 965 | int err; | ||
| 966 | |||
| 967 | err = vmap_page_range(addr, end, prot, *pages); | ||
| 968 | if (err > 0) { | ||
| 969 | *pages += err; | ||
| 970 | err = 0; | ||
| 971 | } | ||
| 972 | |||
| 973 | return err; | ||
| 974 | } | ||
| 975 | EXPORT_SYMBOL_GPL(map_vm_area); | ||
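For comparison with the per-page interface above, this is roughly the pattern map_vm_area() serves (a hypothetical sketch, close to what vmap() itself does): reserve a vm_struct with get_vm_area(), then install the pages into it. Note that map_vm_area() advances the struct page ** it is given past the pages it consumed.

/* Hypothetical caller of get_vm_area() + map_vm_area(); illustrative only. */
#include <linux/mm.h>
#include <linux/vmalloc.h>

static void *demo_map_vm_area(struct page **pages, unsigned int count)
{
	struct vm_struct *area;
	struct page **p = pages;	/* map_vm_area() advances this */

	/* reserves count pages of KVA plus the one-page guard hole */
	area = get_vm_area(count << PAGE_SHIFT, VM_MAP);
	if (!area)
		return NULL;

	if (map_vm_area(area, PAGE_KERNEL, &p)) {
		vunmap(area->addr);	/* releases the vm_struct again */
		return NULL;
	}
	/* on success, p == pages + count */
	return area->addr;
}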
| 976 | |||
| 977 | /*** Old vmalloc interfaces ***/ | ||
| 978 | DEFINE_RWLOCK(vmlist_lock); | ||
| 979 | struct vm_struct *vmlist; | ||
| 980 | |||
| 981 | static struct vm_struct *__get_vm_area_node(unsigned long size, | ||
| 982 | unsigned long flags, unsigned long start, unsigned long end, | ||
| 983 | int node, gfp_t gfp_mask, void *caller) | ||
| 984 | { | ||
| 985 | struct vmap_area *va; | ||
| 986 | struct vm_struct *area; | ||
| 987 | struct vm_struct *tmp, **p; | ||
| 988 | unsigned long align = 1; | ||
| 216 | 989 | ||
| 217 | BUG_ON(in_interrupt()); | 990 | BUG_ON(in_interrupt()); |
| 218 | if (flags & VM_IOREMAP) { | 991 | if (flags & VM_IOREMAP) { |
| @@ -225,13 +998,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
| 225 | 998 | ||
| 226 | align = 1ul << bit; | 999 | align = 1ul << bit; |
| 227 | } | 1000 | } |
| 228 | addr = ALIGN(start, align); | 1001 | |
| 229 | size = PAGE_ALIGN(size); | 1002 | size = PAGE_ALIGN(size); |
| 230 | if (unlikely(!size)) | 1003 | if (unlikely(!size)) |
| 231 | return NULL; | 1004 | return NULL; |
| 232 | 1005 | ||
| 233 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); | 1006 | area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); |
| 234 | |||
| 235 | if (unlikely(!area)) | 1007 | if (unlikely(!area)) |
| 236 | return NULL; | 1008 | return NULL; |
| 237 | 1009 | ||
| @@ -240,48 +1012,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, | |||
| 240 | */ | 1012 | */ |
| 241 | size += PAGE_SIZE; | 1013 | size += PAGE_SIZE; |
| 242 | 1014 | ||
| 243 | write_lock(&vmlist_lock); | 1015 | va = alloc_vmap_area(size, align, start, end, node, gfp_mask); |
| 244 | for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { | 1016 | if (IS_ERR(va)) { |
| 245 | if ((unsigned long)tmp->addr < addr) { | 1017 | kfree(area); |
| 246 | if((unsigned long)tmp->addr + tmp->size >= addr) | 1018 | return NULL; |
| 247 | addr = ALIGN(tmp->size + | ||
| 248 | (unsigned long)tmp->addr, align); | ||
| 249 | continue; | ||
| 250 | } | ||
| 251 | if ((size + addr) < addr) | ||
| 252 | goto out; | ||
| 253 | if (size + addr <= (unsigned long)tmp->addr) | ||
| 254 | goto found; | ||
| 255 | addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); | ||
| 256 | if (addr > end - size) | ||
| 257 | goto out; | ||
| 258 | } | 1019 | } |
| 259 | if ((size + addr) < addr) | ||
| 260 | goto out; | ||
| 261 | if (addr > end - size) | ||
| 262 | goto out; | ||
| 263 | |||
| 264 | found: | ||
| 265 | area->next = *p; | ||
| 266 | *p = area; | ||
| 267 | 1020 | ||
| 268 | area->flags = flags; | 1021 | area->flags = flags; |
| 269 | area->addr = (void *)addr; | 1022 | area->addr = (void *)va->va_start; |
| 270 | area->size = size; | 1023 | area->size = size; |
| 271 | area->pages = NULL; | 1024 | area->pages = NULL; |
| 272 | area->nr_pages = 0; | 1025 | area->nr_pages = 0; |
| 273 | area->phys_addr = 0; | 1026 | area->phys_addr = 0; |
| 274 | area->caller = caller; | 1027 | area->caller = caller; |
| 1028 | va->private = area; | ||
| 1029 | va->flags |= VM_VM_AREA; | ||
| 1030 | |||
| 1031 | write_lock(&vmlist_lock); | ||
| 1032 | for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { | ||
| 1033 | if (tmp->addr >= area->addr) | ||
| 1034 | break; | ||
| 1035 | } | ||
| 1036 | area->next = *p; | ||
| 1037 | *p = area; | ||
| 275 | write_unlock(&vmlist_lock); | 1038 | write_unlock(&vmlist_lock); |
| 276 | 1039 | ||
| 277 | return area; | 1040 | return area; |
| 278 | |||
| 279 | out: | ||
| 280 | write_unlock(&vmlist_lock); | ||
| 281 | kfree(area); | ||
| 282 | if (printk_ratelimit()) | ||
| 283 | printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); | ||
| 284 | return NULL; | ||
| 285 | } | 1041 | } |
| 286 | 1042 | ||
| 287 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, | 1043 | struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, |
| @@ -321,39 +1077,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, | |||
| 321 | gfp_mask, __builtin_return_address(0)); | 1077 | gfp_mask, __builtin_return_address(0)); |
| 322 | } | 1078 | } |
| 323 | 1079 | ||
| 324 | /* Caller must hold vmlist_lock */ | 1080 | static struct vm_struct *find_vm_area(const void *addr) |
| 325 | static struct vm_struct *__find_vm_area(const void *addr) | ||
| 326 | { | 1081 | { |
| 327 | struct vm_struct *tmp; | 1082 | struct vmap_area *va; |
| 328 | |||
| 329 | for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { | ||
| 330 | if (tmp->addr == addr) | ||
| 331 | break; | ||
| 332 | } | ||
| 333 | |||
| 334 | return tmp; | ||
| 335 | } | ||
| 336 | 1083 | ||
| 337 | /* Caller must hold vmlist_lock */ | 1084 | va = find_vmap_area((unsigned long)addr); |
| 338 | static struct vm_struct *__remove_vm_area(const void *addr) | 1085 | if (va && va->flags & VM_VM_AREA) |
| 339 | { | 1086 | return va->private; |
| 340 | struct vm_struct **p, *tmp; | ||
| 341 | 1087 | ||
| 342 | for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { | ||
| 343 | if (tmp->addr == addr) | ||
| 344 | goto found; | ||
| 345 | } | ||
| 346 | return NULL; | 1088 | return NULL; |
| 347 | |||
| 348 | found: | ||
| 349 | unmap_vm_area(tmp); | ||
| 350 | *p = tmp->next; | ||
| 351 | |||
| 352 | /* | ||
| 353 | * Remove the guard page. | ||
| 354 | */ | ||
| 355 | tmp->size -= PAGE_SIZE; | ||
| 356 | return tmp; | ||
| 357 | } | 1089 | } |
| 358 | 1090 | ||
| 359 | /** | 1091 | /** |
| @@ -366,11 +1098,24 @@ found: | |||
| 366 | */ | 1098 | */ |
| 367 | struct vm_struct *remove_vm_area(const void *addr) | 1099 | struct vm_struct *remove_vm_area(const void *addr) |
| 368 | { | 1100 | { |
| 369 | struct vm_struct *v; | 1101 | struct vmap_area *va; |
| 370 | write_lock(&vmlist_lock); | 1102 | |
| 371 | v = __remove_vm_area(addr); | 1103 | va = find_vmap_area((unsigned long)addr); |
| 372 | write_unlock(&vmlist_lock); | 1104 | if (va && va->flags & VM_VM_AREA) { |
| 373 | return v; | 1105 | struct vm_struct *vm = va->private; |
| 1106 | struct vm_struct *tmp, **p; | ||
| 1107 | free_unmap_vmap_area(va); | ||
| 1108 | vm->size -= PAGE_SIZE; | ||
| 1109 | |||
| 1110 | write_lock(&vmlist_lock); | ||
| 1111 | for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) | ||
| 1112 | ; | ||
| 1113 | *p = tmp->next; | ||
| 1114 | write_unlock(&vmlist_lock); | ||
| 1115 | |||
| 1116 | return vm; | ||
| 1117 | } | ||
| 1118 | return NULL; | ||
| 374 | } | 1119 | } |
| 375 | 1120 | ||
| 376 | static void __vunmap(const void *addr, int deallocate_pages) | 1121 | static void __vunmap(const void *addr, int deallocate_pages) |
| @@ -480,6 +1225,8 @@ void *vmap(struct page **pages, unsigned int count, | |||
| 480 | } | 1225 | } |
| 481 | EXPORT_SYMBOL(vmap); | 1226 | EXPORT_SYMBOL(vmap); |
| 482 | 1227 | ||
| 1228 | static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, | ||
| 1229 | int node, void *caller); | ||
| 483 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, | 1230 | static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, |
| 484 | pgprot_t prot, int node, void *caller) | 1231 | pgprot_t prot, int node, void *caller) |
| 485 | { | 1232 | { |
| @@ -606,10 +1353,8 @@ void *vmalloc_user(unsigned long size) | |||
| 606 | 1353 | ||
| 607 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); | 1354 | ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); |
| 608 | if (ret) { | 1355 | if (ret) { |
| 609 | write_lock(&vmlist_lock); | 1356 | area = find_vm_area(ret); |
| 610 | area = __find_vm_area(ret); | ||
| 611 | area->flags |= VM_USERMAP; | 1357 | area->flags |= VM_USERMAP; |
| 612 | write_unlock(&vmlist_lock); | ||
| 613 | } | 1358 | } |
| 614 | return ret; | 1359 | return ret; |
| 615 | } | 1360 | } |
| @@ -689,10 +1434,8 @@ void *vmalloc_32_user(unsigned long size) | |||
| 689 | 1434 | ||
| 690 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); | 1435 | ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); |
| 691 | if (ret) { | 1436 | if (ret) { |
| 692 | write_lock(&vmlist_lock); | 1437 | area = find_vm_area(ret); |
| 693 | area = __find_vm_area(ret); | ||
| 694 | area->flags |= VM_USERMAP; | 1438 | area->flags |= VM_USERMAP; |
| 695 | write_unlock(&vmlist_lock); | ||
| 696 | } | 1439 | } |
| 697 | return ret; | 1440 | return ret; |
| 698 | } | 1441 | } |
| @@ -793,26 +1536,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
| 793 | struct vm_struct *area; | 1536 | struct vm_struct *area; |
| 794 | unsigned long uaddr = vma->vm_start; | 1537 | unsigned long uaddr = vma->vm_start; |
| 795 | unsigned long usize = vma->vm_end - vma->vm_start; | 1538 | unsigned long usize = vma->vm_end - vma->vm_start; |
| 796 | int ret; | ||
| 797 | 1539 | ||
| 798 | if ((PAGE_SIZE-1) & (unsigned long)addr) | 1540 | if ((PAGE_SIZE-1) & (unsigned long)addr) |
| 799 | return -EINVAL; | 1541 | return -EINVAL; |
| 800 | 1542 | ||
| 801 | read_lock(&vmlist_lock); | 1543 | area = find_vm_area(addr); |
| 802 | area = __find_vm_area(addr); | ||
| 803 | if (!area) | 1544 | if (!area) |
| 804 | goto out_einval_locked; | 1545 | return -EINVAL; |
| 805 | 1546 | ||
| 806 | if (!(area->flags & VM_USERMAP)) | 1547 | if (!(area->flags & VM_USERMAP)) |
| 807 | goto out_einval_locked; | 1548 | return -EINVAL; |
| 808 | 1549 | ||
| 809 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) | 1550 | if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) |
| 810 | goto out_einval_locked; | 1551 | return -EINVAL; |
| 811 | read_unlock(&vmlist_lock); | ||
| 812 | 1552 | ||
| 813 | addr += pgoff << PAGE_SHIFT; | 1553 | addr += pgoff << PAGE_SHIFT; |
| 814 | do { | 1554 | do { |
| 815 | struct page *page = vmalloc_to_page(addr); | 1555 | struct page *page = vmalloc_to_page(addr); |
| 1556 | int ret; | ||
| 1557 | |||
| 816 | ret = vm_insert_page(vma, uaddr, page); | 1558 | ret = vm_insert_page(vma, uaddr, page); |
| 817 | if (ret) | 1559 | if (ret) |
| 818 | return ret; | 1560 | return ret; |
| @@ -825,11 +1567,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, | |||
| 825 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ | 1567 | /* Prevent "things" like memory migration? VM_flags need a cleanup... */ |
| 826 | vma->vm_flags |= VM_RESERVED; | 1568 | vma->vm_flags |= VM_RESERVED; |
| 827 | 1569 | ||
| 828 | return ret; | 1570 | return 0; |
| 829 | |||
| 830 | out_einval_locked: | ||
| 831 | read_unlock(&vmlist_lock); | ||
| 832 | return -EINVAL; | ||
| 833 | } | 1571 | } |
| 834 | EXPORT_SYMBOL(remap_vmalloc_range); | 1572 | EXPORT_SYMBOL(remap_vmalloc_range); |
| 835 | 1573 | ||
| @@ -989,11 +1727,41 @@ static int s_show(struct seq_file *m, void *p) | |||
| 989 | return 0; | 1727 | return 0; |
| 990 | } | 1728 | } |
| 991 | 1729 | ||
| 992 | const struct seq_operations vmalloc_op = { | 1730 | static const struct seq_operations vmalloc_op = { |
| 993 | .start = s_start, | 1731 | .start = s_start, |
| 994 | .next = s_next, | 1732 | .next = s_next, |
| 995 | .stop = s_stop, | 1733 | .stop = s_stop, |
| 996 | .show = s_show, | 1734 | .show = s_show, |
| 997 | }; | 1735 | }; |
| 1736 | |||
| 1737 | static int vmalloc_open(struct inode *inode, struct file *file) | ||
| 1738 | { | ||
| 1739 | unsigned int *ptr = NULL; | ||
| 1740 | int ret; | ||
| 1741 | |||
| 1742 | if (NUMA_BUILD) | ||
| 1743 | ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); | ||
| 1744 | ret = seq_open(file, &vmalloc_op); | ||
| 1745 | if (!ret) { | ||
| 1746 | struct seq_file *m = file->private_data; | ||
| 1747 | m->private = ptr; | ||
| 1748 | } else | ||
| 1749 | kfree(ptr); | ||
| 1750 | return ret; | ||
| 1751 | } | ||
| 1752 | |||
| 1753 | static const struct file_operations proc_vmalloc_operations = { | ||
| 1754 | .open = vmalloc_open, | ||
| 1755 | .read = seq_read, | ||
| 1756 | .llseek = seq_lseek, | ||
| 1757 | .release = seq_release_private, | ||
| 1758 | }; | ||
| 1759 | |||
| 1760 | static int __init proc_vmalloc_init(void) | ||
| 1761 | { | ||
| 1762 | proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); | ||
| 1763 | return 0; | ||
| 1764 | } | ||
| 1765 | module_init(proc_vmalloc_init); | ||
| 998 | #endif | 1766 | #endif |
| 999 | 1767 | ||
