Diffstat (limited to 'mm/vmalloc.c')
-rw-r--r--  mm/vmalloc.c  609
1 file changed, 532 insertions, 77 deletions
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index f8189a4b3e13..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/highmem.h> 14#include <linux/highmem.h>
15#include <linux/sched.h>
15#include <linux/slab.h> 16#include <linux/slab.h>
16#include <linux/spinlock.h> 17#include <linux/spinlock.h>
17#include <linux/interrupt.h> 18#include <linux/interrupt.h>
@@ -25,10 +26,10 @@
25#include <linux/rcupdate.h> 26#include <linux/rcupdate.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/kmemleak.h> 28#include <linux/kmemleak.h>
28
29#include <asm/atomic.h> 29#include <asm/atomic.h>
30#include <asm/uaccess.h> 30#include <asm/uaccess.h>
31#include <asm/tlbflush.h> 31#include <asm/tlbflush.h>
32#include <asm/shmparam.h>
32 33
33 34
34/*** Page table manipulation functions ***/ 35/*** Page table manipulation functions ***/
@@ -168,11 +169,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end,
168 next = pgd_addr_end(addr, end); 169 next = pgd_addr_end(addr, end);
169 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); 170 err = vmap_pud_range(pgd, addr, next, prot, pages, &nr);
170 if (err) 171 if (err)
171 break; 172 return err;
172 } while (pgd++, addr = next, addr != end); 173 } while (pgd++, addr = next, addr != end);
173 174
174 if (unlikely(err))
175 return err;
176 return nr; 175 return nr;
177} 176}
178 177
@@ -186,7 +185,7 @@ static int vmap_page_range(unsigned long start, unsigned long end,
186 return ret; 185 return ret;
187} 186}
188 187
189static inline int is_vmalloc_or_module_addr(const void *x) 188int is_vmalloc_or_module_addr(const void *x)
190{ 189{
191 /* 190 /*
192 * ARM, x86-64 and sparc64 put modules in a special place, 191 * ARM, x86-64 and sparc64 put modules in a special place,
@@ -265,6 +264,7 @@ struct vmap_area {
265static DEFINE_SPINLOCK(vmap_area_lock); 264static DEFINE_SPINLOCK(vmap_area_lock);
266static struct rb_root vmap_area_root = RB_ROOT; 265static struct rb_root vmap_area_root = RB_ROOT;
267static LIST_HEAD(vmap_area_list); 266static LIST_HEAD(vmap_area_list);
267static unsigned long vmap_area_pcpu_hole;
268 268
269static struct vmap_area *__find_vmap_area(unsigned long addr) 269static struct vmap_area *__find_vmap_area(unsigned long addr)
270{ 270{
@@ -431,6 +431,15 @@ static void __free_vmap_area(struct vmap_area *va)
431 RB_CLEAR_NODE(&va->rb_node); 431 RB_CLEAR_NODE(&va->rb_node);
432 list_del_rcu(&va->list); 432 list_del_rcu(&va->list);
433 433
434 /*
435 * Track the highest possible candidate for pcpu area
436 * allocation. Areas outside of the vmalloc area can be returned
437 * here too; consider only end addresses which fall inside
438 * the vmalloc area proper.
439 */
440 if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
441 vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
442
434 call_rcu(&va->rcu_head, rcu_free_va); 443 call_rcu(&va->rcu_head, rcu_free_va);
435} 444}
436 445
@@ -1038,6 +1047,9 @@ void __init vmalloc_init(void)
1038 va->va_end = va->va_start + tmp->size; 1047 va->va_end = va->va_start + tmp->size;
1039 __insert_vmap_area(va); 1048 __insert_vmap_area(va);
1040 } 1049 }
1050
1051 vmap_area_pcpu_hole = VMALLOC_END;
1052
1041 vmap_initialized = true; 1053 vmap_initialized = true;
1042} 1054}
1043 1055
@@ -1122,14 +1134,34 @@ EXPORT_SYMBOL_GPL(map_vm_area);
1122DEFINE_RWLOCK(vmlist_lock); 1134DEFINE_RWLOCK(vmlist_lock);
1123struct vm_struct *vmlist; 1135struct vm_struct *vmlist;
1124 1136
1137static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
1138 unsigned long flags, void *caller)
1139{
1140 struct vm_struct *tmp, **p;
1141
1142 vm->flags = flags;
1143 vm->addr = (void *)va->va_start;
1144 vm->size = va->va_end - va->va_start;
1145 vm->caller = caller;
1146 va->private = vm;
1147 va->flags |= VM_VM_AREA;
1148
1149 write_lock(&vmlist_lock);
1150 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1151 if (tmp->addr >= vm->addr)
1152 break;
1153 }
1154 vm->next = *p;
1155 *p = vm;
1156 write_unlock(&vmlist_lock);
1157}
1158
1125static struct vm_struct *__get_vm_area_node(unsigned long size, 1159static struct vm_struct *__get_vm_area_node(unsigned long size,
1126 unsigned long flags, unsigned long start, unsigned long end, 1160 unsigned long align, unsigned long flags, unsigned long start,
1127 int node, gfp_t gfp_mask, void *caller) 1161 unsigned long end, int node, gfp_t gfp_mask, void *caller)
1128{ 1162{
1129 static struct vmap_area *va; 1163 static struct vmap_area *va;
1130 struct vm_struct *area; 1164 struct vm_struct *area;
1131 struct vm_struct *tmp, **p;
1132 unsigned long align = 1;
1133 1165
1134 BUG_ON(in_interrupt()); 1166 BUG_ON(in_interrupt());
1135 if (flags & VM_IOREMAP) { 1167 if (flags & VM_IOREMAP) {
@@ -1147,7 +1179,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1147 if (unlikely(!size)) 1179 if (unlikely(!size))
1148 return NULL; 1180 return NULL;
1149 1181
1150 area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); 1182 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1151 if (unlikely(!area)) 1183 if (unlikely(!area))
1152 return NULL; 1184 return NULL;
1153 1185
@@ -1162,32 +1194,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
1162 return NULL; 1194 return NULL;
1163 } 1195 }
1164 1196
1165 area->flags = flags; 1197 insert_vmalloc_vm(area, va, flags, caller);
1166 area->addr = (void *)va->va_start;
1167 area->size = size;
1168 area->pages = NULL;
1169 area->nr_pages = 0;
1170 area->phys_addr = 0;
1171 area->caller = caller;
1172 va->private = area;
1173 va->flags |= VM_VM_AREA;
1174
1175 write_lock(&vmlist_lock);
1176 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
1177 if (tmp->addr >= area->addr)
1178 break;
1179 }
1180 area->next = *p;
1181 *p = area;
1182 write_unlock(&vmlist_lock);
1183
1184 return area; 1198 return area;
1185} 1199}
1186 1200
1187struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, 1201struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
1188 unsigned long start, unsigned long end) 1202 unsigned long start, unsigned long end)
1189{ 1203{
1190 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1204 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1191 __builtin_return_address(0)); 1205 __builtin_return_address(0));
1192} 1206}
1193EXPORT_SYMBOL_GPL(__get_vm_area); 1207EXPORT_SYMBOL_GPL(__get_vm_area);
@@ -1196,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1196 unsigned long start, unsigned long end, 1210 unsigned long start, unsigned long end,
1197 void *caller) 1211 void *caller)
1198{ 1212{
1199 return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, 1213 return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL,
1200 caller); 1214 caller);
1201} 1215}
1202 1216
@@ -1211,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
1211 */ 1225 */
1212struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) 1226struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
1213{ 1227{
1214 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1228 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1215 -1, GFP_KERNEL, __builtin_return_address(0)); 1229 -1, GFP_KERNEL, __builtin_return_address(0));
1216} 1230}
1217 1231
1218struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, 1232struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
1219 void *caller) 1233 void *caller)
1220{ 1234{
1221 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, 1235 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1222 -1, GFP_KERNEL, caller); 1236 -1, GFP_KERNEL, caller);
1223} 1237}
1224 1238
1225struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, 1239struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags,
1226 int node, gfp_t gfp_mask) 1240 int node, gfp_t gfp_mask)
1227{ 1241{
1228 return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, 1242 return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END,
1229 gfp_mask, __builtin_return_address(0)); 1243 node, gfp_mask, __builtin_return_address(0));
1230} 1244}
1231 1245
1232static struct vm_struct *find_vm_area(const void *addr) 1246static struct vm_struct *find_vm_area(const void *addr)
@@ -1256,17 +1270,21 @@ struct vm_struct *remove_vm_area(const void *addr)
1256 if (va && va->flags & VM_VM_AREA) { 1270 if (va && va->flags & VM_VM_AREA) {
1257 struct vm_struct *vm = va->private; 1271 struct vm_struct *vm = va->private;
1258 struct vm_struct *tmp, **p; 1272 struct vm_struct *tmp, **p;
1259 1273 /*
1260 vmap_debug_free_range(va->va_start, va->va_end); 1274 * remove from list and disallow access to this vm_struct
1261 free_unmap_vmap_area(va); 1275 * before unmap. (address range conflicts are
1262 vm->size -= PAGE_SIZE; 1276 * handled by vmap.)
1263 1277 */
1264 write_lock(&vmlist_lock); 1278 write_lock(&vmlist_lock);
1265 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) 1279 for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
1266 ; 1280 ;
1267 *p = tmp->next; 1281 *p = tmp->next;
1268 write_unlock(&vmlist_lock); 1282 write_unlock(&vmlist_lock);
1269 1283
1284 vmap_debug_free_range(va->va_start, va->va_end);
1285 free_unmap_vmap_area(va);
1286 vm->size -= PAGE_SIZE;
1287
1270 return vm; 1288 return vm;
1271 } 1289 }
1272 return NULL; 1290 return NULL;
@@ -1368,7 +1386,7 @@ void *vmap(struct page **pages, unsigned int count,
1368 1386
1369 might_sleep(); 1387 might_sleep();
1370 1388
1371 if (count > num_physpages) 1389 if (count > totalram_pages)
1372 return NULL; 1390 return NULL;
1373 1391
1374 area = get_vm_area_caller((count << PAGE_SHIFT), flags, 1392 area = get_vm_area_caller((count << PAGE_SHIFT), flags,
@@ -1385,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count,
1385} 1403}
1386EXPORT_SYMBOL(vmap); 1404EXPORT_SYMBOL(vmap);
1387 1405
1388static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1406static void *__vmalloc_node(unsigned long size, unsigned long align,
1407 gfp_t gfp_mask, pgprot_t prot,
1389 int node, void *caller); 1408 int node, void *caller);
1390static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, 1409static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1391 pgprot_t prot, int node, void *caller) 1410 pgprot_t prot, int node, void *caller)
@@ -1399,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
1399 area->nr_pages = nr_pages; 1418 area->nr_pages = nr_pages;
1400 /* Please note that the recursion is strictly bounded. */ 1419 /* Please note that the recursion is strictly bounded. */
1401 if (array_size > PAGE_SIZE) { 1420 if (array_size > PAGE_SIZE) {
1402 pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, 1421 pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO,
1403 PAGE_KERNEL, node, caller); 1422 PAGE_KERNEL, node, caller);
1404 area->flags |= VM_VPAGES; 1423 area->flags |= VM_VPAGES;
1405 } else { 1424 } else {
@@ -1458,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1458/** 1477/**
1459 * __vmalloc_node - allocate virtually contiguous memory 1478 * __vmalloc_node - allocate virtually contiguous memory
1460 * @size: allocation size 1479 * @size: allocation size
1480 * @align: desired alignment
1461 * @gfp_mask: flags for the page level allocator 1481 * @gfp_mask: flags for the page level allocator
1462 * @prot: protection mask for the allocated pages 1482 * @prot: protection mask for the allocated pages
1463 * @node: node to use for allocation or -1 1483 * @node: node to use for allocation or -1
@@ -1467,19 +1487,20 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot)
1467 * allocator with @gfp_mask flags. Map them into contiguous 1487 * allocator with @gfp_mask flags. Map them into contiguous
1468 * kernel virtual space, using a pagetable protection of @prot. 1488 * kernel virtual space, using a pagetable protection of @prot.
1469 */ 1489 */
1470static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, 1490static void *__vmalloc_node(unsigned long size, unsigned long align,
1471 int node, void *caller) 1491 gfp_t gfp_mask, pgprot_t prot,
1492 int node, void *caller)
1472{ 1493{
1473 struct vm_struct *area; 1494 struct vm_struct *area;
1474 void *addr; 1495 void *addr;
1475 unsigned long real_size = size; 1496 unsigned long real_size = size;
1476 1497
1477 size = PAGE_ALIGN(size); 1498 size = PAGE_ALIGN(size);
1478 if (!size || (size >> PAGE_SHIFT) > num_physpages) 1499 if (!size || (size >> PAGE_SHIFT) > totalram_pages)
1479 return NULL; 1500 return NULL;
1480 1501
1481 area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, 1502 area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START,
1482 node, gfp_mask, caller); 1503 VMALLOC_END, node, gfp_mask, caller);
1483 1504
1484 if (!area) 1505 if (!area)
1485 return NULL; 1506 return NULL;
@@ -1498,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot,
1498 1519
1499void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) 1520void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
1500{ 1521{
1501 return __vmalloc_node(size, gfp_mask, prot, -1, 1522 return __vmalloc_node(size, 1, gfp_mask, prot, -1,
1502 __builtin_return_address(0)); 1523 __builtin_return_address(0));
1503} 1524}
1504EXPORT_SYMBOL(__vmalloc); 1525EXPORT_SYMBOL(__vmalloc);
@@ -1514,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc);
1514 */ 1535 */
1515void *vmalloc(unsigned long size) 1536void *vmalloc(unsigned long size)
1516{ 1537{
1517 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1538 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1518 -1, __builtin_return_address(0)); 1539 -1, __builtin_return_address(0));
1519} 1540}
1520EXPORT_SYMBOL(vmalloc); 1541EXPORT_SYMBOL(vmalloc);
@@ -1531,7 +1552,8 @@ void *vmalloc_user(unsigned long size)
1531 struct vm_struct *area; 1552 struct vm_struct *area;
1532 void *ret; 1553 void *ret;
1533 1554
1534 ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, 1555 ret = __vmalloc_node(size, SHMLBA,
1556 GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
1535 PAGE_KERNEL, -1, __builtin_return_address(0)); 1557 PAGE_KERNEL, -1, __builtin_return_address(0));
1536 if (ret) { 1558 if (ret) {
1537 area = find_vm_area(ret); 1559 area = find_vm_area(ret);
@@ -1554,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user);
1554 */ 1576 */
1555void *vmalloc_node(unsigned long size, int node) 1577void *vmalloc_node(unsigned long size, int node)
1556{ 1578{
1557 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, 1579 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL,
1558 node, __builtin_return_address(0)); 1580 node, __builtin_return_address(0));
1559} 1581}
1560EXPORT_SYMBOL(vmalloc_node); 1582EXPORT_SYMBOL(vmalloc_node);
@@ -1577,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node);
1577 1599
1578void *vmalloc_exec(unsigned long size) 1600void *vmalloc_exec(unsigned long size)
1579{ 1601{
1580 return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, 1602 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
1581 -1, __builtin_return_address(0)); 1603 -1, __builtin_return_address(0));
1582} 1604}
1583 1605
@@ -1598,7 +1620,7 @@ void *vmalloc_exec(unsigned long size)
1598 */ 1620 */
1599void *vmalloc_32(unsigned long size) 1621void *vmalloc_32(unsigned long size)
1600{ 1622{
1601 return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, 1623 return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
1602 -1, __builtin_return_address(0)); 1624 -1, __builtin_return_address(0));
1603} 1625}
1604EXPORT_SYMBOL(vmalloc_32); 1626EXPORT_SYMBOL(vmalloc_32);
@@ -1615,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size)
1615 struct vm_struct *area; 1637 struct vm_struct *area;
1616 void *ret; 1638 void *ret;
1617 1639
1618 ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, 1640 ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
1619 -1, __builtin_return_address(0)); 1641 -1, __builtin_return_address(0));
1620 if (ret) { 1642 if (ret) {
1621 area = find_vm_area(ret); 1643 area = find_vm_area(ret);
@@ -1625,10 +1647,120 @@ void *vmalloc_32_user(unsigned long size)
1625} 1647}
1626EXPORT_SYMBOL(vmalloc_32_user); 1648EXPORT_SYMBOL(vmalloc_32_user);
1627 1649
1650/*
1651 * small helper routine, copy contents to buf from addr.
1652 * If the page is not present, fill zero.
1653 */
1654
1655static int aligned_vread(char *buf, char *addr, unsigned long count)
1656{
1657 struct page *p;
1658 int copied = 0;
1659
1660 while (count) {
1661 unsigned long offset, length;
1662
1663 offset = (unsigned long)addr & ~PAGE_MASK;
1664 length = PAGE_SIZE - offset;
1665 if (length > count)
1666 length = count;
1667 p = vmalloc_to_page(addr);
1668 /*
1669 * To do safe access to this _mapped_ area, we need
1670 * locking. But adding a lock here means that we need to add
1671 * overhead to vmalloc()/vfree() calls for this _debug_
1672 * interface, rarely used. Instead of that, we'll use
1673 * kmap() and get small overhead in this access function.
1674 */
1675 if (p) {
1676 /*
1677 * we can expect USER0 is not used (see vread/vwrite's
1678 * function description)
1679 */
1680 void *map = kmap_atomic(p, KM_USER0);
1681 memcpy(buf, map + offset, length);
1682 kunmap_atomic(map, KM_USER0);
1683 } else
1684 memset(buf, 0, length);
1685
1686 addr += length;
1687 buf += length;
1688 copied += length;
1689 count -= length;
1690 }
1691 return copied;
1692}
1693
1694static int aligned_vwrite(char *buf, char *addr, unsigned long count)
1695{
1696 struct page *p;
1697 int copied = 0;
1698
1699 while (count) {
1700 unsigned long offset, length;
1701
1702 offset = (unsigned long)addr & ~PAGE_MASK;
1703 length = PAGE_SIZE - offset;
1704 if (length > count)
1705 length = count;
1706 p = vmalloc_to_page(addr);
1707 /*
1708 * To do safe access to this _mapped_ area, we need
1709 * locking. But adding a lock here means that we need to add
1710 * overhead to vmalloc()/vfree() calls for this _debug_
1711 * interface, rarely used. Instead of that, we'll use
1712 * kmap() and get small overhead in this access function.
1713 */
1714 if (p) {
1715 /*
1716 * we can expect USER0 is not used (see vread/vwrite's
1717 * function description)
1718 */
1719 void *map = kmap_atomic(p, KM_USER0);
1720 memcpy(map + offset, buf, length);
1721 kunmap_atomic(map, KM_USER0);
1722 }
1723 addr += length;
1724 buf += length;
1725 copied += length;
1726 count -= length;
1727 }
1728 return copied;
1729}
1730
1731/**
1732 * vread() - read vmalloc area in a safe way.
1733 * @buf: buffer for reading data
1734 * @addr: vm address.
1735 * @count: number of bytes to be read.
1736 *
1737 * Returns # of bytes by which addr and buf should be increased
1738 * (same number as @count). Returns 0 if [addr...addr+count) doesn't
1739 * include any intersection with a live vmalloc area.
1740 *
1741 * This function checks that addr is a valid vmalloc'ed area, and
1742 * copies data from that area to a given buffer. If the given memory range
1743 * of [addr...addr+count) includes some valid address, data is copied to
1744 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
1745 * IOREMAP areas are treated as memory holes and no copy is done.
1746 *
1747 * If [addr...addr+count) doesn't include any intersection with a live
1748 * vm_struct area, returns 0.
1749 * @buf should be a kernel buffer. Because this function uses KM_USER0,
1750 * the caller should guarantee KM_USER0 is not used.
1751 *
1752 * Note: In usual ops, vread() is never necessary because the caller
1753 * should know the vmalloc() area is valid and can use memcpy().
1754 * This is for routines which have to access a vmalloc area without
1755 * any information, such as /dev/kmem.
1756 *
1757 */
1758
1628long vread(char *buf, char *addr, unsigned long count) 1759long vread(char *buf, char *addr, unsigned long count)
1629{ 1760{
1630 struct vm_struct *tmp; 1761 struct vm_struct *tmp;
1631 char *vaddr, *buf_start = buf; 1762 char *vaddr, *buf_start = buf;
1763 unsigned long buflen = count;
1632 unsigned long n; 1764 unsigned long n;
1633 1765
1634 /* Don't allow overflow */ 1766 /* Don't allow overflow */
@@ -1636,7 +1768,7 @@ long vread(char *buf, char *addr, unsigned long count)
1636 count = -(unsigned long) addr; 1768 count = -(unsigned long) addr;
1637 1769
1638 read_lock(&vmlist_lock); 1770 read_lock(&vmlist_lock);
1639 for (tmp = vmlist; tmp; tmp = tmp->next) { 1771 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1640 vaddr = (char *) tmp->addr; 1772 vaddr = (char *) tmp->addr;
1641 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1773 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1642 continue; 1774 continue;
@@ -1649,32 +1781,72 @@ long vread(char *buf, char *addr, unsigned long count)
1649 count--; 1781 count--;
1650 } 1782 }
1651 n = vaddr + tmp->size - PAGE_SIZE - addr; 1783 n = vaddr + tmp->size - PAGE_SIZE - addr;
1652 do { 1784 if (n > count)
1653 if (count == 0) 1785 n = count;
1654 goto finished; 1786 if (!(tmp->flags & VM_IOREMAP))
1655 *buf = *addr; 1787 aligned_vread(buf, addr, n);
1656 buf++; 1788 else /* IOREMAP area is treated as memory hole */
1657 addr++; 1789 memset(buf, 0, n);
1658 count--; 1790 buf += n;
1659 } while (--n > 0); 1791 addr += n;
1792 count -= n;
1660 } 1793 }
1661finished: 1794finished:
1662 read_unlock(&vmlist_lock); 1795 read_unlock(&vmlist_lock);
1663 return buf - buf_start; 1796
1797 if (buf == buf_start)
1798 return 0;
1799 /* zero-fill memory holes */
1800 if (buf != buf_start + buflen)
1801 memset(buf, 0, buflen - (buf - buf_start));
1802
1803 return buflen;
1664} 1804}
1665 1805
1806/**
1807 * vwrite() - write vmalloc area in a safe way.
1808 * @buf: buffer for source data
1809 * @addr: vm address.
1810 * @count: number of bytes to be read.
1811 *
1812 * Returns # of bytes which addr and buf should be incresed.
1813 * (same number to @count).
1814 * If [addr...addr+count) doesn't includes any intersect with valid
1815 * vmalloc area, returns 0.
1816 *
1817 * This function checks that addr is a valid vmalloc'ed area, and
1818 * copy data from a buffer to the given addr. If specified range of
1819 * [addr...addr+count) includes some valid address, data is copied from
1820 * proper area of @buf. If there are memory holes, no copy to hole.
1821 * IOREMAP area is treated as memory hole and no copy is done.
1822 *
1823 * If [addr...addr+count) doesn't includes any intersects with alive
1824 * vm_struct area, returns 0.
1825 * @buf should be kernel's buffer. Because this function uses KM_USER0,
1826 * the caller should guarantee KM_USER0 is not used.
1827 *
1828 * Note: In usual ops, vwrite() is never necessary because the caller
1829 * should know vmalloc() area is valid and can use memcpy().
1830 * This is for routines which have to access vmalloc area without
1831 * any informaion, as /dev/kmem.
1832 *
1833 * The caller should guarantee KM_USER1 is not used.
1834 */
1835
1666long vwrite(char *buf, char *addr, unsigned long count) 1836long vwrite(char *buf, char *addr, unsigned long count)
1667{ 1837{
1668 struct vm_struct *tmp; 1838 struct vm_struct *tmp;
1669 char *vaddr, *buf_start = buf; 1839 char *vaddr;
1670 unsigned long n; 1840 unsigned long n, buflen;
1841 int copied = 0;
1671 1842
1672 /* Don't allow overflow */ 1843 /* Don't allow overflow */
1673 if ((unsigned long) addr + count < count) 1844 if ((unsigned long) addr + count < count)
1674 count = -(unsigned long) addr; 1845 count = -(unsigned long) addr;
1846 buflen = count;
1675 1847
1676 read_lock(&vmlist_lock); 1848 read_lock(&vmlist_lock);
1677 for (tmp = vmlist; tmp; tmp = tmp->next) { 1849 for (tmp = vmlist; count && tmp; tmp = tmp->next) {
1678 vaddr = (char *) tmp->addr; 1850 vaddr = (char *) tmp->addr;
1679 if (addr >= vaddr + tmp->size - PAGE_SIZE) 1851 if (addr >= vaddr + tmp->size - PAGE_SIZE)
1680 continue; 1852 continue;
@@ -1686,18 +1858,21 @@ long vwrite(char *buf, char *addr, unsigned long count)
1686 count--; 1858 count--;
1687 } 1859 }
1688 n = vaddr + tmp->size - PAGE_SIZE - addr; 1860 n = vaddr + tmp->size - PAGE_SIZE - addr;
1689 do { 1861 if (n > count)
1690 if (count == 0) 1862 n = count;
1691 goto finished; 1863 if (!(tmp->flags & VM_IOREMAP)) {
1692 *addr = *buf; 1864 aligned_vwrite(buf, addr, n);
1693 buf++; 1865 copied++;
1694 addr++; 1866 }
1695 count--; 1867 buf += n;
1696 } while (--n > 0); 1868 addr += n;
1869 count -= n;
1697 } 1870 }
1698finished: 1871finished:
1699 read_unlock(&vmlist_lock); 1872 read_unlock(&vmlist_lock);
1700 return buf - buf_start; 1873 if (!copied)
1874 return 0;
1875 return buflen;
1701} 1876}
1702 1877
1703/** 1878/**
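A short, hypothetical illustration of the reworked vread()/vwrite() contract documented above, not part of this patch: both return @count whenever any byte of [addr, addr + count) intersects a live vm_struct (holes are zero-filled on read and skipped on write), and 0 when nothing intersects. The demo_* helper is illustrative only.

#include <linux/kernel.h>
#include <linux/vmalloc.h>

/* hypothetical helper in the style of the /dev/kmem access path */
static void demo_kmem_access(char *vmalloc_addr, char *kbuf, unsigned long len)
{
	if (vread(kbuf, vmalloc_addr, len) == 0)
		pr_info("no live vmalloc mapping in [%p, %p)\n",
			vmalloc_addr, vmalloc_addr + len);

	if (vwrite(kbuf, vmalloc_addr, len) == 0)
		pr_info("nothing written, range had no live mapping\n");
}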
@@ -1818,6 +1993,286 @@ void free_vm_area(struct vm_struct *area)
1818} 1993}
1819EXPORT_SYMBOL_GPL(free_vm_area); 1994EXPORT_SYMBOL_GPL(free_vm_area);
1820 1995
1996static struct vmap_area *node_to_va(struct rb_node *n)
1997{
1998 return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
1999}
2000
2001/**
2002 * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
2003 * @end: target address
2004 * @pnext: out arg for the next vmap_area
2005 * @pprev: out arg for the previous vmap_area
2006 *
2007 * Returns: %true if either or both of next and prev are found,
2008 * %false if no vmap_area exists
2009 *
2010 * Find the vmap_areas whose end addresses enclose @end, i.e. if not
2011 * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
2012 */
2013static bool pvm_find_next_prev(unsigned long end,
2014 struct vmap_area **pnext,
2015 struct vmap_area **pprev)
2016{
2017 struct rb_node *n = vmap_area_root.rb_node;
2018 struct vmap_area *va = NULL;
2019
2020 while (n) {
2021 va = rb_entry(n, struct vmap_area, rb_node);
2022 if (end < va->va_end)
2023 n = n->rb_left;
2024 else if (end > va->va_end)
2025 n = n->rb_right;
2026 else
2027 break;
2028 }
2029
2030 if (!va)
2031 return false;
2032
2033 if (va->va_end > end) {
2034 *pnext = va;
2035 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2036 } else {
2037 *pprev = va;
2038 *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
2039 }
2040 return true;
2041}
2042
2043/**
2044 * pvm_determine_end - find the highest aligned address between two vmap_areas
2045 * @pnext: in/out arg for the next vmap_area
2046 * @pprev: in/out arg for the previous vmap_area
2047 * @align: alignment
2048 *
2049 * Returns: determined end address
2050 *
2051 * Find the highest aligned address between *@pnext and *@pprev below
2052 * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
2053 * down address is between the end addresses of the two vmap_areas.
2054 *
2055 * Please note that the address returned by this function may fall
2056 * inside *@pnext vmap_area. The caller is responsible for checking
2057 * that.
2058 */
2059static unsigned long pvm_determine_end(struct vmap_area **pnext,
2060 struct vmap_area **pprev,
2061 unsigned long align)
2062{
2063 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2064 unsigned long addr;
2065
2066 if (*pnext)
2067 addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
2068 else
2069 addr = vmalloc_end;
2070
2071 while (*pprev && (*pprev)->va_end > addr) {
2072 *pnext = *pprev;
2073 *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
2074 }
2075
2076 return addr;
2077}
2078
2079/**
2080 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
2081 * @offsets: array containing offset of each area
2082 * @sizes: array containing size of each area
2083 * @nr_vms: the number of areas to allocate
2084 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
2085 * @gfp_mask: allocation mask
2086 *
2087 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
2088 * vm_structs on success, %NULL on failure
2089 *
2090 * The percpu allocator wants to use congruent vm areas so that it can
2091 * maintain the offsets among percpu areas. This function allocates
2092 * congruent vmalloc areas for it. These areas tend to be scattered
2093 * pretty far apart, with the distance between two areas easily going
2094 * up to gigabytes. To avoid interacting with regular vmallocs, these
2095 * areas are allocated from the top.
2096 *
2097 * Despite its complicated look, this allocator is rather simple. It
2098 * does everything top-down and scans areas from the end looking for a
2099 * matching slot. While scanning, if any of the areas overlaps with an
2100 * existing vmap_area, the base address is pulled down to fit the
2101 * area. Scanning is repeated until all the areas fit and then all the
2102 * necessary data structures are inserted and the result is returned.
2103 */
2104struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
2105 const size_t *sizes, int nr_vms,
2106 size_t align, gfp_t gfp_mask)
2107{
2108 const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
2109 const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
2110 struct vmap_area **vas, *prev, *next;
2111 struct vm_struct **vms;
2112 int area, area2, last_area, term_area;
2113 unsigned long base, start, end, last_end;
2114 bool purged = false;
2115
2116 gfp_mask &= GFP_RECLAIM_MASK;
2117
2118 /* verify parameters and allocate data structures */
2119 BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align));
2120 for (last_area = 0, area = 0; area < nr_vms; area++) {
2121 start = offsets[area];
2122 end = start + sizes[area];
2123
2124 /* is everything aligned properly? */
2125 BUG_ON(!IS_ALIGNED(offsets[area], align));
2126 BUG_ON(!IS_ALIGNED(sizes[area], align));
2127
2128 /* detect the area with the highest address */
2129 if (start > offsets[last_area])
2130 last_area = area;
2131
2132 for (area2 = 0; area2 < nr_vms; area2++) {
2133 unsigned long start2 = offsets[area2];
2134 unsigned long end2 = start2 + sizes[area2];
2135
2136 if (area2 == area)
2137 continue;
2138
2139 BUG_ON(start2 >= start && start2 < end);
2140 BUG_ON(end2 <= end && end2 > start);
2141 }
2142 }
2143 last_end = offsets[last_area] + sizes[last_area];
2144
2145 if (vmalloc_end - vmalloc_start < last_end) {
2146 WARN_ON(true);
2147 return NULL;
2148 }
2149
2150 vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask);
2151 vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask);
2152 if (!vas || !vms)
2153 goto err_free;
2154
2155 for (area = 0; area < nr_vms; area++) {
2156 vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask);
2157 vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask);
2158 if (!vas[area] || !vms[area])
2159 goto err_free;
2160 }
2161retry:
2162 spin_lock(&vmap_area_lock);
2163
2164 /* start scanning - we scan from the top, begin with the last area */
2165 area = term_area = last_area;
2166 start = offsets[area];
2167 end = start + sizes[area];
2168
2169 if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
2170 base = vmalloc_end - last_end;
2171 goto found;
2172 }
2173 base = pvm_determine_end(&next, &prev, align) - end;
2174
2175 while (true) {
2176 BUG_ON(next && next->va_end <= base + end);
2177 BUG_ON(prev && prev->va_end > base + end);
2178
2179 /*
2180 * base might have underflowed, add last_end before
2181 * comparing.
2182 */
2183 if (base + last_end < vmalloc_start + last_end) {
2184 spin_unlock(&vmap_area_lock);
2185 if (!purged) {
2186 purge_vmap_area_lazy();
2187 purged = true;
2188 goto retry;
2189 }
2190 goto err_free;
2191 }
2192
2193 /*
2194 * If next overlaps, move base downwards so that it's
2195 * right below next and then recheck.
2196 */
2197 if (next && next->va_start < base + end) {
2198 base = pvm_determine_end(&next, &prev, align) - end;
2199 term_area = area;
2200 continue;
2201 }
2202
2203 /*
2204 * If prev overlaps, shift down next and prev and move
2205 * base so that it's right below new next and then
2206 * recheck.
2207 */
2208 if (prev && prev->va_end > base + start) {
2209 next = prev;
2210 prev = node_to_va(rb_prev(&next->rb_node));
2211 base = pvm_determine_end(&next, &prev, align) - end;
2212 term_area = area;
2213 continue;
2214 }
2215
2216 /*
2217 * This area fits, move on to the previous one. If
2218 * the previous one is the terminal one, we're done.
2219 */
2220 area = (area + nr_vms - 1) % nr_vms;
2221 if (area == term_area)
2222 break;
2223 start = offsets[area];
2224 end = start + sizes[area];
2225 pvm_find_next_prev(base + end, &next, &prev);
2226 }
2227found:
2228 /* we've found a fitting base, insert all va's */
2229 for (area = 0; area < nr_vms; area++) {
2230 struct vmap_area *va = vas[area];
2231
2232 va->va_start = base + offsets[area];
2233 va->va_end = va->va_start + sizes[area];
2234 __insert_vmap_area(va);
2235 }
2236
2237 vmap_area_pcpu_hole = base + offsets[last_area];
2238
2239 spin_unlock(&vmap_area_lock);
2240
2241 /* insert all vm's */
2242 for (area = 0; area < nr_vms; area++)
2243 insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC,
2244 pcpu_get_vm_areas);
2245
2246 kfree(vas);
2247 return vms;
2248
2249err_free:
2250 for (area = 0; area < nr_vms; area++) {
2251 if (vas)
2252 kfree(vas[area]);
2253 if (vms)
2254 kfree(vms[area]);
2255 }
2256 kfree(vas);
2257 kfree(vms);
2258 return NULL;
2259}
2260
2261/**
2262 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
2263 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
2264 * @nr_vms: the number of allocated areas
2265 *
2266 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
2267 */
2268void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
2269{
2270 int i;
2271
2272 for (i = 0; i < nr_vms; i++)
2273 free_vm_area(vms[i]);
2274 kfree(vms);
2275}
1821 2276
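A simplified, hypothetical sketch of the new interface, not part of this patch (the in-tree caller is the percpu allocator's chunk setup, and the function is not exported to modules): reserve two congruent areas whose mutual offset is preserved, then release them.

#include <linux/gfp.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/vmalloc.h>

static int demo_reserve_congruent_areas(void)
{
	/* offsets and sizes must be aligned to the requested alignment */
	const unsigned long offsets[] = { 0, 4 * PAGE_SIZE };
	const size_t sizes[]          = { 2 * PAGE_SIZE, 2 * PAGE_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, ARRAY_SIZE(sizes),
				PAGE_SIZE, GFP_KERNEL);
	if (!vms)
		return -ENOMEM;

	/* vms[1]->addr - vms[0]->addr == 4 * PAGE_SIZE, as requested */

	pcpu_free_vm_areas(vms, ARRAY_SIZE(sizes));
	return 0;
}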
1822#ifdef CONFIG_PROC_FS 2277#ifdef CONFIG_PROC_FS
1823static void *s_start(struct seq_file *m, loff_t *pos) 2278static void *s_start(struct seq_file *m, loff_t *pos)