 include/linux/slub_def.h |   9
 mm/slub.c                | 190
 2 files changed, 124 insertions(+), 75 deletions(-)
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index 3b361b2906bd..0a7ae25f7e8d 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -11,6 +11,13 @@
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
 
+struct kmem_cache_cpu {
+	void **freelist;
+	struct page *page;
+	int node;
+	/* Lots of wasted space */
+} ____cacheline_aligned_in_smp;
+
 struct kmem_cache_node {
 	spinlock_t list_lock;	/* Protect partial list and nr_partial */
 	unsigned long nr_partial;
@@ -54,7 +61,7 @@ struct kmem_cache {
 	int defrag_ratio;
 	struct kmem_cache_node *node[MAX_NUMNODES];
 #endif
-	struct page *cpu_slab[NR_CPUS];
+	struct kmem_cache_cpu cpu_slab[NR_CPUS];
 };
 
 /*
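
Annotation (not part of the patch): the slub_def.h hunks above replace the per cpu array of bare struct page pointers with an array of cache-line aligned kmem_cache_cpu records, so the lockless freelist and the node of the cpu slab no longer have to be carried in struct page. A minimal user-space sketch of that layout, with made-up demo_* names standing in for the kernel types:

/*
 * User-space model only; demo_cache, demo_page and DEMO_NR_CPUS are
 * illustrative stand-ins, not kernel identifiers.
 */
#include <stdio.h>

#define DEMO_NR_CPUS 4

struct demo_page { int nid; };		/* stand-in for struct page */

struct demo_cache_cpu {
	void **freelist;		/* per cpu lockless freelist */
	struct demo_page *page;		/* current cpu slab */
	int node;			/* node the cpu slab came from */
};

struct demo_cache {
	/* Before the patch this was: struct demo_page *cpu_slab[DEMO_NR_CPUS]; */
	struct demo_cache_cpu cpu_slab[DEMO_NR_CPUS];
};

/* Mirrors the shape of get_cpu_slab() added in mm/slub.c. */
static struct demo_cache_cpu *demo_cpu_slab(struct demo_cache *s, int cpu)
{
	return &s->cpu_slab[cpu];
}

int main(void)
{
	struct demo_cache s = { 0 };
	struct demo_cache_cpu *c = demo_cpu_slab(&s, 0);

	printf("cpu 0 state: freelist=%p page=%p node=%d\n",
	       (void *)c->freelist, (void *)c->page, c->node);
	return 0;
}
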
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -90,7 +90,7 @@
  * One use of this flag is to mark slabs that are
  * used for allocations. Then such a slab becomes a cpu
  * slab. The cpu slab may be equipped with an additional
- * lockless_freelist that allows lockless access to
+ * freelist that allows lockless access to
  * free objects in addition to the regular freelist
  * that requires the slab lock.
  *
@@ -140,11 +140,6 @@ static inline void ClearSlabDebug(struct page *page)
 /*
  * Issues still to be resolved:
  *
- * - The per cpu array is updated for each new slab and and is a remote
- *   cacheline for most nodes. This could become a bouncing cacheline given
- *   enough frequent updates. There are 16 pointers in a cacheline, so at
- *   max 16 cpus could compete for the cacheline which may be okay.
- *
  * - Support PAGE_ALLOC_DEBUG. Should be easy to do.
  *
  * - Variable sizing of the per node arrays
@@ -277,6 +272,11 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
 #endif
 }
 
+static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu)
+{
+	return &s->cpu_slab[cpu];
+}
+
 static inline int check_valid_pointer(struct kmem_cache *s,
 				struct page *page, const void *object)
 {
@@ -1387,33 +1387,34 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page)
 /*
  * Remove the cpu slab
  */
-static void deactivate_slab(struct kmem_cache *s, struct page *page, int cpu)
+static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
+	struct page *page = c->page;
 	/*
 	 * Merge cpu freelist into freelist. Typically we get here
 	 * because both freelists are empty. So this is unlikely
 	 * to occur.
 	 */
-	while (unlikely(page->lockless_freelist)) {
+	while (unlikely(c->freelist)) {
 		void **object;
 
 		/* Retrieve object from cpu_freelist */
-		object = page->lockless_freelist;
-		page->lockless_freelist = page->lockless_freelist[page->offset];
+		object = c->freelist;
+		c->freelist = c->freelist[page->offset];
 
 		/* And put onto the regular freelist */
 		object[page->offset] = page->freelist;
 		page->freelist = object;
 		page->inuse--;
 	}
-	s->cpu_slab[cpu] = NULL;
+	c->page = NULL;
 	unfreeze_slab(s, page);
 }
 
-static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
+static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
 {
-	slab_lock(page);
-	deactivate_slab(s, page, cpu);
+	slab_lock(c->page);
+	deactivate_slab(s, c);
 }
 
 /*
@@ -1422,18 +1423,17 @@ static inline void flush_slab(struct kmem_cache *s, struct page *page, int cpu)
  */
 static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
 {
-	struct page *page = s->cpu_slab[cpu];
+	struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 
-	if (likely(page))
-		flush_slab(s, page, cpu);
+	if (likely(c && c->page))
+		flush_slab(s, c);
 }
 
 static void flush_cpu_slab(void *d)
 {
 	struct kmem_cache *s = d;
-	int cpu = smp_processor_id();
 
-	__flush_cpu_slab(s, cpu);
+	__flush_cpu_slab(s, smp_processor_id());
 }
 
 static void flush_all(struct kmem_cache *s)
@@ -1450,6 +1450,19 @@ static void flush_all(struct kmem_cache *s)
 }
 
 /*
+ * Check if the objects in a per cpu structure fit numa
+ * locality expectations.
+ */
+static inline int node_match(struct kmem_cache_cpu *c, int node)
+{
+#ifdef CONFIG_NUMA
+	if (node != -1 && c->node != node)
+		return 0;
+#endif
+	return 1;
+}
+
+/*
  * Slow path. The lockless freelist is empty or we need to perform
  * debugging duties.
  *
@@ -1467,45 +1480,46 @@ static void flush_all(struct kmem_cache *s)
 * we need to allocate a new slab. This is slowest path since we may sleep.
 */
 static void *__slab_alloc(struct kmem_cache *s,
-		gfp_t gfpflags, int node, void *addr, struct page *page)
+		gfp_t gfpflags, int node, void *addr, struct kmem_cache_cpu *c)
 {
 	void **object;
-	int cpu = smp_processor_id();
+	struct page *new;
 
-	if (!page)
+	if (!c->page)
 		goto new_slab;
 
-	slab_lock(page);
-	if (unlikely(node != -1 && page_to_nid(page) != node))
+	slab_lock(c->page);
+	if (unlikely(!node_match(c, node)))
 		goto another_slab;
 load_freelist:
-	object = page->freelist;
+	object = c->page->freelist;
 	if (unlikely(!object))
 		goto another_slab;
-	if (unlikely(SlabDebug(page)))
+	if (unlikely(SlabDebug(c->page)))
 		goto debug;
 
-	object = page->freelist;
-	page->lockless_freelist = object[page->offset];
-	page->inuse = s->objects;
-	page->freelist = NULL;
-	slab_unlock(page);
+	object = c->page->freelist;
+	c->freelist = object[c->page->offset];
+	c->page->inuse = s->objects;
+	c->page->freelist = NULL;
+	c->node = page_to_nid(c->page);
+	slab_unlock(c->page);
 	return object;
 
 another_slab:
-	deactivate_slab(s, page, cpu);
+	deactivate_slab(s, c);
 
 new_slab:
-	page = get_partial(s, gfpflags, node);
-	if (page) {
-		s->cpu_slab[cpu] = page;
+	new = get_partial(s, gfpflags, node);
+	if (new) {
+		c->page = new;
 		goto load_freelist;
 	}
 
-	page = new_slab(s, gfpflags, node);
-	if (page) {
-		cpu = smp_processor_id();
-		if (s->cpu_slab[cpu]) {
+	new = new_slab(s, gfpflags, node);
+	if (new) {
+		c = get_cpu_slab(s, smp_processor_id());
+		if (c->page) {
 			/*
 			 * Someone else populated the cpu_slab while we
 			 * enabled interrupts, or we have gotten scheduled
@@ -1513,34 +1527,32 @@ new_slab:
 			 * requested node even if __GFP_THISNODE was
 			 * specified. So we need to recheck.
 			 */
-			if (node == -1 ||
-				page_to_nid(s->cpu_slab[cpu]) == node) {
+			if (node_match(c, node)) {
 				/*
 				 * Current cpuslab is acceptable and we
 				 * want the current one since its cache hot
 				 */
-				discard_slab(s, page);
-				page = s->cpu_slab[cpu];
-				slab_lock(page);
+				discard_slab(s, new);
+				slab_lock(c->page);
 				goto load_freelist;
 			}
 			/* New slab does not fit our expectations */
-			flush_slab(s, s->cpu_slab[cpu], cpu);
+			flush_slab(s, c);
 		}
-		slab_lock(page);
-		SetSlabFrozen(page);
-		s->cpu_slab[cpu] = page;
+		slab_lock(new);
+		SetSlabFrozen(new);
+		c->page = new;
 		goto load_freelist;
 	}
 	return NULL;
 debug:
-	object = page->freelist;
-	if (!alloc_debug_processing(s, page, object, addr))
+	object = c->page->freelist;
+	if (!alloc_debug_processing(s, c->page, object, addr))
 		goto another_slab;
 
-	page->inuse++;
-	page->freelist = object[page->offset];
-	slab_unlock(page);
+	c->page->inuse++;
+	c->page->freelist = object[c->page->offset];
+	slab_unlock(c->page);
 	return object;
 }
 
@@ -1557,20 +1569,20 @@ debug:
 static void __always_inline *slab_alloc(struct kmem_cache *s,
 		gfp_t gfpflags, int node, void *addr)
 {
-	struct page *page;
 	void **object;
 	unsigned long flags;
+	struct kmem_cache_cpu *c;
 
 	local_irq_save(flags);
-	page = s->cpu_slab[smp_processor_id()];
-	if (unlikely(!page || !page->lockless_freelist ||
-			(node != -1 && page_to_nid(page) != node)))
+	c = get_cpu_slab(s, smp_processor_id());
+	if (unlikely(!c->page || !c->freelist ||
+			!node_match(c, node)))
 
-		object = __slab_alloc(s, gfpflags, node, addr, page);
+		object = __slab_alloc(s, gfpflags, node, addr, c);
 
 	else {
-		object = page->lockless_freelist;
-		page->lockless_freelist = object[page->offset];
+		object = c->freelist;
+		c->freelist = object[c->page->offset];
 	}
 	local_irq_restore(flags);
 
@@ -1668,13 +1680,14 @@ static void __always_inline slab_free(struct kmem_cache *s,
 {
 	void **object = (void *)x;
 	unsigned long flags;
+	struct kmem_cache_cpu *c;
 
 	local_irq_save(flags);
 	debug_check_no_locks_freed(object, s->objsize);
-	if (likely(page == s->cpu_slab[smp_processor_id()] &&
-						!SlabDebug(page))) {
-		object[page->offset] = page->lockless_freelist;
-		page->lockless_freelist = object;
+	c = get_cpu_slab(s, smp_processor_id());
+	if (likely(page == c->page && !SlabDebug(page))) {
+		object[page->offset] = c->freelist;
+		c->freelist = object;
 	} else
 		__slab_free(s, page, x, addr);
 
@@ -1862,6 +1875,24 @@ static unsigned long calculate_alignment(unsigned long flags,
 	return ALIGN(align, sizeof(void *));
 }
 
+static void init_kmem_cache_cpu(struct kmem_cache *s,
+			struct kmem_cache_cpu *c)
+{
+	c->page = NULL;
+	c->freelist = NULL;
+	c->node = 0;
+}
+
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		init_kmem_cache_cpu(s, get_cpu_slab(s, cpu));
+
+	return 1;
+}
+
 static void init_kmem_cache_node(struct kmem_cache_node *n)
 {
 	n->nr_partial = 0;
@@ -2111,8 +2142,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
 #ifdef CONFIG_NUMA
 	s->defrag_ratio = 100;
 #endif
+	if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+		goto error;
 
-	if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+	if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
 		return 1;
 error:
 	if (flags & SLAB_PANIC)
@@ -2646,7 +2679,7 @@ void __init kmem_cache_init(void)
 #endif
 
 	kmem_size = offsetof(struct kmem_cache, cpu_slab) +
-				nr_cpu_ids * sizeof(struct page *);
+				nr_cpu_ids * sizeof(struct kmem_cache_cpu);
 
 	printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
 		" CPUs=%d, Nodes=%d\n",
@@ -3248,11 +3281,14 @@ static unsigned long slab_objects(struct kmem_cache *s,
 	per_cpu = nodes + nr_node_ids;
 
 	for_each_possible_cpu(cpu) {
-		struct page *page = s->cpu_slab[cpu];
-		int node;
+		struct page *page;
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
 
+		if (!c)
+			continue;
+
+		page = c->page;
 		if (page) {
-			node = page_to_nid(page);
 			if (flags & SO_CPU) {
 				int x = 0;
 
@@ -3261,9 +3297,9 @@ static unsigned long slab_objects(struct kmem_cache *s,
 				else
 					x = 1;
 				total += x;
-				nodes[node] += x;
+				nodes[c->node] += x;
 			}
-			per_cpu[node]++;
+			per_cpu[c->node]++;
 		}
 	}
 
@@ -3309,13 +3345,19 @@ static int any_slab_objects(struct kmem_cache *s)
 	int node;
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		if (s->cpu_slab[cpu])
+	for_each_possible_cpu(cpu) {
+		struct kmem_cache_cpu *c = get_cpu_slab(s, cpu);
+
+		if (c && c->page)
 			return 1;
+	}
 
-	for_each_node(node) {
+	for_each_online_node(node) {
 		struct kmem_cache_node *n = get_node(s, node);
 
+		if (!n)
+			continue;
+
 		if (n->nr_partial || atomic_long_read(&n->nr_slabs))
 			return 1;
 	}
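
Annotation (not part of the patch): the fast path that the per cpu freelist enables hands out objects by following a link stored inside each free object at the slab's offset, which is what c->freelist = object[c->page->offset] does in the hunks above. A small self-contained user-space demo of that linking scheme, using made-up demo_* names and a fixed link offset of zero:

/*
 * User-space demo only; demo_cpu, demo_pop, NOBJ and OFFSET are
 * illustrative stand-ins, not kernel identifiers.
 */
#include <stdio.h>

#define NOBJ	8
#define OFFSET	0	/* word index inside a free object that stores the link */

struct demo_cpu {
	void **freelist;	/* first free object, linked through OFFSET */
};

/* Pop one object the way the patched lockless fast path does. */
static void *demo_pop(struct demo_cpu *c)
{
	void **object = c->freelist;

	if (!object)
		return NULL;
	c->freelist = object[OFFSET];	/* follow the link stored in the object */
	return object;
}

int main(void)
{
	void *slab[NOBJ][4];			/* eight fake objects, four words each */
	struct demo_cpu c = { .freelist = NULL };
	int i;

	/* Thread the freelist through the free objects themselves. */
	for (i = NOBJ - 1; i >= 0; i--) {
		slab[i][OFFSET] = c.freelist;
		c.freelist = (void **)slab[i];
	}

	while (c.freelist)
		printf("allocated %p\n", demo_pop(&c));
	return 0;
}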