/*
* SLUB: A slab allocator that limits cache line use instead of queuing
* objects in per cpu and per node lists.
*
* The allocator synchronizes using per slab locks or atomic operatios
* and only uses a centralized lock to manage a pool of partial slabs.
*
* (C) 2007 SGI, Christoph Lameter
* (C) 2011 Linux Foundation, Christoph Lameter
*/
#include <linux/mm.h>
#include <linux/swap.h> /* struct reclaim_state */
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/memory.h>
#include <linux/math64.h>
#include <linux/fault-inject.h>
#include <linux/stacktrace.h>
#include <linux/prefetch.h>
#include <trace/events/kmem.h>
/*
* Lock order:
* 1. slub_lock (Global Semaphore)
* 2. node->list_lock
* 3. slab_lock(page) (Only on some arches and for debugging)
*
* slub_lock
*
* The role of the slub_lock is to protect the list of all the slabs
* and to synchronize major metadata changes to slab cache structures.
*
* The slab_lock is only used for debugging and on arches that do not
* have the ability to do a cmpxchg_double. It only protects the second
* double word in the page struct. Meaning
* A. page->freelist -> List of object free in a page
* B. page->counters -> Counters of objects
* C. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
* on any list. The processor that froze the slab is the one who can
* perform list operations on the page. Other processors may put objects
* onto the freelist but the processor that froze the slab is the only
* one that can retrieve the objects from the page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
* removed from the lists nor make the number of partial slabs be modified.
* (Note that the total number of slabs is an atomic value that may be
* modified without taking the list lock).
*
* The list_lock is a centralized lock and thus we avoid taking it as
* much as possible. As long as SLUB does not have to handle partial
* slabs, operations can continue without any centralized lock. F.e.
* allocating a long series of objects that fill up slabs does not require
* the list lock.
* Interrupts are disabled during allocation and deallocation in order to
* make the slab allocator safe to use in the context of an irq. In addition
* interrupts are disabled to ensure that the processor does not change
* while handling per_cpu slabs, due to kernel preemption.
*
* SLUB assigns one slab for allocation to each processor.
* Allocations only occur from these slabs called cpu slabs.
*
* Slabs with free elements are kept on a partial list and during regular
* operations no list for full slabs is used. If an object in a full slab is
* freed then the slab will show up again on the partial lists.
* We track full slabs for debugging purposes though because otherwise we
* cannot scan all objects.
*
* Slabs are freed when they become empty. Teardown and setup is
* minimal so we rely on the page allocators per cpu caches for
* fast frees and allocs.
*
* Overloading of page flags that are otherwise used for LRU management.
*
* PageActive The slab is frozen and exempt from list processing.
* This means that the slab is dedicated to a purpose
* such as satisfying allocations for a specific
* processor. Objects may be freed in the slab while
* it is frozen but slab_free will then skip the usual
* list operations. It is up to the processor holding
* the slab to integrate the slab into the slab lists
* when the slab is no longer needed.
*
* One use of this flag is to mark slabs that are
* used for allocations. Then such a slab becomes a cpu
* slab. The cpu slab may be equipped with an additional
* freelist that allows lockless access to
* free objects in addition to the regular freelist
* that requires the slab lock.
*
* PageError Slab requires special handling due to debug
* options set. This moves slab handling out of
* the fast path and disables lockless freelists.
*/
#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DEBUG_FREE)
static inline int kmem_cache_debug(struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
return unlikely(s->flags & SLAB_DEBUG_FLAGS);
#else
return 0;
#endif
}
/*
* Issues still to be resolved:
*
* - Support PAGE_ALLOC_DEBUG. Should be easy to do.
*
* - Variable sizing of the per node arrays
*/
/* Enable to test recovery from slab corruption on boot */
#undef SLUB_RESILIENCY_TEST
/* Enable to log cmpxchg failures */
#undef SLUB_DEBUG_CMPXCHG
/*
* Mininum number of partial slabs. These will be left on the partial
* lists even if they are empty. kmem_cache_shrink may reclaim them.
*/
#define MIN_PARTIAL 5
/*
* Maximum number of desirable partial slabs.
* The existence of more partial slabs makes kmem_cache_shrink
* sort the partial list by the number of objects in the.
*/
#define MAX_PARTIAL 10
#define DEBUG_DEFAULT_FLAGS (SLAB_DEBUG_FREE | SLAB_RED_ZONE | \
SLAB_POISON | SLAB_STORE_USER)
/*
* Debugging flags that require metadata to be stored in the slab. These get
* disabled when slub_debug=O is used and a cache's min order increases with
* metadata.
*/
#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER)
/*
* Set of flags that will prevent slab merging
*/
#define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
#define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
SLAB_CACHE_DMA | SLAB_NOTRACK)
#define OO_SHIFT 16
#define OO_MASK ((1 << OO_SHIFT) - 1)
#define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000UL /* Poison object */
#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */
static int kmem_size = sizeof(struct kmem_cache);
#ifdef CONFIG_SMP
static struct notifier_block slab_notifier;
#endif
static enum {
DOWN, /* No slab functionality available */
PARTIAL, /* Kmem_cache_node works */
UP, /* Everything works but does not show up in sysfs */
SYSFS /* Sysfs up */
} slab_state = DOWN;
/* A list of all slab caches on the system */
static DECLARE_RWSEM(slub_lock);
static LIST_HEAD(slab_caches);
/*
* Tracking user of a slab.
*/
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
#ifdef CONFIG_STACKTRACE
unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
unsigned long when; /* When did the operation occur */
};
enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
kfree(s->name);
kfree(s);
}
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
{
#ifdef CONFIG_SLUB_STATS
__this_cpu_inc(s->cpu_slab->stat[si]);
#endif
}
/********************************************************************
* Core slab cache functions
*******************************************************************/
int slab_is_available(void)
{
return slab_state >= UP;
}
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
return s->node[node];
}
/* Verify that a pointer has an address that is valid within a slab page */
static inline int check_valid_pointer(struct kmem_cache *s,
struct page *page, const void *object)
{
void *base;
if (!object)
return 1;
base = page_address(page);
if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
return 0;
}
return 1;
}
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
return *(void **)(object + s->offset);
}
static void prefetch_freepointer(const struct kmem_cache *s, void *object)
{
prefetch(object + s->offset);
}
static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
{
void *p;
#ifdef CONFIG_DEBUG_PAGEALLOC
probe_kernel_read(&p, (void **)(object + s->offset), sizeof(p));
#else
p = get_freepointer(s, object);
#endif
return p;
}
static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
{
*(void **)(object + s->offset) = fp;
}
/* Loop over all objects in a slab */
#define for_each_object(__p, __s, __addr, __objects) \
for (__p = (__addr); __p < (__addr) + (__objects) * (__s)->size;\
__p += (__s)->size)
/* Determine object index from a given position */
static inline int slab_index(void *p, struct kmem_cache *s, void *addr)
{
return (p - addr) / s->size;
}
static inline size_t slab_ksize(const struct kmem_cache *s)
{
#ifdef CONFIG_SLUB_DEBUG
/*
* Debugging requires use of the padding between object
* and whatever may come after it.
*/
if (s->flags & (SLAB_RED_ZONE | SLAB_POISON))
return s->objsize;
#endif
/*
* If we have the need to store the freelist pointer
* back there or track user information then we can
* only use the space before that information.
*/
if (s->flags & (SLAB_DESTROY_BY_RCU | SLAB_STORE_USER))
return s->inuse;
/*
* Else we can use all the padding etc for the allocation
*/
return s->size;
}
static inline int order_objects(int order, unsigned long size, int reserved)
{
return ((PAGE_SIZE << order) - reserved) / size;
}
static inline struct kmem_cache_order_objects oo_make(int order,
unsigned long size, int reserved)
{
struct kmem_cache_order_objects x = {
(order << OO_SHIFT) + order_objects(order, size, reserved)
};
return x;
}
static inline int oo_order(struct kmem_cache_order_objects x)
{
return x.x >> OO_SHIFT;
}
static inline int oo_objects(struct kmem_cache_order_objects x)
{
return x.x & OO_MASK;
}
/*
* Per slab locking using the pagelock
*/
static __always_inline void slab_lock(struct page *page)
{
bit_spin_lock(PG_locked, &page->flags);
}
static __always_inline void slab_unlock(struct page *page)
{
__bit_spin_unlock(PG_locked, &page->flags);
}
/* Interrupts must be disabled (for the fallback code to work right) */
static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
VM_BUG_ON(!irqs_disabled());
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return 1;
} else
#endif
{
slab_lock(page);
if (page->freelist == freelist_old && page->counters == counters_old) {
page->freelist = freelist_new;
page->counters = counters_new;
slab_unlock(page);
return 1;
}
slab_unlock(page);
}
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
}
static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page,
void *freelist_old, unsigned long counters_old,
void *freelist_new, unsigned long counters_new,
const char *n)
{
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
if (s->flags & __CMPXCHG_DOUBLE) {
if (cmpxchg_double(&page->freelist, &page->counters,
freelist_old, counters_old,
freelist_new, counters_new))
return 1;
} else
#endif
{
unsigned long flags;
local_irq_save(flags);
slab_lock(page);
if (page->freelist == freelist_old && page->counters == counters_old) {
page->freelist = freelist_new;
page->counters = counters_new;
slab_unlock(page);
local_irq_restore(flags);
return 1;
}
slab_unlock(page);
local_irq_restore(flags);
}
cpu_relax();
stat(s, CMPXCHG_DOUBLE_FAIL);
#ifdef SLUB_DEBUG_CMPXCHG
printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name);
#endif
return 0;
}
#ifdef CONFIG_SLUB_DEBUG
/*
* Determine a map of object in use on a page.
*
* Node listlock must be held to guarantee that the page does
* not vanish from under us.
*/
static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
{
void *p;
void *addr = page_address(page);
for (p = page->freelist; p; p = get_freepointer(s, p))
set_bit(slab_index(p, s, addr), map);
}
/*
* Debug settings:
*/
#ifdef CONFIG_SLUB_DEBUG_ON
static int slub_debug = DEBUG_DEFAULT_FLAGS;
#else
static int slub_debug;
#endif
static char *slub_debug_slabs;
static int disable_higher_order_debug;
/*
* Object debugging
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
}
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
struct track *p;
if (s->offset)
p = object + s->offset + sizeof(void *);
else
p = object + s->inuse;
return p + alloc;
}
static void set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
{
struct track *p = get_track(s, object, alloc);
if (addr) {
#ifdef CONFIG_STACKTRACE
struct stack_trace trace;
int i;
trace.nr_entries = 0;
trace.max_entries = TRACK_ADDRS_COUNT;
trace.entries = p->addrs;
trace.skip = 3;
save_stack_trace(&trace);
/* See rant in lockdep.c */
if (trace.nr_entries != 0 &&
trace.entries[trace.nr_entries - 1] == ULONG_MAX)
trace.nr_entries--;
for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
p->addrs[i] = 0;
#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
} else
memset(p, 0, sizeof(struct track));
}
static void init_tracking(struct kmem_cache *s, void *object)
{
if (!(s->flags & SLAB_STORE_USER))
return;
set_track(s, object, TRACK_FREE, 0UL);
set_track(s, object, TRACK_ALLOC, 0UL);
}
static void print_track(const char *s, struct track *t)
{
if (!t->addr)
return;
printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid);
#ifdef CONFIG_STACKTRACE
{
int i;
for (i = 0; i < TRACK_ADDRS_COUNT; i++)
if (t->addrs[i])
printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]);
else
break;
}
#endif
}
static void print_tracking(struct kmem_cache *s, void *object)
{
if (!(s->flags & SLAB_STORE_USER))
return;
print_track("Allocated", get_track(s, object, TRACK_ALLOC));
print_track("Freed", get_track(s, object, TRACK_FREE));
}