3 files changed, 538 insertions, 0 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 7e28ecfa8aa4..45503ed5f3aa 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -478,6 +478,16 @@ config FRONTSWAP
          If unsure, say Y to enable frontswap.
+# No prompt string is given, so this symbol is not offered directly in the
+# configuration menus; presumably it is meant to be selected by the options
+# that use the allocator (TODO confirm against its users).
+config ZBUD
+        tristate
+        default n
+        help
+          A special purpose allocator for storing compressed pages.
+          It is designed to store up to two compressed pages per physical
+          page.  While this design limits storage density, it has simple and
+          deterministic reclaim properties that make it preferable to a higher
+          density approach when reclaim will be used.
 config MEM_SOFT_DIRTY
        bool "Track memory changes"
        depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY
diff --git a/mm/Makefile b/mm/Makefile
index 72c5acb9345f..95f0197ce3d3 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
+obj-$(CONFIG_ZBUD)      += zbud.o
diff --git a/mm/zbud.c b/mm/zbud.c
new file mode 100644
index 000000000000..9bb4710e3589
--- /dev/null
+++ b/mm/zbud.c
@@ -0,0 +1,527 @@
+/*
+ * zbud.c
+ *
+ * Copyright (C) 2013, Seth Jennings, IBM
+ *
+ * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
+ *
+ * zbud is a special purpose allocator for storing compressed pages.  Contrary
+ * to what its name may suggest, zbud is not a buddy allocator, but rather an
+ * allocator that "buddies" two compressed pages together in a single memory
+ * page.
+ *
+ * While this design limits storage density, it has simple and deterministic
+ * reclaim properties that make it preferable to a higher density approach when
+ * reclaim will be used.
+ *
+ * zbud works by storing compressed pages, or "zpages", together in pairs in a
+ * single memory page called a "zbud page".  The first buddy is "left
+ * justified" at the beginning of the zbud page, and the last buddy is "right
+ * justified" at the end of the zbud page.  The benefit is that if either
+ * buddy is freed, the freed buddy space, coalesced with whatever slack space
+ * that existed between the buddies, results in the largest possible free region
+ * within the zbud page.
+ *
+ * zbud also provides an attractive lower bound on density. The ratio of zpages
+ * to zbud pages can not be less than 1.  This ensures that zbud can never "do
+ * harm" by using more pages to store zpages than the uncompressed zpages would
+ * have used on their own.
+ *
+ * zbud pages are divided into "chunks".  The size of the chunks is fixed at
+ * compile time and determined by NCHUNKS_ORDER below.  Dividing zbud pages
+ * into chunks allows organizing unbuddied zbud pages into a manageable number
+ * of unbuddied lists according to the number of free chunks available in the
+ * zbud page.
+ *
+ * The zbud API differs from that of conventional allocators in that the
+ * allocation function, zbud_alloc(), returns an opaque handle to the user,
+ * not a dereferenceable pointer.  The user must map the handle using
+ * zbud_map() in order to get a usable pointer by which to access the
+ * allocation data and unmap the handle with zbud_unmap() when operations
+ * on the allocation data are complete.
+ */
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/atomic.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/preempt.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/zbud.h>
+/*****************
+ * Structures
+*****************/
+/*
+ * NCHUNKS_ORDER determines the internal allocation granularity, effectively
+ * adjusting internal fragmentation.  It also determines the number of
+ * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
+ * allocation granularity will be in chunks of size PAGE_SIZE/64, and there
+ * will be 64 freelists per pool.
+ *
+ * Values derived from NCHUNKS_ORDER:
+ *   CHUNK_SHIFT        log2 of the chunk size in bytes
+ *   CHUNK_SIZE         size of one chunk in bytes
+ *   NCHUNKS            number of chunks in a page, header chunk included
+ *   ZHDR_SIZE_ALIGNED  space set aside for struct zbud_header: one chunk
+ */
+#define NCHUNKS_ORDER   6
+#define CHUNK_SHIFT     (PAGE_SHIFT - NCHUNKS_ORDER)
+#define CHUNK_SIZE      (1 << CHUNK_SHIFT)
+#define NCHUNKS         (PAGE_SIZE >> CHUNK_SHIFT)
+#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
+/**
+ * struct zbud_pool - stores metadata for each zbud pool
+ * @lock:       protects all pool fields and first|last_chunks fields of any
+ *              zbud page in the pool
+ * @unbuddied:  array of lists tracking zbud pages that only contain one buddy;
+ *              the list a zbud page is added to is selected by the number of
+ *              free chunks in the page.
+ * @buddied:    list tracking the zbud pages that contain two buddies;
+ *              these zbud pages are full
+ * @lru:        list tracking the zbud pages in LRU order by most recently
+ *              added buddy.
+ * @pages_nr:   number of zbud pages in the pool.
+ * @ops:        pointer to a structure of user defined operations specified at
+ *              pool creation time.
+ *
+ * This structure is allocated at pool creation time and maintains metadata
+ * pertaining to a particular zbud pool.
+ */
+struct zbud_pool {
+        spinlock_t lock;
+        struct list_head unbuddied[NCHUNKS];
+        struct list_head buddied;
+        struct list_head lru;
+        u64 pages_nr;
+        struct zbud_ops *ops;
+};
+/*
+ * struct zbud_header - zbud page metadata occupying the first chunk of each
+ *                      zbud page.
+ * @buddy:      links the zbud page into the unbuddied/buddied lists in the pool
+ * @lru:        links the zbud page into the lru list in the pool
+ * @first_chunks:       the size of the first buddy in chunks, 0 if free
+ * @last_chunks:        the size of the last buddy in chunks, 0 if free
+ * @under_reclaim:      set by zbud_reclaim_page() while it issues the eviction
+ *                      callbacks for this page; while set, zbud_free() only
+ *                      clears first|last_chunks and leaves the list handling
+ *                      and the freeing of the page to zbud_reclaim_page()
+ */
+struct zbud_header {
+        struct list_head buddy;
+        struct list_head lru;
+        unsigned int first_chunks;
+        unsigned int last_chunks;
+        bool under_reclaim;
+};
+/*****************
+ * Helpers
+*****************/
+/*
+ * Just to make the code easier to read: names the two buddy slots of a zbud
+ * page.  FIRST is the buddy placed right after the header, LAST is the buddy
+ * placed at the end of the page.
+ */
+enum buddy {
+        FIRST,
+        LAST
+};
+/* Rounds a byte count up to a whole number of zbud chunks */
+static int size_to_chunks(int size)
+{
+        int padded = size + CHUNK_SIZE - 1;
+        return padded >> CHUNK_SHIFT;
+}
+/* Steps an index over the unbuddied freelists from _start up to NCHUNKS - 1 */
+#define for_each_unbuddied_list(_idx, _start) \
+        for ((_idx) = (_start); (_idx) < NCHUNKS; (_idx)++)
+/*
+ * Prepares a freshly allocated page for use as a zbud page: the header at
+ * the page's address gets both buddies marked free, its list links
+ * initialized and the reclaim flag cleared.  Returns that header.
+ */
+static struct zbud_header *init_zbud_page(struct page *page)
+{
+        struct zbud_header *hdr;
+        hdr = page_address(page);
+        INIT_LIST_HEAD(&hdr->lru);
+        INIT_LIST_HEAD(&hdr->buddy);
+        hdr->under_reclaim = false;
+        hdr->last_chunks = 0;
+        hdr->first_chunks = 0;
+        return hdr;
+}
+/* Hands the page that holds the given zbud header back to the page allocator */
+static void free_zbud_page(struct zbud_header *zhdr)
+{
+        struct page *page = virt_to_page(zhdr);
+        __free_page(page);
+}
+/*
+ * Builds the handle for one buddy of a zbud page.
+ * first|last_chunks are read here, so the pool lock should be held.
+ */
+static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
+{
+        unsigned long base = (unsigned long)zhdr;
+        /*
+         * The handle is currently nothing more than the address of the
+         * buddy's data, though that might not always be the case; users
+         * should treat it as opaque.
+         */
+        if (bud == FIRST)
+                return base + ZHDR_SIZE_ALIGNED;        /* step over the header */
+        /* LAST buddy: its data runs up to the end of the page */
+        return base + PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
+}
+/* Maps a handle back to the header at the start of the page containing it */
+static struct zbud_header *handle_to_zbud_header(unsigned long handle)
+{
+        unsigned long page_start = handle & PAGE_MASK;
+        return (struct zbud_header *)page_start;
+}
+/* Counts the chunks of a zbud page used by neither the header nor a buddy */
+static int num_free_chunks(struct zbud_header *zhdr)
+{
+        /*
+         * A free buddy has a length of zero, so summing both lengths gives
+         * the chunks in use without any branching.
+         */
+        unsigned int used = zhdr->first_chunks + zhdr->last_chunks;
+        /* one more chunk is always taken by the zbud header */
+        return NCHUNKS - 1 - used;
+}
+/*****************
+ * API Functions
+*****************/
+/**
+ * zbud_create_pool() - create a new zbud pool
+ * @gfp:        gfp flags used for allocating the zbud pool structure
+ * @ops:        user-defined operations to attach to the zbud pool
+ *
+ * The new pool starts out with empty unbuddied, buddied and LRU lists and a
+ * page count of zero.
+ *
+ * Return: pointer to the new zbud pool or NULL if the metadata allocation
+ * failed.
+ */
+struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
+{
+        struct zbud_pool *new_pool = kmalloc(sizeof(*new_pool), gfp);
+        int idx;
+        if (new_pool == NULL)
+                return NULL;
+        new_pool->ops = ops;
+        new_pool->pages_nr = 0;
+        INIT_LIST_HEAD(&new_pool->lru);
+        INIT_LIST_HEAD(&new_pool->buddied);
+        for (idx = 0; idx < NCHUNKS; idx++)
+                INIT_LIST_HEAD(&new_pool->unbuddied[idx]);
+        spin_lock_init(&new_pool->lock);
+        return new_pool;
+}
+/**
+ * zbud_destroy_pool() - destroys an existing zbud pool
+ * @pool:       the zbud pool to be destroyed
+ *
+ * The pool should be emptied before this function is called.  Only the pool
+ * structure itself is released here; this function does not walk the pool's
+ * lists or free any zbud pages still linked into them.
+ */
+void zbud_destroy_pool(struct zbud_pool *pool)
+{
+        kfree(pool);
+}
+/**
+ * zbud_alloc() - allocates a region of a given size
+ * @pool:       zbud pool from which to allocate
+ * @size:       size in bytes of the desired allocation
+ * @gfp:        gfp flags used if the pool needs to grow
+ * @handle:     handle of the new allocation
+ *
+ * This function will attempt to find a free region in the pool large enough to
+ * satisfy the allocation request.  A search of the unbuddied lists is
+ * performed first. If no suitable free region is found, then a new page is
+ * allocated and added to the pool to satisfy the request.
+ *
+ * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
+ * as zbud pool pages.
+ *
+ * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
+ * gfp arguments are invalid, -ENOSPC if the size is larger than the space
+ * left in a page after the zbud header, or -ENOMEM if the pool was unable to
+ * allocate a new page.
+ */
+int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
+                        unsigned long *handle)
+{
+        int chunks, i, freechunks;
+        struct zbud_header *zhdr = NULL;
+        enum buddy bud;
+        struct page *page;
+        if (size <= 0 || gfp & __GFP_HIGHMEM)
+                return -EINVAL;
+        if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
+                return -ENOSPC;
+        chunks = size_to_chunks(size);
+        spin_lock(&pool->lock);
+        /*
+         * First, try to find an unbuddied zbud page.  The search starts at the
+         * list for pages with exactly 'chunks' free chunks and moves on to
+         * lists for pages with more free chunks.
+         */
+        zhdr = NULL;
+        for_each_unbuddied_list(i, chunks) {
+                if (!list_empty(&pool->unbuddied[i])) {
+                        zhdr = list_first_entry(&pool->unbuddied[i],
+                                        struct zbud_header, buddy);
+                        list_del(&zhdr->buddy);
+                        if (zhdr->first_chunks == 0)
+                                bud = FIRST;
+                        else
+                                bud = LAST;
+                        goto found;
+                }
+        }
+        /*
+         * Couldn't find unbuddied zbud page, create new one.  The pool lock is
+         * dropped around alloc_page() and retaken before the pool is updated.
+         */
+        spin_unlock(&pool->lock);
+        page = alloc_page(gfp);
+        if (!page)
+                return -ENOMEM;
+        spin_lock(&pool->lock);
+        pool->pages_nr++;
+        zhdr = init_zbud_page(page);
+        bud = FIRST;
+found:
+        if (bud == FIRST)
+                zhdr->first_chunks = chunks;
+        else
+                zhdr->last_chunks = chunks;
+        if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
+                /* Add to unbuddied list */
+                freechunks = num_free_chunks(zhdr);
+                list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+        } else {
+                /* Add to buddied list */
+                list_add(&zhdr->buddy, &pool->buddied);
+        }
+        /* Add/move zbud page to beginning of LRU */
+        if (!list_empty(&zhdr->lru))
+                list_del(&zhdr->lru);
+        list_add(&zhdr->lru, &pool->lru);
+        *handle = encode_handle(zhdr, bud);
+        spin_unlock(&pool->lock);
+        return 0;
+}
+/**
+ * zbud_free() - frees the allocation associated with the given handle
+ * @pool:       pool in which the allocation resided
+ * @handle:     handle associated with the allocation returned by zbud_alloc()
+ *
+ * In the case that the zbud page in which the allocation resides is under
+ * reclaim, as indicated by the under_reclaim flag in the zbud header being
+ * set, this function only sets the first|last_chunks to 0.  The page is
+ * actually freed once both buddies are evicted (see zbud_reclaim_page()
+ * below).
+ */
+void zbud_free(struct zbud_pool *pool, unsigned long handle)
+{
+        struct zbud_header *zhdr;
+        int freechunks;
+        spin_lock(&pool->lock);
+        zhdr = handle_to_zbud_header(handle);
+        /*
+         * The first buddy starts right after the header, so for the first
+         * buddy the handle minus the header size is page aligned.
+         */
+        if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
+                zhdr->last_chunks = 0;
+        else
+                zhdr->first_chunks = 0;
+        if (zhdr->under_reclaim) {
+                /* zbud page is under reclaim, reclaim will free */
+                spin_unlock(&pool->lock);
+                return;
+        }
+        /* Remove from existing buddy list */
+        list_del(&zhdr->buddy);
+        if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+                /* zbud page is empty, free */
+                list_del(&zhdr->lru);
+                free_zbud_page(zhdr);
+                pool->pages_nr--;
+        } else {
+                /* Add to unbuddied list */
+                freechunks = num_free_chunks(zhdr);
+                list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+        }
+        spin_unlock(&pool->lock);
+}
+/* Fetches the entry at the tail of a list, i.e. the one linked at head->prev */
+#define list_tail_entry(_head, _type, _member) \
+        list_entry((_head)->prev, _type, _member)
+/**
+ * zbud_reclaim_page() - evicts allocations from a pool page and frees it
+ * @pool:       pool from which a page will attempt to be evicted
+ * @retries:    number of pages on the LRU list for which eviction will
+ *              be attempted before failing
+ *
+ * zbud reclaim is different from normal system reclaim in that the reclaim is
+ * done from the bottom, up.  This is because only the bottom layer, zbud, has
+ * information on how the allocations are organized within each zbud page. This
+ * has the potential to create interesting locking situations between zbud and
+ * the user, however.
+ *
+ * To avoid these, this is how zbud_reclaim_page() should be called:
+ * The user detects a page should be reclaimed and calls zbud_reclaim_page().
+ * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
+ * the user-defined eviction handler with the pool and handle as arguments.
+ *
+ * If the handle can not be evicted, the eviction handler should return
+ * non-zero. zbud_reclaim_page() will add the zbud page back to the
+ * appropriate list and try the next zbud page on the LRU up to
+ * a user defined number of retries.
+ *
+ * If the handle is successfully evicted, the eviction handler should
+ * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
+ * contains logic to delay freeing the page if the page is under reclaim,
+ * as indicated by the under_reclaim flag in the zbud header being set.
+ *
+ * If all buddies in the zbud page are successfully evicted, then the
+ * zbud page can be freed.
+ *
+ * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
+ * no pages to evict, an eviction handler is not registered or retries is 0,
+ * -EAGAIN if the retry limit was hit.
+ */
+int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
+{
+        int i, ret, freechunks;
+        struct zbud_header *zhdr;
+        unsigned long first_handle = 0, last_handle = 0;
+        spin_lock(&pool->lock);
+        if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
+                        retries == 0) {
+                spin_unlock(&pool->lock);
+                return -EINVAL;
+        }
+        for (i = 0; i < retries; i++) {
+                zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
+                list_del(&zhdr->lru);
+                list_del(&zhdr->buddy);
+                /* Protect zbud page against free */
+                zhdr->under_reclaim = true;
+                /*
+                 * We need to encode the handles before unlocking, since we
+                 * can race with free that will set (first|last)_chunks to 0
+                 */
+                first_handle = 0;
+                last_handle = 0;
+                if (zhdr->first_chunks)
+                        first_handle = encode_handle(zhdr, FIRST);
+                if (zhdr->last_chunks)
+                        last_handle = encode_handle(zhdr, LAST);
+                spin_unlock(&pool->lock);
+                /* Issue the eviction callback(s), without the pool lock held */
+                if (first_handle) {
+                        ret = pool->ops->evict(pool, first_handle);
+                        if (ret)
+                                goto next;
+                }
+                if (last_handle) {
+                        ret = pool->ops->evict(pool, last_handle);
+                        if (ret)
+                                goto next;
+                }
+next:
+                spin_lock(&pool->lock);
+                zhdr->under_reclaim = false;
+                if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
+                        /*
+                         * Both buddies are now free, free the zbud page and
+                         * return success.
+                         */
+                        free_zbud_page(zhdr);
+                        pool->pages_nr--;
+                        spin_unlock(&pool->lock);
+                        return 0;
+                } else if (zhdr->first_chunks == 0 ||
+                                zhdr->last_chunks == 0) {
+                        /* add to unbuddied list */
+                        freechunks = num_free_chunks(zhdr);
+                        list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
+                } else {
+                        /* add to buddied list */
+                        list_add(&zhdr->buddy, &pool->buddied);
+                }
+                /* add to beginning of LRU */
+                list_add(&zhdr->lru, &pool->lru);
+        }
+        spin_unlock(&pool->lock);
+        return -EAGAIN;
+}
+/**
+ * zbud_map() - maps the allocation associated with the given handle
+ * @pool:       pool in which the allocation resides
+ * @handle:     handle associated with the allocation to be mapped
+ *
+ * For zbud this is trivial: the handle is simply cast to a pointer.  Other
+ * allocators implementing this allocation API could encode more complex
+ * information in the handle and could create temporary mappings to make the
+ * data accessible to the user.
+ *
+ * Returns: a pointer to the mapped allocation
+ */
+void *zbud_map(struct zbud_pool *pool, unsigned long handle)
+{
+        void *data = (void *)handle;
+        return data;
+}
+/**
+ * zbud_unmap() - unmaps the allocation associated with the given handle
+ * @pool:       pool in which the allocation resides
+ * @handle:     handle associated with the allocation to be unmapped
+ *
+ * This function is intentionally empty: zbud_map() creates no mapping state,
+ * so there is nothing to undo here.
+ */
+void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
+{
+}
+/**
+ * zbud_get_pool_size() - gets the zbud pool size in pages
+ * @pool:       pool whose size is being queried
+ *
+ * Returns: size in pages of the given pool.  The pool lock need not be
+ * taken to access pages_nr.
+ *
+ * NOTE(review): pages_nr is a u64 that is read here without the lock while
+ * other paths modify it under the lock; confirm that a stale or torn read is
+ * acceptable to callers, notably on 32-bit architectures.
+ */
+u64 zbud_get_pool_size(struct zbud_pool *pool)
+{
+        return pool->pages_nr;
+}
+/* Module init: checks the header size at build time and logs a message */
+static int __init init_zbud(void)
+{
+        /* struct zbud_header must not outgrow the chunk reserved for it */
+        BUILD_BUG_ON(ZHDR_SIZE_ALIGNED < sizeof(struct zbud_header));
+        pr_info("loaded\n");
+        return 0;
+}
+static void __exit exit_zbud(void)
+{
+        pr_info("unloaded\n");  /* the only action taken on module exit */
+}
+module_init(init_zbud);
+module_exit(exit_zbud);
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
+MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");

diff --git a/mm/Kconfig b/mm/Kconfig index 7e28ecfa8aa4..45503ed5f3aa 100644 --- a/mm/Kconfig +++ b/mm/Kconfig
@@ -478,6 +478,16 @@ config FRONTSWAP
478		478
479	If unsure, say Y to enable frontswap.	479	If unsure, say Y to enable frontswap.
480		480
		481	config ZBUD
		482	tristate
		483	default n
		484	help
		485	A special purpose allocator for storing compressed pages.
		486	It is designed to store up to two compressed pages per physical
		487	page. While this design limits storage density, it has simple and
		488	deterministic reclaim properties that make it preferable to a higher
		489	density approach when reclaim will be used.
		490
481	config MEM_SOFT_DIRTY	491	config MEM_SOFT_DIRTY
482	bool "Track memory changes"	492	bool "Track memory changes"
483	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY	493	depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY


diff --git a/mm/Makefile b/mm/Makefile index 72c5acb9345f..95f0197ce3d3 100644 --- a/mm/Makefile +++ b/mm/Makefile
@@ -58,3 +58,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
58	obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o	58	obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
59	obj-$(CONFIG_CLEANCACHE) += cleancache.o	59	obj-$(CONFIG_CLEANCACHE) += cleancache.o
60	obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o	60	obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
		61	obj-$(CONFIG_ZBUD) += zbud.o


diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 000000000000..9bb4710e3589 --- /dev/null +++ b/mm/zbud.c
@@ -0,0 +1,527 @@
		1	/*
		2	* zbud.c
		3	*
		4	* Copyright (C) 2013, Seth Jennings, IBM
		5	*
		6	* Concepts based on zcache internal zbud allocator by Dan Magenheimer.
		7	*
		8	* zbud is a special purpose allocator for storing compressed pages. Contrary
		9	* to what its name may suggest, zbud is not a buddy allocator, but rather an
		10	* allocator that "buddies" two compressed pages together in a single memory
		11	* page.
		12	*
		13	* While this design limits storage density, it has simple and deterministic
		14	* reclaim properties that make it preferable to a higher density approach when
		15	* reclaim will be used.
		16	*
		17	* zbud works by storing compressed pages, or "zpages", together in pairs in a
		18	* single memory page called a "zbud page". The first buddy is "left
		19	* justified" at the beginning of the zbud page, and the last buddy is "right
		20	* justified" at the end of the zbud page. The benefit is that if either
		21	* buddy is freed, the freed buddy space, coalesced with whatever slack space
		22	* that existed between the buddies, results in the largest possible free region
		23	* within the zbud page.
		24	*
		25	* zbud also provides an attractive lower bound on density. The ratio of zpages
		26	* to zbud pages can not be less than 1. This ensures that zbud can never "do
		27	* harm" by using more pages to store zpages than the uncompressed zpages would
		28	* have used on their own.
		29	*
		30	* zbud pages are divided into "chunks". The size of the chunks is fixed at
		31	* compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
		32	* into chunks allows organizing unbuddied zbud pages into a manageable number
		33	* of unbuddied lists according to the number of free chunks available in the
		34	* zbud page.
		35	*
		36	* The zbud API differs from that of conventional allocators in that the
		37	* allocation function, zbud_alloc(), returns an opaque handle to the user,
		38	* not a dereferenceable pointer. The user must map the handle using
		39	* zbud_map() in order to get a usable pointer by which to access the
		40	* allocation data and unmap the handle with zbud_unmap() when operations
		41	* on the allocation data are complete.
		42	*/
		43
		44	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
		45
		46	#include <linux/atomic.h>
		47	#include <linux/list.h>
		48	#include <linux/mm.h>
		49	#include <linux/module.h>
		50	#include <linux/preempt.h>
		51	#include <linux/slab.h>
		52	#include <linux/spinlock.h>
		53	#include <linux/zbud.h>
		54
		55	/*****************
		56	* Structures
		57	*****************/
		58	/*
		59	* NCHUNKS_ORDER determines the internal allocation granularity, effectively
		60	* adjusting internal fragmentation. It also determines the number of
		61	* freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
		62	* allocation granularity will be in chunks of size PAGE_SIZE/64, and there
		63	* will be 64 freelists per pool.
		64	*/
		65	#define NCHUNKS_ORDER 6
		66
		67	#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER)
		68	#define CHUNK_SIZE (1 << CHUNK_SHIFT)
		69	#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT)
		70	#define ZHDR_SIZE_ALIGNED CHUNK_SIZE
		71
		72	/**
		73	* struct zbud_pool - stores metadata for each zbud pool
		74	* @lock: protects all pool fields and first|last_chunks fields of any
		75	* zbud page in the pool
		76	* @unbuddied: array of lists tracking zbud pages that only contain one buddy;
		77	* the lists each zbud page is added to depends on the size of
		78	* its free region.
		79	* @buddied: list tracking the zbud pages that contain two buddies;
		80	* these zbud pages are full
		81	* @lru: list tracking the zbud pages in LRU order by most recently
		82	* added buddy.
		83	* @pages_nr: number of zbud pages in the pool.
		84	* @ops: pointer to a structure of user defined operations specified at
		85	* pool creation time.
		86	*
		87	* This structure is allocated at pool creation time and maintains metadata
		88	* pertaining to a particular zbud pool.
		89	*/
		90	struct zbud_pool {
		91	spinlock_t lock;
		92	struct list_head unbuddied[NCHUNKS];
		93	struct list_head buddied;
		94	struct list_head lru;
		95	u64 pages_nr;
		96	struct zbud_ops *ops;
		97	};
		98
		99	/*
		100	* struct zbud_header - zbud page metadata occupying the first chunk of each
		101	* zbud page.
		102	* @buddy: links the zbud page into the unbuddied/buddied lists in the pool
		103	* @lru: links the zbud page into the lru list in the pool
		104	* @first_chunks: the size of the first buddy in chunks, 0 if free
		105	* @last_chunks: the size of the last buddy in chunks, 0 if free
		106	*/
		107	struct zbud_header {
		108	struct list_head buddy;
		109	struct list_head lru;
		110	unsigned int first_chunks;
		111	unsigned int last_chunks;
		112	bool under_reclaim;
		113	};
		114
		115	/*****************
		116	* Helpers
		117	*****************/
		118	/* Just to make the code easier to read */
		119	enum buddy {
		120	FIRST,
		121	LAST
		122	};
		123
		124	/* Converts an allocation size in bytes to size in zbud chunks */
		125	static int size_to_chunks(int size)
		126	{
		127	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
		128	}
		129
		130	#define for_each_unbuddied_list(_iter, _begin) \
		131	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
		132
		133	/* Initializes the zbud header of a newly allocated zbud page */
		134	static struct zbud_header *init_zbud_page(struct page *page)
		135	{
		136	struct zbud_header *zhdr = page_address(page);
		137	zhdr->first_chunks = 0;
		138	zhdr->last_chunks = 0;
		139	INIT_LIST_HEAD(&zhdr->buddy);
		140	INIT_LIST_HEAD(&zhdr->lru);
		141	zhdr->under_reclaim = 0;
		142	return zhdr;
		143	}
		144
		145	/* Resets the struct page fields and frees the page */
		146	static void free_zbud_page(struct zbud_header *zhdr)
		147	{
		148	__free_page(virt_to_page(zhdr));
		149	}
		150
		151	/*
		152	* Encodes the handle of a particular buddy within a zbud page
		153	* Pool lock should be held as this function accesses first|last_chunks
		154	*/
		155	static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
		156	{
		157	unsigned long handle;
		158
		159	/*
		160	* For now, the encoded handle is actually just the pointer to the data
		161	* but this might not always be the case. A little information hiding.
		162	* Add CHUNK_SIZE to the handle if it is the first allocation to jump
		163	* over the zbud header in the first chunk.
		164	*/
		165	handle = (unsigned long)zhdr;
		166	if (bud == FIRST)
		167	/* skip over zbud header */
		168	handle += ZHDR_SIZE_ALIGNED;
		169	else /* bud == LAST */
		170	handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
		171	return handle;
		172	}
		173
		174	/* Returns the zbud page where a given handle is stored */
		175	static struct zbud_header *handle_to_zbud_header(unsigned long handle)
		176	{
		177	return (struct zbud_header *)(handle & PAGE_MASK);
		178	}
		179
		180	/* Returns the number of free chunks in a zbud page */
		181	static int num_free_chunks(struct zbud_header *zhdr)
		182	{
		183	/*
		184	* Rather than branch for different situations, just use the fact that
		185	* free buddies have a length of zero to simplify everything. -1 at the
		186	* end for the zbud header.
		187	*/
		188	return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
		189	}
		190
		191	/*****************
		192	* API Functions
		193	*****************/
		194	/**
		195	* zbud_create_pool() - create a new zbud pool
		196	* @gfp: gfp flags when allocating the zbud pool structure
		197	* @ops: user-defined operations for the zbud pool
		198	*
		199	* Return: pointer to the new zbud pool or NULL if the metadata allocation
		200	* failed.
		201	*/
		202	struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
		203	{
		204	struct zbud_pool *pool;
		205	int i;
		206
		207	pool = kmalloc(sizeof(struct zbud_pool), gfp);
		208	if (!pool)
		209	return NULL;
		210	spin_lock_init(&pool->lock);
		211	for_each_unbuddied_list(i, 0)
		212	INIT_LIST_HEAD(&pool->unbuddied[i]);
		213	INIT_LIST_HEAD(&pool->buddied);
		214	INIT_LIST_HEAD(&pool->lru);
		215	pool->pages_nr = 0;
		216	pool->ops = ops;
		217	return pool;
		218	}
		219
		220	/**
		221	* zbud_destroy_pool() - destroys an existing zbud pool
		222	* @pool: the zbud pool to be destroyed
		223	*
		224	* The pool should be emptied before this function is called.
		225	*/
		226	void zbud_destroy_pool(struct zbud_pool *pool)
		227	{
		228	kfree(pool);
		229	}
		230
		231	/**
		232	* zbud_alloc() - allocates a region of a given size
		233	* @pool: zbud pool from which to allocate
		234	* @size: size in bytes of the desired allocation
		235	* @gfp: gfp flags used if the pool needs to grow
		236	* @handle: handle of the new allocation
		237	*
		238	* This function will attempt to find a free region in the pool large enough to
		239	* satisfy the allocation request. A search of the unbuddied lists is
		240	* performed first. If no suitable free region is found, then a new page is
		241	* allocated and added to the pool to satisfy the request.
		242	*
		243	* gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
		244	* as zbud pool pages.
		245	*
		246	* Return: 0 if success and handle is set, otherwise -EINVAL is the size or
		247	* gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
		248	* a new page.
		249	*/
		250	int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp,
		251	unsigned long *handle)
		252	{
		253	int chunks, i, freechunks;
		254	struct zbud_header *zhdr = NULL;
		255	enum buddy bud;
		256	struct page *page;
		257
		258	if (size <= 0 || gfp & __GFP_HIGHMEM)
		259	return -EINVAL;
		260	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED)
		261	return -ENOSPC;
		262	chunks = size_to_chunks(size);
		263	spin_lock(&pool->lock);
		264
		265	/* First, try to find an unbuddied zbud page. */
		266	zhdr = NULL;
		267	for_each_unbuddied_list(i, chunks) {
		268	if (!list_empty(&pool->unbuddied[i])) {
		269	zhdr = list_first_entry(&pool->unbuddied[i],
		270	struct zbud_header, buddy);
		271	list_del(&zhdr->buddy);
		272	if (zhdr->first_chunks == 0)
		273	bud = FIRST;
		274	else
		275	bud = LAST;
		276	goto found;
		277	}
		278	}
		279
		280	/* Couldn't find unbuddied zbud page, create new one */
		281	spin_unlock(&pool->lock);
		282	page = alloc_page(gfp);
		283	if (!page)
		284	return -ENOMEM;
		285	spin_lock(&pool->lock);
		286	pool->pages_nr++;
		287	zhdr = init_zbud_page(page);
		288	bud = FIRST;
		289
		290	found:
		291	if (bud == FIRST)
		292	zhdr->first_chunks = chunks;
		293	else
		294	zhdr->last_chunks = chunks;
		295
		296	if (zhdr->first_chunks == 0 \|\| zhdr->last_chunks == 0) {
		297	/* Add to unbuddied list */
		298	freechunks = num_free_chunks(zhdr);
		299	list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		300	} else {
		301	/* Add to buddied list */
		302	list_add(&zhdr->buddy, &pool->buddied);
		303	}
		304
		305	/* Add/move zbud page to beginning of LRU */
		306	if (!list_empty(&zhdr->lru))
		307	list_del(&zhdr->lru);
		308	list_add(&zhdr->lru, &pool->lru);
		309
		310	*handle = encode_handle(zhdr, bud);
		311	spin_unlock(&pool->lock);
		312
		313	return 0;
		314	}
		315
		316	/**
		317	* zbud_free() - frees the allocation associated with the given handle
		318	* @pool: pool in which the allocation resided
		319	* @handle: handle associated with the allocation returned by zbud_alloc()
		320	*
		321	* In the case that the zbud page in which the allocation resides is under
		322	* reclaim, as indicated by the PG_reclaim flag being set, this function
		323	* only sets the first\|last_chunks to 0. The page is actually freed
		324	* once both buddies are evicted (see zbud_reclaim_page() below).
		325	*/
		326	void zbud_free(struct zbud_pool *pool, unsigned long handle)
		327	{
		328	struct zbud_header *zhdr;
		329	int freechunks;
		330
		331	spin_lock(&pool->lock);
		332	zhdr = handle_to_zbud_header(handle);
		333
		334	/* If first buddy, handle will be page aligned */
		335	if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
		336	zhdr->last_chunks = 0;
		337	else
		338	zhdr->first_chunks = 0;
		339
		340	if (zhdr->under_reclaim) {
		341	/* zbud page is under reclaim, reclaim will free */
		342	spin_unlock(&pool->lock);
		343	return;
		344	}
		345
		346	/* Remove from existing buddy list */
		347	list_del(&zhdr->buddy);
		348
		349	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		350	/* zbud page is empty, free */
		351	list_del(&zhdr->lru);
		352	free_zbud_page(zhdr);
		353	pool->pages_nr--;
		354	} else {
		355	/* Add to unbuddied list */
		356	freechunks = num_free_chunks(zhdr);
		357	list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		358	}
		359
		360	spin_unlock(&pool->lock);
		361	}
		362
		/* Entry that embeds the last node (head->prev) of the list headed by @ptr. */
		#define list_tail_entry(ptr, type, member) \
			list_entry((ptr)->prev, type, member)
		365
		366	/**
		367	* zbud_reclaim_page() - evicts allocations from a pool page and frees it
		368	* @pool: pool from which a page will attempt to be evicted
		369	* @retires: number of pages on the LRU list for which eviction will
		370	* be attempted before failing
		371	*
		372	* zbud reclaim is different from normal system reclaim in that the reclaim is
		373	* done from the bottom, up. This is because only the bottom layer, zbud, has
		374	* information on how the allocations are organized within each zbud page. This
		375	* has the potential to create interesting locking situations between zbud and
		376	* the user, however.
		377	*
		378	* To avoid these, this is how zbud_reclaim_page() should be called:
		379
		380	* The user detects a page should be reclaimed and calls zbud_reclaim_page().
		381	* zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
		382	* the user-defined eviction handler with the pool and handle as arguments.
		383	*
		384	* If the handle can not be evicted, the eviction handler should return
		385	* non-zero. zbud_reclaim_page() will add the zbud page back to the
		386	* appropriate list and try the next zbud page on the LRU up to
		387	* a user defined number of retries.
		388	*
		389	* If the handle is successfully evicted, the eviction handler should
		390	* return 0 _and_ should have called zbud_free() on the handle. zbud_free()
		391	* contains logic to delay freeing the page if the page is under reclaim,
		392	* as indicated by the setting of the PG_reclaim flag on the underlying page.
		393	*
		394	* If all buddies in the zbud page are successfully evicted, then the
		395	* zbud page can be freed.
		396	*
		397	* Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
		398	* no pages to evict or an eviction handler is not registered, -EAGAIN if
		399	* the retry limit was hit.
		400	*/
		401	int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
		402	{
		403	int i, ret, freechunks;
		404	struct zbud_header *zhdr;
		405	unsigned long first_handle = 0, last_handle = 0;
		406
		407	spin_lock(&pool->lock);
		408	if (!pool->ops \|\| !pool->ops->evict \|\| list_empty(&pool->lru) \|\|
		409	retries == 0) {
		410	spin_unlock(&pool->lock);
		411	return -EINVAL;
		412	}
		413	for (i = 0; i < retries; i++) {
		414	zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
		415	list_del(&zhdr->lru);
		416	list_del(&zhdr->buddy);
		417	/* Protect zbud page against free */
		418	zhdr->under_reclaim = true;
		419	/*
		420	* We need encode the handles before unlocking, since we can
		421	* race with free that will set (first\|last)_chunks to 0
		422	*/
		423	first_handle = 0;
		424	last_handle = 0;
		425	if (zhdr->first_chunks)
		426	first_handle = encode_handle(zhdr, FIRST);
		427	if (zhdr->last_chunks)
		428	last_handle = encode_handle(zhdr, LAST);
		429	spin_unlock(&pool->lock);
		430
		431	/* Issue the eviction callback(s) */
		432	if (first_handle) {
		433	ret = pool->ops->evict(pool, first_handle);
		434	if (ret)
		435	goto next;
		436	}
		437	if (last_handle) {
		438	ret = pool->ops->evict(pool, last_handle);
		439	if (ret)
		440	goto next;
		441	}
		442	next:
		443	spin_lock(&pool->lock);
		444	zhdr->under_reclaim = false;
		445	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		446	/*
		447	* Both buddies are now free, free the zbud page and
		448	* return success.
		449	*/
		450	free_zbud_page(zhdr);
		451	pool->pages_nr--;
		452	spin_unlock(&pool->lock);
		453	return 0;
		454	} else if (zhdr->first_chunks == 0 \|\|
		455	zhdr->last_chunks == 0) {
		456	/* add to unbuddied list */
		457	freechunks = num_free_chunks(zhdr);
		458	list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
		459	} else {
		460	/* add to buddied list */
		461	list_add(&zhdr->buddy, &pool->buddied);
		462	}
		463
		464	/* add to beginning of LRU */
		465	list_add(&zhdr->lru, &pool->lru);
		466	}
		467	spin_unlock(&pool->lock);
		468	return -EAGAIN;
		469	}
		470
		/**
		 * zbud_map() - maps the allocation associated with the given handle
		 * @pool:	pool in which the allocation resides
		 * @handle:	handle associated with the allocation to be mapped
		 *
		 * While trivial for zbud, the mapping functions for other allocators
		 * implementing this allocation API could have more complex information
		 * encoded in the handle and could create temporary mappings to make the
		 * data accessible to the user.  Here the handle value is simply returned
		 * as a pointer; @pool is not consulted.
		 *
		 * Return: a pointer to the mapped allocation
		 */
		void *zbud_map(struct zbud_pool *pool, unsigned long handle)
		{
			return (void *)(handle);
		}
		487
		/**
		 * zbud_unmap() - unmaps the allocation associated with the given handle
		 * @pool:	pool in which the allocation resides
		 * @handle:	handle associated with the allocation to be unmapped
		 *
		 * This function has an empty body in zbud.
		 */
		void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
		{
			/* Intentionally empty. */
		}
		496
		497	/**
		498	* zbud_get_pool_size() - gets the zbud pool size in pages
		499	* @pool: pool whose size is being queried
		500	*
		501	* Returns: size in pages of the given pool. The pool lock need not be
		502	* taken to access pages_nr.
		503	*/
		504	u64 zbud_get_pool_size(struct zbud_pool *pool)
		505	{
		506	return pool->pages_nr;
		507	}
		508
		509	static int __init init_zbud(void)
		510	{
		511	/* Make sure the zbud header will fit in one chunk */
		512	BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
		513	pr_info("loaded\n");
		514	return 0;
		515	}
		516
		517	static void __exit exit_zbud(void)
		518	{
		519	pr_info("unloaded\n");
		520	}
		521
		522	module_init(init_zbud);
		523	module_exit(exit_zbud);
		524
		525	MODULE_LICENSE("GPL");
		526	MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
		527	MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");