percpu: finer grained locking to break deadlock and allow atomic free

Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths were protected by single mutex. All percpu allocations have been from GFP_KERNEL-safe context and the original allocator had this assumption too. However, by protecting both alloc and free paths with the same mutex, the new allocator creates free -> alloc -> GFP_KERNEL dependency which the original allocator didn't have. This can lead to deadlock if free is called from FS or IO paths. Also, in general, allocators are expected to allow free to be called from atomic context.

This patch implements finer grained locking to break the deadlock and allow atomic free. For details, please read the "Synchronization rules" comment.

While at it, also add CONTEXT: to function comments to describe which context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>
author: Tejun Heo <tj@kernel.org> 2009-03-06 10:44:13 -0500
committer: Tejun Heo <tj@kernel.org> 2009-03-07 00:46:35 -0500
commit: ccea34b5d0fbab081496d1860f31acee99fa8a6d (patch)
tree: e7066b5dde0e83a216768569e57cd71cc83fe912
parent: a56dbddf06b653ef9c04ca3767f260fd31ccebab (diff)
1 files changed, 124 insertions, 37 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419119da..bfe6a3afaf45 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 /*
- * One mutex to rule them all.
+ * Synchronization rules.
- *
+ *
- * The following mutex is grabbed in the outermost public alloc/free
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
- * interface functions and released only when the operation is
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
- * complete.  As such, every function in this file other than the
+ * The latter is a spinlock and protects the index data structures -
- * outermost functions are called under pcpu_mutex.
+ * chunk slots, rbtree, chunks and area maps in chunks.
- *
+ *
- * It can easily be switched to use spinlock such that only the area
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
- * allocation and page population commit are protected with it doing
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
- * actual [de]allocation without holding any lock.  However, given
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
- * what this allocator does, I think it's better to let them run
+ *
- * sequentially.
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
 */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);  /* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);      /* protects index data structures */
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
 * kzalloc() is used; otherwise, vmalloc() is used.  The returned
 * memory is always zeroed.
 *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
 * RETURNS:
 * Pointer to the allocated area on success, NULL on failure.
 */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
 * New slot according to the changed state is determined and @chunk is
 * moved to the slot.  Note that the reserved chunk is never put on
 * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
 */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
 * searchs for the chunk with the highest start address which isn't
 * beyond @addr.
 *
+ * CONTEXT:
+ * pcpu_lock.
+ *
 * RETURNS:
 * The address of the found chunk.
 */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 * @new: chunk to insert
 *
 * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
 */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 * A single allocation can split an area into three areas, so this
 * function makes sure that @chunk->map has at least two extra slots.
 *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
 * RETURNS:
 * 0 if noop, 1 if successfully extended, -errno on failure.
 */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
        if (chunk->map_alloc >= chunk->map_used + 2)
                return 0;
+        spin_unlock_irq(&pcpu_lock);
        new_alloc = PCPU_DFL_MAP_ALLOC;
        while (new_alloc < chunk->map_used + 2)
                new_alloc *= 2;
        new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-        if (!new)
+        if (!new) {
+                spin_lock_irq(&pcpu_lock);
                return -ENOMEM;
+        }
+        /*
+         * Acquire pcpu_lock and switch to new area map.  Only free
+         * could have happened inbetween, so map_used couldn't have
+         * grown.
+         */
+        spin_lock_irq(&pcpu_lock);
+        BUG_ON(new_alloc < chunk->map_used + 2);
        size = chunk->map_alloc * sizeof(chunk->map[0]);
        memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 * is inserted after the target block.
 *
 * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
 */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
                             int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 *
 * @chunk->map must have at least two free slots.
 *
+ * CONTEXT:
+ * pcpu_lock.
+ *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
 */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
 * from @chunk.  If @flush is true, vcache is flushed before unmapping
 * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
 */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
                                  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
 *
 * For each cpu, populate and map pages [@page_start,@page_end) into
 * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
 */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
- * Allocate percpu area of @size bytes aligned at @align.  Might
+ * Allocate percpu area of @size bytes aligned at @align.
- * sleep.  Might trigger writeouts.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-        void *ptr = NULL;
        struct pcpu_chunk *chunk;
        int slot, off;
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
                return NULL;
        }
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
        /* serve reserved allocations from the reserved chunk if available */
        if (reserved && pcpu_reserved_chunk) {
                chunk = pcpu_reserved_chunk;
                if (size > chunk->contig_hint ||
                    pcpu_extend_area_map(chunk) < 0)
-                        goto out_unlock;
+                        goto fail_unlock;
                off = pcpu_alloc_area(chunk, size, align);
                if (off >= 0)
                        goto area_found;
-                goto out_unlock;
+                goto fail_unlock;
        }
+restart:
        /* search through normal chunks */
        for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
                        if (size > chunk->contig_hint)
                                continue;
-                        if (pcpu_extend_area_map(chunk) < 0)
-                                goto out_unlock;
+                        switch (pcpu_extend_area_map(chunk)) {
+                        case 0:
+                                break;
+                        case 1:
+                                goto restart;   /* pcpu_lock dropped, restart */
+                        default:
+                                goto fail_unlock;
+                        }
                        off = pcpu_alloc_area(chunk, size, align);
                        if (off >= 0)
                                goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
        }
        /* hmmm... no space left, create a new chunk */
+        spin_unlock_irq(&pcpu_lock);
        chunk = alloc_pcpu_chunk();
        if (!chunk)
-                goto out_unlock;
+                goto fail_unlock_mutex;
+        spin_lock_irq(&pcpu_lock);
        pcpu_chunk_relocate(chunk, -1);
        pcpu_chunk_addr_insert(chunk);
+        goto restart;
-        off = pcpu_alloc_area(chunk, size, align);
-        if (off < 0)
-                goto out_unlock;
 area_found:
+        spin_unlock_irq(&pcpu_lock);
        /* populate, map and clear the area */
        if (pcpu_populate_chunk(chunk, off, size)) {
+                spin_lock_irq(&pcpu_lock);
                pcpu_free_area(chunk, off);
-                goto out_unlock;
+                goto fail_unlock;
        }
-        ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+        mutex_unlock(&pcpu_alloc_mutex);
-out_unlock:
-        mutex_unlock(&pcpu_mutex);
+        return __addr_to_pcpu_ptr(chunk->vm->addr + off);
-        return ptr;
+fail_unlock:
+        spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+        mutex_unlock(&pcpu_alloc_mutex);
+        return NULL;
 }
 /**
@@ -825,6 +897,9 @@ out_unlock:
 * Allocate percpu area of @size bytes aligned at @align.  Might
 * sleep.  Might trigger writeouts.
 *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
 * percpu area if arch has set it up; otherwise, allocation is served
 * from the same dynamic area.  Might sleep.  Might trigger writeouts.
 *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
 * @work: unused
 *
 * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
 */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
        struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
        struct pcpu_chunk *chunk, *next;
-        mutex_lock(&pcpu_mutex);
+        mutex_lock(&pcpu_alloc_mutex);
+        spin_lock_irq(&pcpu_lock);
        list_for_each_entry_safe(chunk, next, head, list) {
                WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
                list_move(&chunk->list, &todo);
        }
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irq(&pcpu_lock);
+        mutex_unlock(&pcpu_alloc_mutex);
        list_for_each_entry_safe(chunk, next, &todo, list) {
                pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
 * free_percpu - free percpu area
 * @ptr: pointer to area to free
 *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
 */
 void free_percpu(void *ptr)
 {
        void *addr = __pcpu_ptr_to_addr(ptr);
        struct pcpu_chunk *chunk;
+        unsigned long flags;
        int off;
        if (!ptr)
                return;
-        mutex_lock(&pcpu_mutex);
+        spin_lock_irqsave(&pcpu_lock, flags);
        chunk = pcpu_chunk_addr_search(addr);
        off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
                        }
        }
-        mutex_unlock(&pcpu_mutex);
+        spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
author	Tejun Heo <tj@kernel.org>	2009-03-06 10:44:13 -0500
committer	Tejun Heo <tj@kernel.org>	2009-03-07 00:46:35 -0500
commit	ccea34b5d0fbab081496d1860f31acee99fa8a6d (patch)
tree	e7066b5dde0e83a216768569e57cd71cc83fe912
parent	a56dbddf06b653ef9c04ca3767f260fd31ccebab (diff)

diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419119da..bfe6a3afaf45 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
62	#include <linux/pfn.h>	62	#include <linux/pfn.h>
63	#include <linux/rbtree.h>	63	#include <linux/rbtree.h>
64	#include <linux/slab.h>	64	#include <linux/slab.h>
		65	#include <linux/spinlock.h>
65	#include <linux/vmalloc.h>	66	#include <linux/vmalloc.h>
66	#include <linux/workqueue.h>	67	#include <linux/workqueue.h>
67		68
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
101	static int pcpu_reserved_chunk_limit;	102	static int pcpu_reserved_chunk_limit;
102		103
103	/*	104	/*
104	* One mutex to rule them all.	105	* Synchronization rules.
105	*	106	*
106	* The following mutex is grabbed in the outermost public alloc/free	107	* There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
107	* interface functions and released only when the operation is	108	* protects allocation/reclaim paths, chunks and chunk->page arrays.
108	* complete. As such, every function in this file other than the	109	* The latter is a spinlock and protects the index data structures -
109	* outermost functions are called under pcpu_mutex.	110	* chunk slots, rbtree, chunks and area maps in chunks.
110	*	111	*
111	* It can easily be switched to use spinlock such that only the area	112	* During allocation, pcpu_alloc_mutex is kept locked all the time and
112	* allocation and page population commit are protected with it doing	113	* pcpu_lock is grabbed and released as necessary. All actual memory
113	* actual [de]allocation without holding any lock. However, given	114	* allocations are done using GFP_KERNEL with pcpu_lock released.
114	* what this allocator does, I think it's better to let them run	115	*
115	* sequentially.	116	* Free path accesses and alters only the index data structures, so it
		117	* can be safely called from atomic context. When memory needs to be
		118	* returned to the system, free path schedules reclaim_work which
		119	* grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
		120	* reclaimed, release both locks and frees the chunks. Note that it's
		121	* necessary to grab both locks to remove a chunk from circulation as
		122	* allocation path might be referencing the chunk with only
		123	* pcpu_alloc_mutex locked.
116	*/	124	*/
117	static DEFINE_MUTEX(pcpu_mutex);	125	static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
		126	static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
118		127
119	static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */	128	static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
120	static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */	129	static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
176	* kzalloc() is used; otherwise, vmalloc() is used. The returned	185	* kzalloc() is used; otherwise, vmalloc() is used. The returned
177	* memory is always zeroed.	186	* memory is always zeroed.
178	*	187	*
		188	* CONTEXT:
		189	* Does GFP_KERNEL allocation.
		190	*
179	* RETURNS:	191	* RETURNS:
180	* Pointer to the allocated area on success, NULL on failure.	192	* Pointer to the allocated area on success, NULL on failure.
181	*/	193	*/
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
215	* New slot according to the changed state is determined and @chunk is	227	* New slot according to the changed state is determined and @chunk is
216	* moved to the slot. Note that the reserved chunk is never put on	228	* moved to the slot. Note that the reserved chunk is never put on
217	* chunk slots.	229	* chunk slots.
		230	*
		231	* CONTEXT:
		232	* pcpu_lock.
218	*/	233	*/
219	static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)	234	static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
220	{	235	{
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
260	* searchs for the chunk with the highest start address which isn't	275	* searchs for the chunk with the highest start address which isn't
261	* beyond @addr.	276	* beyond @addr.
262	*	277	*
		278	* CONTEXT:
		279	* pcpu_lock.
		280	*
263	* RETURNS:	281	* RETURNS:
264	* The address of the found chunk.	282	* The address of the found chunk.
265	*/	283	*/
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
300	* @new: chunk to insert	318	* @new: chunk to insert
301	*	319	*
302	* Insert @new into address rb tree.	320	* Insert @new into address rb tree.
		321	*
		322	* CONTEXT:
		323	* pcpu_lock.
303	*/	324	*/
304	static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)	325	static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
305	{	326	{
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
319	* A single allocation can split an area into three areas, so this	340	* A single allocation can split an area into three areas, so this
320	* function makes sure that @chunk->map has at least two extra slots.	341	* function makes sure that @chunk->map has at least two extra slots.
321	*	342	*
		343	* CONTEXT:
		344	* pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
		345	* if area map is extended.
		346	*
322	* RETURNS:	347	* RETURNS:
323	* 0 if noop, 1 if successfully extended, -errno on failure.	348	* 0 if noop, 1 if successfully extended, -errno on failure.
324	*/	349	*/
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
332	if (chunk->map_alloc >= chunk->map_used + 2)	357	if (chunk->map_alloc >= chunk->map_used + 2)
333	return 0;	358	return 0;
334		359
		360	spin_unlock_irq(&pcpu_lock);
		361
335	new_alloc = PCPU_DFL_MAP_ALLOC;	362	new_alloc = PCPU_DFL_MAP_ALLOC;
336	while (new_alloc < chunk->map_used + 2)	363	while (new_alloc < chunk->map_used + 2)
337	new_alloc *= 2;	364	new_alloc *= 2;
338		365
339	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));	366	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
340	if (!new)	367	if (!new) {
		368	spin_lock_irq(&pcpu_lock);
341	return -ENOMEM;	369	return -ENOMEM;
		370	}
		371
		372	/*
		373	* Acquire pcpu_lock and switch to new area map. Only free
		374	* could have happened inbetween, so map_used couldn't have
		375	* grown.
		376	*/
		377	spin_lock_irq(&pcpu_lock);
		378	BUG_ON(new_alloc < chunk->map_used + 2);
342		379
343	size = chunk->map_alloc * sizeof(chunk->map[0]);	380	size = chunk->map_alloc * sizeof(chunk->map[0]);
344	memcpy(new, chunk->map, size);	381	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
371	* is inserted after the target block.	408	* is inserted after the target block.
372	*	409	*
373	* @chunk->map must have enough free slots to accomodate the split.	410	* @chunk->map must have enough free slots to accomodate the split.
		411	*
		412	* CONTEXT:
		413	* pcpu_lock.
374	*/	414	*/
375	static void pcpu_split_block(struct pcpu_chunk *chunk, int i,	415	static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
376	int head, int tail)	416	int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
406	*	446	*
407	* @chunk->map must have at least two free slots.	447	* @chunk->map must have at least two free slots.
408	*	448	*
		449	* CONTEXT:
		450	* pcpu_lock.
		451	*
409	* RETURNS:	452	* RETURNS:
410	* Allocated offset in @chunk on success, -1 if no matching area is	453	* Allocated offset in @chunk on success, -1 if no matching area is
411	* found.	454	* found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
495	* Free area starting from @freeme to @chunk. Note that this function	538	* Free area starting from @freeme to @chunk. Note that this function
496	* only modifies the allocation map. It doesn't depopulate or unmap	539	* only modifies the allocation map. It doesn't depopulate or unmap
497	* the area.	540	* the area.
		541	*
		542	* CONTEXT:
		543	* pcpu_lock.
498	*/	544	*/
499	static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)	545	static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
500	{	546	{
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
580	* For each cpu, depopulate and unmap pages [@page_start,@page_end)	626	* For each cpu, depopulate and unmap pages [@page_start,@page_end)
581	* from @chunk. If @flush is true, vcache is flushed before unmapping	627	* from @chunk. If @flush is true, vcache is flushed before unmapping
582	* and tlb after.	628	* and tlb after.
		629	*
		630	* CONTEXT:
		631	* pcpu_alloc_mutex.
583	*/	632	*/
584	static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,	633	static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
585	bool flush)	634	bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
658	*	707	*
659	* For each cpu, populate and map pages [@page_start,@page_end) into	708	* For each cpu, populate and map pages [@page_start,@page_end) into
660	* @chunk. The area is cleared on return.	709	* @chunk. The area is cleared on return.
		710	*
		711	* CONTEXT:
		712	* pcpu_alloc_mutex, does GFP_KERNEL allocation.
661	*/	713	*/
662	static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)	714	static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
663	{	715	{
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
748	* @align: alignment of area (max PAGE_SIZE)	800	* @align: alignment of area (max PAGE_SIZE)
749	* @reserved: allocate from the reserved chunk if available	801	* @reserved: allocate from the reserved chunk if available
750	*	802	*
751	* Allocate percpu area of @size bytes aligned at @align. Might	803	* Allocate percpu area of @size bytes aligned at @align.
752	* sleep. Might trigger writeouts.	804	*
		805	* CONTEXT:
		806	* Does GFP_KERNEL allocation.
753	*	807	*
754	* RETURNS:	808	* RETURNS:
755	* Percpu pointer to the allocated area on success, NULL on failure.	809	* Percpu pointer to the allocated area on success, NULL on failure.
756	*/	810	*/
757	static void *pcpu_alloc(size_t size, size_t align, bool reserved)	811	static void *pcpu_alloc(size_t size, size_t align, bool reserved)
758	{	812	{
759	void *ptr = NULL;
760	struct pcpu_chunk *chunk;	813	struct pcpu_chunk *chunk;
761	int slot, off;	814	int slot, off;
762		815
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
766	return NULL;	819	return NULL;
767	}	820	}
768		821
769	mutex_lock(&pcpu_mutex);	822	mutex_lock(&pcpu_alloc_mutex);
		823	spin_lock_irq(&pcpu_lock);
770		824
771	/* serve reserved allocations from the reserved chunk if available */	825	/* serve reserved allocations from the reserved chunk if available */
772	if (reserved && pcpu_reserved_chunk) {	826	if (reserved && pcpu_reserved_chunk) {
773	chunk = pcpu_reserved_chunk;	827	chunk = pcpu_reserved_chunk;
774	if (size > chunk->contig_hint ||	828	if (size > chunk->contig_hint ||
775	pcpu_extend_area_map(chunk) < 0)	829	pcpu_extend_area_map(chunk) < 0)
776	goto out_unlock;	830	goto fail_unlock;
777	off = pcpu_alloc_area(chunk, size, align);	831	off = pcpu_alloc_area(chunk, size, align);
778	if (off >= 0)	832	if (off >= 0)
779	goto area_found;	833	goto area_found;
780	goto out_unlock;	834	goto fail_unlock;
781	}	835	}
782		836
		837	restart:
783	/* search through normal chunks */	838	/* search through normal chunks */
784	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {	839	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
785	list_for_each_entry(chunk, &pcpu_slot[slot], list) {	840	list_for_each_entry(chunk, &pcpu_slot[slot], list) {
786	if (size > chunk->contig_hint)	841	if (size > chunk->contig_hint)
787	continue;	842	continue;
788	if (pcpu_extend_area_map(chunk) < 0)	843
789	goto out_unlock;	844	switch (pcpu_extend_area_map(chunk)) {
		845	case 0:
		846	break;
		847	case 1:
		848	goto restart; /* pcpu_lock dropped, restart */
		849	default:
		850	goto fail_unlock;
		851	}
		852
790	off = pcpu_alloc_area(chunk, size, align);	853	off = pcpu_alloc_area(chunk, size, align);
791	if (off >= 0)	854	if (off >= 0)
792	goto area_found;	855	goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
794	}	857	}
795		858
796	/* hmmm... no space left, create a new chunk */	859	/* hmmm... no space left, create a new chunk */
		860	spin_unlock_irq(&pcpu_lock);
		861
797	chunk = alloc_pcpu_chunk();	862	chunk = alloc_pcpu_chunk();
798	if (!chunk)	863	if (!chunk)
799	goto out_unlock;	864	goto fail_unlock_mutex;
		865
		866	spin_lock_irq(&pcpu_lock);
800	pcpu_chunk_relocate(chunk, -1);	867	pcpu_chunk_relocate(chunk, -1);
801	pcpu_chunk_addr_insert(chunk);	868	pcpu_chunk_addr_insert(chunk);
802		869	goto restart;
803	off = pcpu_alloc_area(chunk, size, align);
804	if (off < 0)
805	goto out_unlock;
806		870
807	area_found:	871	area_found:
		872	spin_unlock_irq(&pcpu_lock);
		873
808	/* populate, map and clear the area */	874	/* populate, map and clear the area */
809	if (pcpu_populate_chunk(chunk, off, size)) {	875	if (pcpu_populate_chunk(chunk, off, size)) {
		876	spin_lock_irq(&pcpu_lock);
810	pcpu_free_area(chunk, off);	877	pcpu_free_area(chunk, off);
811	goto out_unlock;	878	goto fail_unlock;
812	}	879	}
813		880
814	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);	881	mutex_unlock(&pcpu_alloc_mutex);
815	out_unlock:	882
816	mutex_unlock(&pcpu_mutex);	883	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
817	return ptr;	884
		885	fail_unlock:
		886	spin_unlock_irq(&pcpu_lock);
		887	fail_unlock_mutex:
		888	mutex_unlock(&pcpu_alloc_mutex);
		889	return NULL;
818	}	890	}
819		891
820	/**	892	/**
@@ -825,6 +897,9 @@ out_unlock:
825	* Allocate percpu area of @size bytes aligned at @align. Might	897	* Allocate percpu area of @size bytes aligned at @align. Might
826	* sleep. Might trigger writeouts.	898	* sleep. Might trigger writeouts.
827	*	899	*
		900	* CONTEXT:
		901	* Does GFP_KERNEL allocation.
		902	*
828	* RETURNS:	903	* RETURNS:
829	* Percpu pointer to the allocated area on success, NULL on failure.	904	* Percpu pointer to the allocated area on success, NULL on failure.
830	*/	905	*/
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
843	* percpu area if arch has set it up; otherwise, allocation is served	918	* percpu area if arch has set it up; otherwise, allocation is served
844	* from the same dynamic area. Might sleep. Might trigger writeouts.	919	* from the same dynamic area. Might sleep. Might trigger writeouts.
845	*	920	*
		921	* CONTEXT:
		922	* Does GFP_KERNEL allocation.
		923	*
846	* RETURNS:	924	* RETURNS:
847	* Percpu pointer to the allocated area on success, NULL on failure.	925	* Percpu pointer to the allocated area on success, NULL on failure.
848	*/	926	*/
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
856	* @work: unused	934	* @work: unused
857	*	935	*
858	* Reclaim all fully free chunks except for the first one.	936	* Reclaim all fully free chunks except for the first one.
		937	*
		938	* CONTEXT:
		939	* workqueue context.
859	*/	940	*/
860	static void pcpu_reclaim(struct work_struct *work)	941	static void pcpu_reclaim(struct work_struct *work)
861	{	942	{
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
863	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];	944	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
864	struct pcpu_chunk *chunk, *next;	945	struct pcpu_chunk *chunk, *next;
865		946
866	mutex_lock(&pcpu_mutex);	947	mutex_lock(&pcpu_alloc_mutex);
		948	spin_lock_irq(&pcpu_lock);
867		949
868	list_for_each_entry_safe(chunk, next, head, list) {	950	list_for_each_entry_safe(chunk, next, head, list) {
869	WARN_ON(chunk->immutable);	951	WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
876	list_move(&chunk->list, &todo);	958	list_move(&chunk->list, &todo);
877	}	959	}
878		960
879	mutex_unlock(&pcpu_mutex);	961	spin_unlock_irq(&pcpu_lock);
		962	mutex_unlock(&pcpu_alloc_mutex);
880		963
881	list_for_each_entry_safe(chunk, next, &todo, list) {	964	list_for_each_entry_safe(chunk, next, &todo, list) {
882	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);	965	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
888	* free_percpu - free percpu area	971	* free_percpu - free percpu area
889	* @ptr: pointer to area to free	972	* @ptr: pointer to area to free
890	*	973	*
891	* Free percpu area @ptr. Might sleep.	974	* Free percpu area @ptr.
		975	*
		976	* CONTEXT:
		977	* Can be called from atomic context.
892	*/	978	*/
893	void free_percpu(void *ptr)	979	void free_percpu(void *ptr)
894	{	980	{
895	void *addr = __pcpu_ptr_to_addr(ptr);	981	void *addr = __pcpu_ptr_to_addr(ptr);
896	struct pcpu_chunk *chunk;	982	struct pcpu_chunk *chunk;
		983	unsigned long flags;
897	int off;	984	int off;
898		985
899	if (!ptr)	986	if (!ptr)
900	return;	987	return;
901		988
902	mutex_lock(&pcpu_mutex);	989	spin_lock_irqsave(&pcpu_lock, flags);
903		990
904	chunk = pcpu_chunk_addr_search(addr);	991	chunk = pcpu_chunk_addr_search(addr);
905	off = addr - chunk->vm->addr;	992	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
917	}	1004	}
918	}	1005	}
919		1006
920	mutex_unlock(&pcpu_mutex);	1007	spin_unlock_irqrestore(&pcpu_lock, flags);
921	}	1008	}
922	EXPORT_SYMBOL_GPL(free_percpu);	1009	EXPORT_SYMBOL_GPL(free_percpu);
923		1010