author     Tejun Heo <tj@kernel.org>   2009-03-06 10:44:13 -0500
committer  Tejun Heo <tj@kernel.org>   2009-03-07 00:46:35 -0500
commit     ccea34b5d0fbab081496d1860f31acee99fa8a6d (patch)
tree       e7066b5dde0e83a216768569e57cd71cc83fe912
parent     a56dbddf06b653ef9c04ca3767f260fd31ccebab (diff)
percpu: finer grained locking to break deadlock and allow atomic free
Impact: fix deadlock and allow atomic free

Percpu allocation always uses GFP_KERNEL and whole alloc/free paths
were protected by single mutex. All percpu allocations have been from
GFP_KERNEL-safe context and the original allocator had this assumption
too. However, by protecting both alloc and free paths with the same
mutex, the new allocator creates free -> alloc -> GFP_KERNEL
dependency which the original allocator didn't have. This can lead to
deadlock if free is called from FS or IO paths. Also, in general,
allocators are expected to allow free to be called from atomic
context.

This patch implements finer grained locking to break the deadlock and
allow atomic free. For details, please read the "Synchronization
rules" comment.

While at it, also add CONTEXT: to function comments to describe which
context they expect to be called from and what they do to it.

This problem was reported by Thomas Gleixner and Peter Zijlstra.

  http://thread.gmane.org/gmane.linux.kernel/802384

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Thomas Gleixner <tglx@linutronix.de>
Reported-by: Peter Zijlstra <peterz@infradead.org>
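[Editor's illustration] The alloc-side half of this scheme is easiest to picture outside the kernel. The sketch below is a minimal userspace analogue using pthreads, assuming made-up names (alloc_mutex, index_lock, toy_alloc) as stand-ins for pcpu_alloc_mutex, pcpu_lock and pcpu_alloc(); it is not the kernel code, only the locking shape: the outer mutex is held for the whole allocation, while the inner non-sleeping lock is taken only around index updates and is dropped before anything that can block.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for pcpu_alloc_mutex and pcpu_lock. */
static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t index_lock;

/* Toy "index data structure": next free offset in an imaginary chunk. */
static int next_free;

static int toy_alloc(int size)
{
	int off;
	void *backing;

	pthread_mutex_lock(&alloc_mutex);	/* held across the whole allocation */

	pthread_spin_lock(&index_lock);		/* index manipulation only */
	off = next_free;
	next_free += size;
	pthread_spin_unlock(&index_lock);

	/* The blocking ("GFP_KERNEL") part runs with the spinlock dropped. */
	backing = malloc((size_t)size);
	free(backing);				/* only the locking shape matters here */

	pthread_mutex_unlock(&alloc_mutex);
	return off;
}

int main(void)
{
	pthread_spin_init(&index_lock, PTHREAD_PROCESS_PRIVATE);
	printf("area at offset %d\n", toy_alloc(16));
	printf("area at offset %d\n", toy_alloc(32));
	return 0;
}

Compile with -pthread. The point being mirrored is that nothing which can sleep ever runs under the inner lock, which is what lets the real pcpu_lock be taken from atomic context on the free path.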
Diffstat:
 -rw-r--r--  mm/percpu.c | 161
 1 files changed, 124 insertions, 37 deletions
diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419119da..bfe6a3afaf45 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
+ * Synchronization rules.
  *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete. As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
  *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock. However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used. The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot. Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map. Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk. Note that this function
  * only modifies the allocation map. It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk. If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align. Might
  * sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area. Might sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr. Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
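[Editor's illustration] The free/reclaim half described in the "Synchronization rules" comment and the free_percpu()/pcpu_reclaim() hunks above can be sketched the same way. Again this is a userspace analogue with made-up names (toy_free, reclaimer, reclaim_work); a POSIX semaphore stands in for schedule_work() and a counter stands in for the list of fully free chunks, so it is not the kernel implementation, only the shape of the design.

#include <pthread.h>
#include <semaphore.h>
#include <stdio.h>

/* Illustrative stand-ins for pcpu_alloc_mutex, pcpu_lock and reclaim_work. */
static pthread_mutex_t alloc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_spinlock_t index_lock;
static sem_t reclaim_work;

/* Toy bookkeeping: how many chunks became fully free. */
static int free_chunks;

/* Free path: index-only updates under the non-sleeping lock, then kick the worker. */
static void toy_free(void)
{
	pthread_spin_lock(&index_lock);
	free_chunks++;				/* the chunk is now fully free */
	pthread_spin_unlock(&index_lock);

	sem_post(&reclaim_work);		/* schedule_work() analogue, never blocks */
}

/* Reclaim worker: takes BOTH locks to unlink chunks, then frees them unlocked. */
static void *reclaimer(void *arg)
{
	int grabbed;

	(void)arg;
	sem_wait(&reclaim_work);

	pthread_mutex_lock(&alloc_mutex);
	pthread_spin_lock(&index_lock);
	grabbed = free_chunks;			/* pull chunks out of circulation */
	free_chunks = 0;
	pthread_spin_unlock(&index_lock);
	pthread_mutex_unlock(&alloc_mutex);

	/* Expensive teardown happens with no locks held. */
	printf("reclaimed %d chunk(s) outside the locks\n", grabbed);
	return NULL;
}

int main(void)
{
	pthread_t worker;

	pthread_spin_init(&index_lock, PTHREAD_PROCESS_PRIVATE);
	sem_init(&reclaim_work, 0, 0);
	pthread_create(&worker, NULL, reclaimer, NULL);

	toy_free();				/* cheap and non-blocking, like free_percpu() */
	pthread_join(worker, NULL);
	return 0;
}

The property mirrored here is that the free path touches only index state under the non-sleeping lock and never blocks, while the worker must hold both locks to pull a chunk out of circulation and only afterwards, with no locks held, does the expensive teardown.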