 mm/percpu.c | 161 +++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 124 insertions(+), 37 deletions(-)
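
The patch below splits the old single pcpu_mutex into two locks: pcpu_alloc_mutex, which serializes the allocation/reclaim paths, and the pcpu_lock spinlock, which protects the index data structures, so that free_percpu() becomes safe to call from atomic context while GFP_KERNEL allocations happen with the spinlock dropped. As a rough illustration of that locking discipline (a simplified sketch, not code from the patch; the lock and helper names here are made up), the pattern looks like this:

/* Illustrative sketch only -- not part of the patch below. */
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/gfp.h>

static DEFINE_MUTEX(alloc_mutex);	/* serializes alloc/reclaim paths */
static DEFINE_SPINLOCK(index_lock);	/* protects index data structures */

/*
 * Allocation path: may sleep, so sleeping GFP_KERNEL work is done with
 * the spinlock dropped while the mutex keeps other allocators out.
 */
static void *alloc_path(size_t size)
{
	void *obj;

	mutex_lock(&alloc_mutex);
	spin_lock_irq(&index_lock);
	/* ... consult and update index structures ... */
	spin_unlock_irq(&index_lock);

	obj = kzalloc(size, GFP_KERNEL);	/* no spinlock held here */

	spin_lock_irq(&index_lock);
	/* ... publish the new object in the index ... */
	spin_unlock_irq(&index_lock);
	mutex_unlock(&alloc_mutex);
	return obj;
}

/* Free path: touches only the index, so it is atomic-context safe. */
static void free_path(void *obj)
{
	unsigned long flags;

	spin_lock_irqsave(&index_lock, flags);
	/* ... mark the area free; defer actual reclaim to a work item ... */
	spin_unlock_irqrestore(&index_lock, flags);
}
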
diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419119da..bfe6a3afaf45 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
+ * Synchronization rules.
  *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete. As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
  *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock. However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary. All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context. When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks. Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used. The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot. Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map. Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk. Note that this function
  * only modifies the allocation map. It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk. If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk. The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align. Might
  * sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area. Might sleep. Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@ static void pcpu_reclaim(struct work_struct *work)
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr. Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 			}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 