 mm/percpu.c (-rw-r--r--) | 161
 1 file changed, 124 insertions(+), 37 deletions(-)
diff --git a/mm/percpu.c b/mm/percpu.c
index 4c8a419119da..bfe6a3afaf45 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -62,6 +62,7 @@
 #include <linux/pfn.h>
 #include <linux/rbtree.h>
 #include <linux/slab.h>
+#include <linux/spinlock.h>
 #include <linux/vmalloc.h>
 #include <linux/workqueue.h>
 
@@ -101,20 +102,28 @@ static struct pcpu_chunk *pcpu_reserved_chunk;
 static int pcpu_reserved_chunk_limit;
 
 /*
- * One mutex to rule them all.
+ * Synchronization rules.
  *
- * The following mutex is grabbed in the outermost public alloc/free
- * interface functions and released only when the operation is
- * complete.  As such, every function in this file other than the
- * outermost functions are called under pcpu_mutex.
+ * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
+ * protects allocation/reclaim paths, chunks and chunk->page arrays.
+ * The latter is a spinlock and protects the index data structures -
+ * chunk slots, rbtree, chunks and area maps in chunks.
  *
- * It can easily be switched to use spinlock such that only the area
- * allocation and page population commit are protected with it doing
- * actual [de]allocation without holding any lock.  However, given
- * what this allocator does, I think it's better to let them run
- * sequentially.
+ * During allocation, pcpu_alloc_mutex is kept locked all the time and
+ * pcpu_lock is grabbed and released as necessary.  All actual memory
+ * allocations are done using GFP_KERNEL with pcpu_lock released.
+ *
+ * Free path accesses and alters only the index data structures, so it
+ * can be safely called from atomic context.  When memory needs to be
+ * returned to the system, free path schedules reclaim_work which
+ * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
+ * reclaimed, release both locks and frees the chunks.  Note that it's
+ * necessary to grab both locks to remove a chunk from circulation as
+ * allocation path might be referencing the chunk with only
+ * pcpu_alloc_mutex locked.
  */
-static DEFINE_MUTEX(pcpu_mutex);
+static DEFINE_MUTEX(pcpu_alloc_mutex);	/* protects whole alloc and reclaim */
+static DEFINE_SPINLOCK(pcpu_lock);	/* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
 static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
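The comment above boils down to a two-level scheme: the sleeping pcpu_alloc_mutex is held across an entire allocation or reclaim, while the pcpu_lock spinlock is taken only around index updates and dropped before anything that can sleep. A minimal sketch of that pattern follows; find_free_slot() and insert_into_index() are hypothetical helpers invented for illustration, not the mm/percpu.c code.

	#include <linux/mutex.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>

	static DEFINE_MUTEX(alloc_mutex);	/* whole alloc/reclaim operation */
	static DEFINE_SPINLOCK(index_lock);	/* index data structures only */

	static void *find_free_slot(size_t size);	/* hypothetical index lookup */
	static void insert_into_index(void *obj);	/* hypothetical index update */

	static void *alloc_path(size_t size)
	{
		void *obj;

		mutex_lock(&alloc_mutex);

		/* index manipulation only happens under the spinlock */
		spin_lock_irq(&index_lock);
		obj = find_free_slot(size);
		spin_unlock_irq(&index_lock);

		if (!obj) {
			/* sleeping GFP_KERNEL allocation with only the mutex held */
			obj = kmalloc(size, GFP_KERNEL);
			if (obj) {
				spin_lock_irq(&index_lock);
				insert_into_index(obj);
				spin_unlock_irq(&index_lock);
			}
		}

		mutex_unlock(&alloc_mutex);
		return obj;
	}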
@@ -176,6 +185,9 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
  * kzalloc() is used; otherwise, vmalloc() is used.  The returned
  * memory is always zeroed.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
@@ -215,6 +227,9 @@ static void pcpu_mem_free(void *ptr, size_t size)
  * New slot according to the changed state is determined and @chunk is
  * moved to the slot.  Note that the reserved chunk is never put on
  * chunk slots.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
@@ -260,6 +275,9 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr,
  * searchs for the chunk with the highest start address which isn't
  * beyond @addr.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * The address of the found chunk.
  */
@@ -300,6 +318,9 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
  * @new: chunk to insert
  *
  * Insert @new into address rb tree.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
 {
@@ -319,6 +340,10 @@ static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
  * A single allocation can split an area into three areas, so this
  * function makes sure that @chunk->map has at least two extra slots.
  *
+ * CONTEXT:
+ * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
+ * if area map is extended.
+ *
  * RETURNS:
  * 0 if noop, 1 if successfully extended, -errno on failure.
  */
@@ -332,13 +357,25 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
+	spin_unlock_irq(&pcpu_lock);
+
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
 	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new)
+	if (!new) {
+		spin_lock_irq(&pcpu_lock);
 		return -ENOMEM;
+	}
+
+	/*
+	 * Acquire pcpu_lock and switch to new area map.  Only free
+	 * could have happened inbetween, so map_used couldn't have
+	 * grown.
+	 */
+	spin_lock_irq(&pcpu_lock);
+	BUG_ON(new_alloc < chunk->map_used + 2);
 
 	size = chunk->map_alloc * sizeof(chunk->map[0]);
 	memcpy(new, chunk->map, size);
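The subtle point in this hunk is that pcpu_extend_area_map() enters with pcpu_lock held, drops it around the sleeping pcpu_mem_alloc(), and retakes it before touching the map; the BUG_ON() re-checks the invariant because only frees, which never grow map_used, can run while the lock is dropped (the alloc mutex keeps other growers out). A stripped-down sketch of that unlock/allocate/relock shape, using placeholder names and a hypothetical struct rather than the real chunk structures:

	#include <linux/bug.h>
	#include <linux/slab.h>
	#include <linux/spinlock.h>
	#include <linux/string.h>

	#define DEFAULT_CAP	16		/* arbitrary for the sketch */

	struct table {
		spinlock_t	lock;
		int		*map;
		int		used;
		int		cap;
	};

	/* Called with t->lock held; returns with it held again. */
	static int grow_map_locked(struct table *t)
	{
		int new_cap = DEFAULT_CAP;
		int *new, *old;

		if (t->cap >= t->used + 2)
			return 0;		/* nothing to do */

		while (new_cap < t->used + 2)
			new_cap *= 2;

		spin_unlock_irq(&t->lock);
		new = kcalloc(new_cap, sizeof(*new), GFP_KERNEL);
		spin_lock_irq(&t->lock);

		if (!new)
			return -ENOMEM;

		/* only frees could have run while unlocked, so 'used' cannot have grown */
		BUG_ON(new_cap < t->used + 2);
		memcpy(new, t->map, t->used * sizeof(*new));
		old = t->map;
		t->map = new;
		t->cap = new_cap;
		kfree(old);			/* kfree() does not sleep, safe under the lock */
		return 1;			/* extended; the lock was dropped and retaken */
	}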
@@ -371,6 +408,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
  * is inserted after the target block.
  *
  * @chunk->map must have enough free slots to accomodate the split.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
 			     int head, int tail)
@@ -406,6 +446,9 @@ static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
  *
  * @chunk->map must have at least two free slots.
  *
+ * CONTEXT:
+ * pcpu_lock.
+ *
  * RETURNS:
  * Allocated offset in @chunk on success, -1 if no matching area is
  * found.
@@ -495,6 +538,9 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
  * Free area starting from @freeme to @chunk.  Note that this function
  * only modifies the allocation map.  It doesn't depopulate or unmap
  * the area.
+ *
+ * CONTEXT:
+ * pcpu_lock.
  */
 static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
 {
@@ -580,6 +626,9 @@ static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
  * For each cpu, depopulate and unmap pages [@page_start,@page_end)
  * from @chunk.  If @flush is true, vcache is flushed before unmapping
  * and tlb after.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex.
  */
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size,
 				  bool flush)
@@ -658,6 +707,9 @@ static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.  The area is cleared on return.
+ *
+ * CONTEXT:
+ * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
 {
@@ -748,15 +800,16 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * @align: alignment of area (max PAGE_SIZE)
  * @reserved: allocate from the reserved chunk if available
  *
- * Allocate percpu area of @size bytes aligned at @align.  Might
- * sleep.  Might trigger writeouts.
+ * Allocate percpu area of @size bytes aligned at @align.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
  *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
 static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
-	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
 	int slot, off;
 
@@ -766,27 +819,37 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 		return NULL;
 	}
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
 		if (size > chunk->contig_hint ||
 		    pcpu_extend_area_map(chunk) < 0)
-			goto out_unlock;
+			goto fail_unlock;
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
+restart:
 	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
 				continue;
-			if (pcpu_extend_area_map(chunk) < 0)
-				goto out_unlock;
+
+			switch (pcpu_extend_area_map(chunk)) {
+			case 0:
+				break;
+			case 1:
+				goto restart;	/* pcpu_lock dropped, restart */
+			default:
+				goto fail_unlock;
+			}
+
 			off = pcpu_alloc_area(chunk, size, align);
 			if (off >= 0)
 				goto area_found;
@@ -794,27 +857,36 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	/* hmmm... no space left, create a new chunk */
+	spin_unlock_irq(&pcpu_lock);
+
 	chunk = alloc_pcpu_chunk();
 	if (!chunk)
-		goto out_unlock;
+		goto fail_unlock_mutex;
+
+	spin_lock_irq(&pcpu_lock);
 	pcpu_chunk_relocate(chunk, -1);
 	pcpu_chunk_addr_insert(chunk);
-
-	off = pcpu_alloc_area(chunk, size, align);
-	if (off < 0)
-		goto out_unlock;
+	goto restart;
 
 area_found:
+	spin_unlock_irq(&pcpu_lock);
+
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
+		spin_lock_irq(&pcpu_lock);
 		pcpu_free_area(chunk, off);
-		goto out_unlock;
+		goto fail_unlock;
 	}
 
-	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
-out_unlock:
-	mutex_unlock(&pcpu_mutex);
-	return ptr;
+	mutex_unlock(&pcpu_alloc_mutex);
+
+	return __addr_to_pcpu_ptr(chunk->vm->addr + off);
+
+fail_unlock:
+	spin_unlock_irq(&pcpu_lock);
+fail_unlock_mutex:
+	mutex_unlock(&pcpu_alloc_mutex);
+	return NULL;
 }
 
 /**
@@ -825,6 +897,9 @@ out_unlock:
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -843,6 +918,9 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
  * percpu area if arch has set it up; otherwise, allocation is served
  * from the same dynamic area.  Might sleep.  Might trigger writeouts.
  *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.
+ *
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
@@ -856,6 +934,9 @@ void *__alloc_reserved_percpu(size_t size, size_t align)
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
+ *
+ * CONTEXT:
+ * workqueue context.
  */
 static void pcpu_reclaim(struct work_struct *work)
 {
@@ -863,7 +944,8 @@ static void pcpu_reclaim(struct work_struct *work)
 	struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
 
-	mutex_lock(&pcpu_mutex);
+	mutex_lock(&pcpu_alloc_mutex);
+	spin_lock_irq(&pcpu_lock);
 
 	list_for_each_entry_safe(chunk, next, head, list) {
 		WARN_ON(chunk->immutable);
@@ -876,7 +958,8 @@ static void pcpu_reclaim(struct work_struct *work)
 		list_move(&chunk->list, &todo);
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irq(&pcpu_lock);
+	mutex_unlock(&pcpu_alloc_mutex);
 
 	list_for_each_entry_safe(chunk, next, &todo, list) {
 		pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
@@ -888,18 +971,22 @@
  * free_percpu - free percpu area
  * @ptr: pointer to area to free
  *
- * Free percpu area @ptr.  Might sleep.
+ * Free percpu area @ptr.
+ *
+ * CONTEXT:
+ * Can be called from atomic context.
  */
 void free_percpu(void *ptr)
 {
 	void *addr = __pcpu_ptr_to_addr(ptr);
 	struct pcpu_chunk *chunk;
+	unsigned long flags;
 	int off;
 
 	if (!ptr)
 		return;
 
-	mutex_lock(&pcpu_mutex);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	chunk = pcpu_chunk_addr_search(addr);
 	off = addr - chunk->vm->addr;
@@ -917,7 +1004,7 @@ void free_percpu(void *ptr)
 		}
 	}
 
-	mutex_unlock(&pcpu_mutex);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
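The free path is the payoff of the split: free_percpu() now touches only the index data under pcpu_lock with interrupts saved, so it is callable from atomic context, and returning pages to the system is deferred to pcpu_reclaim() running from a workqueue. A sketch of that shape under invented names (only the locking/workqueue skeleton; mark_free_in_index() is a hypothetical helper, not part of the patch):

	#include <linux/spinlock.h>
	#include <linux/types.h>
	#include <linux/workqueue.h>

	static DEFINE_SPINLOCK(index_lock);

	static bool mark_free_in_index(void *obj);	/* hypothetical; updates index only */

	static void reclaim_fn(struct work_struct *work)
	{
		/*
		 * Runs in process context: take the outer mutex and index_lock,
		 * unlink fully free objects, drop both locks, then do the
		 * sleeping teardown (unmap, free pages) outside the locks.
		 */
	}
	static DECLARE_WORK(reclaim_work, reclaim_fn);

	void fast_free(void *obj)		/* safe in atomic context */
	{
		unsigned long flags;
		bool fully_free;

		spin_lock_irqsave(&index_lock, flags);
		fully_free = mark_free_in_index(obj);
		spin_unlock_irqrestore(&index_lock, flags);

		if (fully_free)
			schedule_work(&reclaim_work);	/* heavy work deferred to the workqueue */
	}

As the new header comment notes, the reclaim worker must hold both locks while unlinking, because the allocation path may be walking the same structures with only the mutex held.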