author		Tejun Heo <tj@kernel.org>	2014-09-02 14:46:05 -0400
committer	Tejun Heo <tj@kernel.org>	2014-09-02 14:46:05 -0400
commit		1a4d76076cda69b0abf15463a8cebc172406da25 (patch)
tree		eb2f8e317795c30942ae298585708e10652e8537 /mm/percpu.c
parent		fe6bd8c3d28357174587c4fe895d10b00321b692 (diff)
percpu: implement asynchronous chunk population
The percpu allocator now supports atomic allocations by only allocating from
already populated areas, but the mechanism to ensure that there's an adequate
amount of populated areas was missing.

This patch expands pcpu_balance_work so that in addition to freeing excess
free chunks it also populates chunks to maintain an adequate level of
populated areas.  pcpu_alloc() schedules pcpu_balance_work if the amount of
free populated areas is too low or after an atomic allocation failure.

* PCPU_DYNAMIC_RESERVE is increased by two pages to account for
  PCPU_EMPTY_POP_PAGES_LOW.

* pcpu_async_enabled is added to gate both async jobs -
  chunk->map_extend_work and pcpu_balance_work - so that we don't end up
  scheduling them while the needed subsystems aren't up yet.

Signed-off-by: Tejun Heo <tj@kernel.org>
Diffstat (limited to 'mm/percpu.c')
-rw-r--r--	mm/percpu.c	117
1 file changed, 113 insertions(+), 4 deletions(-)
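The populated-page pool maintained by this patch only matters to callers on the atomic allocation path described in the commit message above. As a rough illustration (not part of this patch), the sketch below shows what such a caller might look like; it assumes the alloc_percpu_gfp() interface added earlier in this series, and the my_stats / my_setup_in_atomic_context names are made up for this example.

```c
/* Illustrative sketch only -- not part of this patch. */
#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/types.h>
#include <linux/errno.h>

struct my_stats {			/* hypothetical per-cpu payload */
	u64 events;
};

static struct my_stats __percpu *stats;

/* May run with IRQs disabled, so a sleeping GFP_KERNEL allocation is out. */
static int my_setup_in_atomic_context(void)
{
	/*
	 * GFP_NOWAIT keeps pcpu_alloc() on the atomic path, which can only
	 * carve space out of already populated areas.  If the pool that
	 * pcpu_balance_work keeps between PCPU_EMPTY_POP_PAGES_LOW and
	 * _HIGH runs dry, this fails and the balance work is scheduled so
	 * that a later retry is more likely to succeed.
	 */
	stats = alloc_percpu_gfp(struct my_stats, GFP_NOWAIT);
	if (!stats)
		return -ENOMEM;

	this_cpu_inc(stats->events);	/* normal per-cpu access afterwards */
	return 0;
}
```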
diff --git a/mm/percpu.c b/mm/percpu.c
index 28a830590b4c..867efd38d879 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -78,6 +78,8 @@
 #define PCPU_DFL_MAP_ALLOC          16    /* start a map with 16 ents */
 #define PCPU_ATOMIC_MAP_MARGIN_LOW  32
 #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
+#define PCPU_EMPTY_POP_PAGES_LOW    2
+#define PCPU_EMPTY_POP_PAGES_HIGH   4
 
 #ifdef CONFIG_SMP
 /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -168,9 +170,22 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
  */
 static int pcpu_nr_empty_pop_pages;
 
-/* balance work is used to populate or destroy chunks asynchronously */
+/*
+ * Balance work is used to populate or destroy chunks asynchronously.  We
+ * try to keep the number of populated free pages between
+ * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
+ * empty chunk.
+ */
 static void pcpu_balance_workfn(struct work_struct *work);
 static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
+static bool pcpu_async_enabled __read_mostly;
+static bool pcpu_atomic_alloc_failed;
+
+static void pcpu_schedule_balance_work(void)
+{
+        if (pcpu_async_enabled)
+                schedule_work(&pcpu_balance_work);
+}
 
 static bool pcpu_addr_in_first_chunk(void *addr)
 {
@@ -386,7 +401,8 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
                 margin = 3;
 
                 if (chunk->map_alloc <
-                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW)
+                    chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
+                    pcpu_async_enabled)
                         schedule_work(&chunk->map_extend_work);
         } else {
                 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
@@ -1005,6 +1021,9 @@ area_found:
         if (chunk != pcpu_reserved_chunk)
                 pcpu_nr_empty_pop_pages -= occ_pages;
 
+        if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
+                pcpu_schedule_balance_work();
+
         /* clear the areas and return address relative to base address */
         for_each_possible_cpu(cpu)
                 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
@@ -1023,6 +1042,11 @@ fail:
                 if (!--warn_limit)
                         pr_info("PERCPU: limit reached, disable warning\n");
         }
+        if (is_atomic) {
+                /* see the flag handling in pcpu_balance_workfn() */
+                pcpu_atomic_alloc_failed = true;
+                pcpu_schedule_balance_work();
+        }
         return NULL;
 }
 
@@ -1080,7 +1104,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
 }
 
 /**
- * pcpu_balance_workfn - reclaim fully free chunks, workqueue function
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
  * Reclaim all fully free chunks except for the first one.
@@ -1090,7 +1114,12 @@ static void pcpu_balance_workfn(struct work_struct *work)
         LIST_HEAD(to_free);
         struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
         struct pcpu_chunk *chunk, *next;
+        int slot, nr_to_pop, ret;
 
+        /*
+         * There's no reason to keep around multiple unused chunks and VM
+         * areas can be scarce.  Destroy all free chunks except for one.
+         */
         mutex_lock(&pcpu_alloc_mutex);
         spin_lock_irq(&pcpu_lock);
 
@@ -1118,6 +1147,74 @@ static void pcpu_balance_workfn(struct work_struct *work)
                 pcpu_destroy_chunk(chunk);
         }
 
+        /*
+         * Ensure there are a certain number of free populated pages for
+         * atomic allocs.  Fill up from the most packed so that atomic
+         * allocs don't increase fragmentation.  If atomic allocation
+         * failed previously, always populate the maximum amount.  This
+         * should prevent atomic allocs larger than PAGE_SIZE from keeping
+         * failing indefinitely; however, large atomic allocs are not
+         * something we support properly and can be highly unreliable and
+         * inefficient.
+         */
+retry_pop:
+        if (pcpu_atomic_alloc_failed) {
+                nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
+                /* best effort anyway, don't worry about synchronization */
+                pcpu_atomic_alloc_failed = false;
+        } else {
+                nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
+                                  pcpu_nr_empty_pop_pages,
+                                  0, PCPU_EMPTY_POP_PAGES_HIGH);
+        }
+
+        for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
+                int nr_unpop = 0, rs, re;
+
+                if (!nr_to_pop)
+                        break;
+
+                spin_lock_irq(&pcpu_lock);
+                list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+                        nr_unpop = pcpu_unit_pages - chunk->nr_populated;
+                        if (nr_unpop)
+                                break;
+                }
+                spin_unlock_irq(&pcpu_lock);
+
+                if (!nr_unpop)
+                        continue;
+
+                /* @chunk can't go away while pcpu_alloc_mutex is held */
+                pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
+                        int nr = min(re - rs, nr_to_pop);
+
+                        ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+                        if (!ret) {
+                                nr_to_pop -= nr;
+                                spin_lock_irq(&pcpu_lock);
+                                pcpu_chunk_populated(chunk, rs, rs + nr);
+                                spin_unlock_irq(&pcpu_lock);
+                        } else {
+                                nr_to_pop = 0;
+                        }
+
+                        if (!nr_to_pop)
+                                break;
+                }
+        }
+
+        if (nr_to_pop) {
+                /* ran out of chunks to populate, create a new one and retry */
+                chunk = pcpu_create_chunk();
+                if (chunk) {
+                        spin_lock_irq(&pcpu_lock);
+                        pcpu_chunk_relocate(chunk, -1);
+                        spin_unlock_irq(&pcpu_lock);
+                        goto retry_pop;
+                }
+        }
+
         mutex_unlock(&pcpu_alloc_mutex);
 }
 
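For readers who want to see the watermark arithmetic from the hunk above in isolation, here is a minimal userspace model (plain C, not kernel code); the clamp_int() helper simply stands in for the kernel's clamp() macro, and the numbers printed are just the target populate counts for a range of pool sizes.

```c
/* Simplified userspace model of the nr_to_pop computation above. */
#include <stdbool.h>
#include <stdio.h>

#define PCPU_EMPTY_POP_PAGES_LOW	2
#define PCPU_EMPTY_POP_PAGES_HIGH	4

/* stand-in for the kernel's clamp() macro */
static int clamp_int(int v, int lo, int hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static int nr_to_pop(int empty_pop_pages, bool atomic_alloc_failed)
{
	/* after a failed atomic alloc, always aim for the full HIGH target */
	if (atomic_alloc_failed)
		return PCPU_EMPTY_POP_PAGES_HIGH;
	/* otherwise top the pool back up to HIGH, never a negative count */
	return clamp_int(PCPU_EMPTY_POP_PAGES_HIGH - empty_pop_pages,
			 0, PCPU_EMPTY_POP_PAGES_HIGH);
}

int main(void)
{
	int pages;

	for (pages = 0; pages <= 6; pages++)
		printf("empty_pop_pages=%d -> populate %d page(s)\n",
		       pages, nr_to_pop(pages, false));
	printf("after atomic failure -> populate %d page(s)\n",
	       nr_to_pop(6, true));
	return 0;
}
```

With a pool already at or above PCPU_EMPTY_POP_PAGES_HIGH this yields zero, so the balance work does nothing; a prior atomic failure forces the full HIGH target regardless of the current pool size.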
@@ -1160,7 +1257,7 @@ void free_percpu(void __percpu *ptr)
 
                 list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
                         if (pos != chunk) {
-                                schedule_work(&pcpu_balance_work);
+                                pcpu_schedule_balance_work();
                                 break;
                         }
         }
@@ -2187,3 +2284,15 @@ void __init percpu_init_late(void)
                 spin_unlock_irqrestore(&pcpu_lock, flags);
         }
 }
+
+/*
+ * Percpu allocator is initialized early during boot when neither slab nor
+ * workqueue is available.  Plug async management until everything is up
+ * and running.
+ */
+static int __init percpu_enable_async(void)
+{
+        pcpu_async_enabled = true;
+        return 0;
+}
+subsys_initcall(percpu_enable_async);
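The subsys_initcall() gating in the final hunk can be summarized with another small userspace model (again, not kernel code): balance requests made before the enable point are simply dropped, which mirrors how early-boot callers are kept away from the not-yet-available workqueue machinery until the first post-enable allocation triggers the check again.

```c
/* Simplified userspace model of the pcpu_async_enabled gate. */
#include <stdbool.h>
#include <stdio.h>

static bool async_enabled;	/* models pcpu_async_enabled */

static void schedule_balance_work(void)
{
	/* models pcpu_schedule_balance_work(): silently skip when gated */
	if (!async_enabled) {
		printf("too early in boot: request dropped\n");
		return;
	}
	printf("balance work scheduled\n");
}

/* stands in for the subsys_initcall percpu_enable_async() */
static void enable_async(void)
{
	async_enabled = true;
}

int main(void)
{
	schedule_balance_work();	/* early boot: no-op */
	enable_async();
	schedule_balance_work();	/* now actually scheduled */
	return 0;
}
```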