 drivers/base/cpu.c     |   2
 include/linux/percpu.h |   6
 kernel/sched.c         |  22
 mm/percpu.c            | 162
 4 files changed, 132 insertions(+), 60 deletions(-)
diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index e62a4ccea54d..69ee5b7517ec 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -97,7 +97,7 @@ static ssize_t show_crash_notes(struct sys_device *dev, struct sysdev_attribute
 	 * boot up and this data does not change there after. Hence this
 	 * operation should be safe. No locking required.
 	 */
-	addr = __pa(per_cpu_ptr(crash_notes, cpunum));
+	addr = per_cpu_ptr_to_phys(per_cpu_ptr(crash_notes, cpunum));
 	rc = sprintf(buf, "%Lx\n", addr);
 	return rc;
 }
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 522f421ec213..cf5efbcf716c 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -130,6 +130,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size,
 extern void *__alloc_reserved_percpu(size_t size, size_t align);
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(void *__pdata);
+extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
 
 #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void __init setup_per_cpu_areas(void);
@@ -155,6 +156,11 @@ static inline void free_percpu(void *p)
 	kfree(p);
 }
 
+static inline phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+	return __pa(addr);
+}
+
 static inline void __init setup_per_cpu_areas(void) { }
 
 static inline void *pcpu_lpage_remapped(void *kaddr)
diff --git a/kernel/sched.c b/kernel/sched.c
index 854ab418fd42..eecf070ffd1a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1563,11 +1563,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-struct update_shares_data {
-	unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1577,12 +1573,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
 				    unsigned long sd_shares,
 				    unsigned long sd_rq_weight,
-				    struct update_shares_data *usd)
+				    unsigned long *usd_rq_weight)
 {
 	unsigned long shares, rq_weight;
 	int boost = 0;
 
-	rq_weight = usd->rq_weight[cpu];
+	rq_weight = usd_rq_weight[cpu];
 	if (!rq_weight) {
 		boost = 1;
 		rq_weight = NICE_0_LOAD;
@@ -1617,7 +1613,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long weight, rq_weight = 0, shares = 0;
-	struct update_shares_data *usd;
+	unsigned long *usd_rq_weight;
 	struct sched_domain *sd = data;
 	unsigned long flags;
 	int i;
@@ -1626,11 +1622,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		return 0;
 
 	local_irq_save(flags);
-	usd = &__get_cpu_var(update_shares_data);
+	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
 
 	for_each_cpu(i, sched_domain_span(sd)) {
 		weight = tg->cfs_rq[i]->load.weight;
-		usd->rq_weight[i] = weight;
+		usd_rq_weight[i] = weight;
 
 		/*
 		 * If there are currently no tasks on the cpu pretend there
@@ -1651,7 +1647,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
 
 	local_irq_restore(flags);
 
@@ -9406,6 +9402,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */
 
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+					    __alignof__(unsigned long));
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
diff --git a/mm/percpu.c b/mm/percpu.c
index e2e80fc78601..442010cc91c6 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -72,6 +72,7 @@
 #include <asm/cacheflush.h>
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
+#include <asm/io.h>
 
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
@@ -151,7 +152,10 @@ static int pcpu_reserved_chunk_limit;
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
- * allocations are done using GFP_KERNEL with pcpu_lock released.
+ * allocations are done using GFP_KERNEL with pcpu_lock released.  In
+ * general, percpu memory can't be allocated with irq off but
+ * irqsave/restore are still used in alloc path so that it can be used
+ * from early init path - sched_init() specifically.
  *
  * Free path accesses and alters only the index data structures, so it
  * can be safely called from atomic context.  When memory needs to be
@@ -350,63 +354,86 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 }
 
 /**
- * pcpu_extend_area_map - extend area map for allocation
- * @chunk: target chunk
+ * pcpu_need_to_extend - determine whether chunk area map needs to be extended
+ * @chunk: chunk of interest
  *
- * Extend area map of @chunk so that it can accomodate an allocation.
- * A single allocation can split an area into three areas, so this
- * function makes sure that @chunk->map has at least two extra slots.
+ * Determine whether area map of @chunk needs to be extended to
+ * accomodate a new allocation.
  *
  * CONTEXT:
- * pcpu_alloc_mutex, pcpu_lock.  pcpu_lock is released and reacquired
- * if area map is extended.
+ * pcpu_lock.
  *
  * RETURNS:
- * 0 if noop, 1 if successfully extended, -errno on failure.
+ * New target map allocation length if extension is necessary, 0
+ * otherwise.
  */
-static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
-	__releases(lock) __acquires(lock)
+static int pcpu_need_to_extend(struct pcpu_chunk *chunk)
 {
 	int new_alloc;
-	int *new;
-	size_t size;
 
-	/* has enough? */
 	if (chunk->map_alloc >= chunk->map_used + 2)
 		return 0;
 
-	spin_unlock_irq(&pcpu_lock);
-
 	new_alloc = PCPU_DFL_MAP_ALLOC;
 	while (new_alloc < chunk->map_used + 2)
 		new_alloc *= 2;
 
-	new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
-	if (!new) {
-		spin_lock_irq(&pcpu_lock);
+	return new_alloc;
+}
+
+/**
+ * pcpu_extend_area_map - extend area map of a chunk
+ * @chunk: chunk of interest
+ * @new_alloc: new target allocation length of the area map
+ *
+ * Extend area map of @chunk to have @new_alloc entries.
+ *
+ * CONTEXT:
+ * Does GFP_KERNEL allocation.  Grabs and releases pcpu_lock.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
+{
+	int *old = NULL, *new = NULL;
+	size_t old_size = 0, new_size = new_alloc * sizeof(new[0]);
+	unsigned long flags;
+
+	new = pcpu_mem_alloc(new_size);
+	if (!new)
 		return -ENOMEM;
-	}
 
-	/*
-	 * Acquire pcpu_lock and switch to new area map.  Only free
-	 * could have happened inbetween, so map_used couldn't have
-	 * grown.
-	 */
-	spin_lock_irq(&pcpu_lock);
-	BUG_ON(new_alloc < chunk->map_used + 2);
+	/* acquire pcpu_lock and switch to new area map */
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	if (new_alloc <= chunk->map_alloc)
+		goto out_unlock;
 
-	size = chunk->map_alloc * sizeof(chunk->map[0]);
-	memcpy(new, chunk->map, size);
+	old_size = chunk->map_alloc * sizeof(chunk->map[0]);
+	memcpy(new, chunk->map, old_size);
 
 	/*
 	 * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
 	 * one of the first chunks and still using static map.
 	 */
 	if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
-		pcpu_mem_free(chunk->map, size);
+		old = chunk->map;
 
 	chunk->map_alloc = new_alloc;
 	chunk->map = new;
+	new = NULL;
+
+out_unlock:
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+
+	/*
+	 * pcpu_mem_free() might end up calling vfree() which uses
+	 * IRQ-unsafe lock and thus can't be called under pcpu_lock.
+	 */
+	pcpu_mem_free(old, old_size);
+	pcpu_mem_free(new, new_size);
+
 	return 0;
 }
 
@@ -1045,7 +1072,8 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	static int warn_limit = 10;
 	struct pcpu_chunk *chunk;
 	const char *err;
-	int slot, off;
+	int slot, off, new_alloc;
+	unsigned long flags;
 
 	if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
 		WARN(true, "illegal size (%zu) or align (%zu) for "
@@ -1054,19 +1082,30 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 	}
 
 	mutex_lock(&pcpu_alloc_mutex);
-	spin_lock_irq(&pcpu_lock);
+	spin_lock_irqsave(&pcpu_lock, flags);
 
 	/* serve reserved allocations from the reserved chunk if available */
 	if (reserved && pcpu_reserved_chunk) {
 		chunk = pcpu_reserved_chunk;
-		if (size > chunk->contig_hint ||
-		    pcpu_extend_area_map(chunk) < 0) {
-			err = "failed to extend area map of reserved chunk";
+
+		if (size > chunk->contig_hint) {
+			err = "alloc from reserved chunk failed";
 			goto fail_unlock;
 		}
+
+		while ((new_alloc = pcpu_need_to_extend(chunk))) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk, new_alloc) < 0) {
+				err = "failed to extend area map of reserved chunk";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+		}
+
 		off = pcpu_alloc_area(chunk, size, align);
 		if (off >= 0)
 			goto area_found;
+
 		err = "alloc from reserved chunk failed";
 		goto fail_unlock;
 	}
@@ -1078,14 +1117,20 @@ restart:
 		if (size > chunk->contig_hint)
 			continue;
 
-		switch (pcpu_extend_area_map(chunk)) {
-		case 0:
-			break;
-		case 1:
-			goto restart;	/* pcpu_lock dropped, restart */
-		default:
-			err = "failed to extend area map";
-			goto fail_unlock;
+		new_alloc = pcpu_need_to_extend(chunk);
+		if (new_alloc) {
+			spin_unlock_irqrestore(&pcpu_lock, flags);
+			if (pcpu_extend_area_map(chunk,
+						 new_alloc) < 0) {
+				err = "failed to extend area map";
+				goto fail_unlock_mutex;
+			}
+			spin_lock_irqsave(&pcpu_lock, flags);
+			/*
+			 * pcpu_lock has been dropped, need to
+			 * restart cpu_slot list walking.
+			 */
+			goto restart;
 		}
 
 		off = pcpu_alloc_area(chunk, size, align);
@@ -1095,7 +1140,7 @@ restart:
 	}
 
 	/* hmmm... no space left, create a new chunk */
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	chunk = alloc_pcpu_chunk();
 	if (!chunk) {
@@ -1103,16 +1148,16 @@ restart:
 		goto fail_unlock_mutex;
 	}
 
-	spin_lock_irq(&pcpu_lock);
+	spin_lock_irqsave(&pcpu_lock, flags);
 	pcpu_chunk_relocate(chunk, -1);
 	goto restart;
 
 area_found:
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	/* populate, map and clear the area */
 	if (pcpu_populate_chunk(chunk, off, size)) {
-		spin_lock_irq(&pcpu_lock);
+		spin_lock_irqsave(&pcpu_lock, flags);
 		pcpu_free_area(chunk, off);
 		err = "failed to populate";
 		goto fail_unlock;
@@ -1124,7 +1169,7 @@ area_found:
 	return __addr_to_pcpu_ptr(chunk->base_addr + off);
 
 fail_unlock:
-	spin_unlock_irq(&pcpu_lock);
+	spin_unlock_irqrestore(&pcpu_lock, flags);
 fail_unlock_mutex:
 	mutex_unlock(&pcpu_alloc_mutex);
 	if (warn_limit) {
@@ -1256,6 +1301,27 @@ void free_percpu(void *ptr)
 }
 EXPORT_SYMBOL_GPL(free_percpu);
 
+/**
+ * per_cpu_ptr_to_phys - convert translated percpu address to physical address
+ * @addr: the address to be converted to physical address
+ *
+ * Given @addr which is dereferenceable address obtained via one of
+ * percpu access macros, this function translates it into its physical
+ * address.  The caller is responsible for ensuring @addr stays valid
+ * until this function finishes.
+ *
+ * RETURNS:
+ * The physical address for @addr.
+ */
+phys_addr_t per_cpu_ptr_to_phys(void *addr)
+{
+	if ((unsigned long)addr < VMALLOC_START ||
+	    (unsigned long)addr >= VMALLOC_END)
+		return __pa(addr);
+	else
+		return page_to_phys(vmalloc_to_page(addr));
+}
+
 static inline size_t pcpu_calc_fc_sizes(size_t static_size,
 					size_t reserved_size,
 					ssize_t *dyn_sizep)
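
A note on the kernel/sched.c conversion above: the per-cpu NR_CPUS-sized rq_weight array becomes a per-cpu table sized for nr_cpu_ids entries and allocated at boot, which is what lets the percpu allocator back it instead of static data. The following is a minimal sketch of that pattern, not part of the patch; all names (example_weights, example_init, example_fill) are hypothetical, and only __alloc_percpu(), per_cpu_ptr() and smp_processor_id() come from the code above.

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/cache.h>
#include <linux/smp.h>
#include <linux/init.h>

/* hypothetical replacement for a DEFINE_PER_CPU'd [NR_CPUS] table */
static unsigned long *example_weights __read_mostly;

void __init example_init(void)
{
	/* one nr_cpu_ids-sized table per cpu, allocated once at boot */
	example_weights = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
					 __alignof__(unsigned long));
}

/* caller runs with irqs disabled, as tg_shares_up() does */
static void example_fill(unsigned long weight)
{
	unsigned long *w = per_cpu_ptr(example_weights, smp_processor_id());
	int i;

	/* fill this cpu's private copy of the table */
	for (i = 0; i < nr_cpu_ids; i++)
		w[i] = weight;
}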
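
Likewise, a usage sketch for the new per_cpu_ptr_to_phys() interface, mirroring what the drivers/base/cpu.c hunk does for crash_notes: obtain a dereferenceable address with per_cpu_ptr() and hand it to per_cpu_ptr_to_phys(). This is illustration only, not part of the patch; example_notes and report_example_note_phys() are hypothetical.

#include <linux/percpu.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/init.h>

/* hypothetical dynamically allocated percpu buffer */
static u32 *example_notes;

static int __init example_notes_init(void)
{
	example_notes = alloc_percpu(u32);
	return example_notes ? 0 : -ENOMEM;
}

/* report the physical address backing @cpu's copy of the buffer */
static void report_example_note_phys(unsigned int cpu)
{
	/*
	 * per_cpu_ptr() yields a dereferenceable address for @cpu's copy;
	 * per_cpu_ptr_to_phys() translates it to a physical address,
	 * whether the percpu unit sits in the linear map or in vmalloc space.
	 */
	phys_addr_t addr = per_cpu_ptr_to_phys(per_cpu_ptr(example_notes, cpu));

	printk(KERN_INFO "cpu%u example note at %llx\n", cpu,
	       (unsigned long long)addr);
}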