39 files changed, 879 insertions, 444 deletions
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3201e93ebd07..ac1c4de3a484 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4549,7 +4549,7 @@ int kvm_mmu_module_init(void) | |||
4549 | if (!mmu_page_header_cache) | 4549 | if (!mmu_page_header_cache) |
4550 | goto nomem; | 4550 | goto nomem; |
4551 | 4551 | ||
4552 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) | 4552 | if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL)) |
4553 | goto nomem; | 4553 | goto nomem; |
4554 | 4554 | ||
4555 | register_shrinker(&mmu_shrinker); | 4555 | register_shrinker(&mmu_shrinker); |
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ed5217867555..371d8800b48a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q) | |||
402 | } | 402 | } |
403 | } | 403 | } |
404 | 404 | ||
405 | /* see blk_register_queue() */ | ||
406 | void blk_mq_finish_init(struct request_queue *q) | ||
407 | { | ||
408 | percpu_ref_switch_to_percpu(&q->mq_usage_counter); | ||
409 | } | ||
410 | |||
405 | int blk_mq_register_disk(struct gendisk *disk) | 411 | int blk_mq_register_disk(struct gendisk *disk) |
406 | { | 412 | { |
407 | struct device *dev = disk_to_dev(disk); | 413 | struct device *dev = disk_to_dev(disk); |
diff --git a/block/blk-mq.c b/block/blk-mq.c
index df8e1e09dd17..38f4a165640d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q) | |||
119 | spin_unlock_irq(q->queue_lock); | 119 | spin_unlock_irq(q->queue_lock); |
120 | 120 | ||
121 | if (freeze) { | 121 | if (freeze) { |
122 | /* | 122 | percpu_ref_kill(&q->mq_usage_counter); |
123 | * XXX: Temporary kludge to work around SCSI blk-mq stall. | ||
124 | * SCSI synchronously creates and destroys many queues | ||
125 | * back-to-back during probe leading to lengthy stalls. | ||
126 | * This will be fixed by keeping ->mq_usage_counter in | ||
127 | * atomic mode until genhd registration, but, for now, | ||
128 | * let's work around using expedited synchronization. | ||
129 | */ | ||
130 | __percpu_ref_kill_expedited(&q->mq_usage_counter); | ||
131 | |||
132 | blk_mq_run_queues(q, false); | 123 | blk_mq_run_queues(q, false); |
133 | } | 124 | } |
134 | wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); | 125 | wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); |
@@ -1804,7 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) | |||
1804 | if (!q) | 1795 | if (!q) |
1805 | goto err_hctxs; | 1796 | goto err_hctxs; |
1806 | 1797 | ||
1807 | if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) | 1798 | /* |
1799 | * Init percpu_ref in atomic mode so that it's faster to shutdown. | ||
1800 | * See blk_register_queue() for details. | ||
1801 | */ | ||
1802 | if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, | ||
1803 | PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) | ||
1808 | goto err_map; | 1804 | goto err_map; |
1809 | 1805 | ||
1810 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); | 1806 | setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); |
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 17f5c84ce7bf..521ae9089c50 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk) | |||
551 | return -ENXIO; | 551 | return -ENXIO; |
552 | 552 | ||
553 | /* | 553 | /* |
554 | * Initialization must be complete by now. Finish the initial | 554 | * SCSI probing may synchronously create and destroy a lot of |
555 | * bypass from queue allocation. | 555 | * request_queues for non-existent devices. Shutting down a fully |
556 | * functional queue takes measureable wallclock time as RCU grace | ||
557 | * periods are involved. To avoid excessive latency in these | ||
558 | * cases, a request_queue starts out in a degraded mode which is | ||
559 | * faster to shut down and is made fully functional here as | ||
560 | * request_queues for non-existent devices never get registered. | ||
556 | */ | 561 | */ |
557 | if (!blk_queue_init_done(q)) { | 562 | if (!blk_queue_init_done(q)) { |
558 | queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); | 563 | queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); |
559 | blk_queue_bypass_end(q); | 564 | blk_queue_bypass_end(q); |
565 | if (q->mq_ops) | ||
566 | blk_mq_finish_init(q); | ||
560 | } | 567 | } |
561 | 568 | ||
562 | ret = blk_trace_init_sysfs(dev); | 569 | ret = blk_trace_init_sysfs(dev); |
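
[Editorial note] The block-layer hunks above are the motivating user of the new API: q->mq_usage_counter is now initialized in atomic mode when the queue is allocated and only switched to percpu mode once the disk is registered, so the many short-lived queues created during SCSI probing stay cheap to shut down. Below is a minimal sketch of that init-atomic-then-switch pattern; the my_obj_* helpers and release callback are hypothetical, not functions from this patch.

#include <linux/percpu-refcount.h>
#include <linux/gfp.h>

static void my_obj_release(struct percpu_ref *ref)
{
        /* last reference dropped; free the containing object (must not sleep) */
}

static int my_obj_alloc(struct percpu_ref *ref)
{
        /* start in atomic mode: killing the ref later needs no RCU grace period */
        return percpu_ref_init(ref, my_obj_release, PERCPU_REF_INIT_ATOMIC,
                               GFP_KERNEL);
}

static void my_obj_register(struct percpu_ref *ref)
{
        /* the object turned out to be long-lived; enable fast percpu get/put */
        percpu_ref_switch_to_percpu(ref);
}

static void my_obj_shutdown(struct percpu_ref *ref)
{
        percpu_ref_kill(ref);   /* drops the initial reference */
}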
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index fddfae61222f..be783f717f19 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -819,7 +819,8 @@ int core_tpg_add_lun( | |||
819 | { | 819 | { |
820 | int ret; | 820 | int ret; |
821 | 821 | ||
822 | ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); | 822 | ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0, |
823 | GFP_KERNEL); | ||
823 | if (ret < 0) | 824 | if (ret < 0) |
824 | return ret; | 825 | return ret; |
825 | 826 | ||
diff --git a/fs/aio.c b/fs/aio.c
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
661 | 661 | ||
662 | INIT_LIST_HEAD(&ctx->active_reqs); | 662 | INIT_LIST_HEAD(&ctx->active_reqs); |
663 | 663 | ||
664 | if (percpu_ref_init(&ctx->users, free_ioctx_users)) | 664 | if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) |
665 | goto err; | 665 | goto err; |
666 | 666 | ||
667 | if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) | 667 | if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) |
668 | goto err; | 668 | goto err; |
669 | 669 | ||
670 | ctx->cpu = alloc_percpu(struct kioctx_cpu); | 670 | ctx->cpu = alloc_percpu(struct kioctx_cpu); |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a1d36e62179c..d0d78dc07792 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1183,7 +1183,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) | |||
1183 | if (!writers) | 1183 | if (!writers) |
1184 | return ERR_PTR(-ENOMEM); | 1184 | return ERR_PTR(-ENOMEM); |
1185 | 1185 | ||
1186 | ret = percpu_counter_init(&writers->counter, 0); | 1186 | ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); |
1187 | if (ret < 0) { | 1187 | if (ret < 0) { |
1188 | kfree(writers); | 1188 | kfree(writers); |
1189 | return ERR_PTR(ret); | 1189 | return ERR_PTR(ret); |
@@ -2188,7 +2188,7 @@ int open_ctree(struct super_block *sb, | |||
2188 | goto fail_srcu; | 2188 | goto fail_srcu; |
2189 | } | 2189 | } |
2190 | 2190 | ||
2191 | ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); | 2191 | ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); |
2192 | if (ret) { | 2192 | if (ret) { |
2193 | err = ret; | 2193 | err = ret; |
2194 | goto fail_bdi; | 2194 | goto fail_bdi; |
@@ -2196,13 +2196,13 @@ int open_ctree(struct super_block *sb, | |||
2196 | fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * | 2196 | fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * |
2197 | (1 + ilog2(nr_cpu_ids)); | 2197 | (1 + ilog2(nr_cpu_ids)); |
2198 | 2198 | ||
2199 | ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); | 2199 | ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); |
2200 | if (ret) { | 2200 | if (ret) { |
2201 | err = ret; | 2201 | err = ret; |
2202 | goto fail_dirty_metadata_bytes; | 2202 | goto fail_dirty_metadata_bytes; |
2203 | } | 2203 | } |
2204 | 2204 | ||
2205 | ret = percpu_counter_init(&fs_info->bio_counter, 0); | 2205 | ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); |
2206 | if (ret) { | 2206 | if (ret) { |
2207 | err = ret; | 2207 | err = ret; |
2208 | goto fail_delalloc_bytes; | 2208 | goto fail_delalloc_bytes; |
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877bf..caaf015d6e4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, | |||
3494 | if (!found) | 3494 | if (!found) |
3495 | return -ENOMEM; | 3495 | return -ENOMEM; |
3496 | 3496 | ||
3497 | ret = percpu_counter_init(&found->total_bytes_pinned, 0); | 3497 | ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); |
3498 | if (ret) { | 3498 | if (ret) { |
3499 | kfree(found); | 3499 | kfree(found); |
3500 | return ret; | 3500 | return ret; |
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) | |||
1067 | ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); | 1067 | ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); |
1068 | 1068 | ||
1069 | err = percpu_counter_init(&sbi->s_freeblocks_counter, | 1069 | err = percpu_counter_init(&sbi->s_freeblocks_counter, |
1070 | ext2_count_free_blocks(sb)); | 1070 | ext2_count_free_blocks(sb), GFP_KERNEL); |
1071 | if (!err) { | 1071 | if (!err) { |
1072 | err = percpu_counter_init(&sbi->s_freeinodes_counter, | 1072 | err = percpu_counter_init(&sbi->s_freeinodes_counter, |
1073 | ext2_count_free_inodes(sb)); | 1073 | ext2_count_free_inodes(sb), GFP_KERNEL); |
1074 | } | 1074 | } |
1075 | if (!err) { | 1075 | if (!err) { |
1076 | err = percpu_counter_init(&sbi->s_dirs_counter, | 1076 | err = percpu_counter_init(&sbi->s_dirs_counter, |
1077 | ext2_count_dirs(sb)); | 1077 | ext2_count_dirs(sb), GFP_KERNEL); |
1078 | } | 1078 | } |
1079 | if (err) { | 1079 | if (err) { |
1080 | ext2_msg(sb, KERN_ERR, "error: insufficient memory"); | 1080 | ext2_msg(sb, KERN_ERR, "error: insufficient memory"); |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 622e88249024..bb0fdacad058 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
2039 | goto failed_mount2; | 2039 | goto failed_mount2; |
2040 | } | 2040 | } |
2041 | err = percpu_counter_init(&sbi->s_freeblocks_counter, | 2041 | err = percpu_counter_init(&sbi->s_freeblocks_counter, |
2042 | ext3_count_free_blocks(sb)); | 2042 | ext3_count_free_blocks(sb), GFP_KERNEL); |
2043 | if (!err) { | 2043 | if (!err) { |
2044 | err = percpu_counter_init(&sbi->s_freeinodes_counter, | 2044 | err = percpu_counter_init(&sbi->s_freeinodes_counter, |
2045 | ext3_count_free_inodes(sb)); | 2045 | ext3_count_free_inodes(sb), GFP_KERNEL); |
2046 | } | 2046 | } |
2047 | if (!err) { | 2047 | if (!err) { |
2048 | err = percpu_counter_init(&sbi->s_dirs_counter, | 2048 | err = percpu_counter_init(&sbi->s_dirs_counter, |
2049 | ext3_count_dirs(sb)); | 2049 | ext3_count_dirs(sb), GFP_KERNEL); |
2050 | } | 2050 | } |
2051 | if (err) { | 2051 | if (err) { |
2052 | ext3_msg(sb, KERN_ERR, "error: insufficient memory"); | 2052 | ext3_msg(sb, KERN_ERR, "error: insufficient memory"); |
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b28b36e7915..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3892,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
3892 | /* Register extent status tree shrinker */ | 3892 | /* Register extent status tree shrinker */ |
3893 | ext4_es_register_shrinker(sbi); | 3893 | ext4_es_register_shrinker(sbi); |
3894 | 3894 | ||
3895 | if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { | 3895 | err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL); |
3896 | if (err) { | ||
3896 | ext4_msg(sb, KERN_ERR, "insufficient memory"); | 3897 | ext4_msg(sb, KERN_ERR, "insufficient memory"); |
3897 | goto failed_mount3; | 3898 | goto failed_mount3; |
3898 | } | 3899 | } |
@@ -4106,17 +4107,20 @@ no_journal: | |||
4106 | block = ext4_count_free_clusters(sb); | 4107 | block = ext4_count_free_clusters(sb); |
4107 | ext4_free_blocks_count_set(sbi->s_es, | 4108 | ext4_free_blocks_count_set(sbi->s_es, |
4108 | EXT4_C2B(sbi, block)); | 4109 | EXT4_C2B(sbi, block)); |
4109 | err = percpu_counter_init(&sbi->s_freeclusters_counter, block); | 4110 | err = percpu_counter_init(&sbi->s_freeclusters_counter, block, |
4111 | GFP_KERNEL); | ||
4110 | if (!err) { | 4112 | if (!err) { |
4111 | unsigned long freei = ext4_count_free_inodes(sb); | 4113 | unsigned long freei = ext4_count_free_inodes(sb); |
4112 | sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); | 4114 | sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); |
4113 | err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); | 4115 | err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, |
4116 | GFP_KERNEL); | ||
4114 | } | 4117 | } |
4115 | if (!err) | 4118 | if (!err) |
4116 | err = percpu_counter_init(&sbi->s_dirs_counter, | 4119 | err = percpu_counter_init(&sbi->s_dirs_counter, |
4117 | ext4_count_dirs(sb)); | 4120 | ext4_count_dirs(sb), GFP_KERNEL); |
4118 | if (!err) | 4121 | if (!err) |
4119 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); | 4122 | err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, |
4123 | GFP_KERNEL); | ||
4120 | if (err) { | 4124 | if (err) { |
4121 | ext4_msg(sb, KERN_ERR, "insufficient memory"); | 4125 | ext4_msg(sb, KERN_ERR, "insufficient memory"); |
4122 | goto failed_mount6; | 4126 | goto failed_mount6; |
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..0bab12b20460 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages) | |||
331 | 331 | ||
332 | n = (mempages * (PAGE_SIZE / 1024)) / 10; | 332 | n = (mempages * (PAGE_SIZE / 1024)) / 10; |
333 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); | 333 | files_stat.max_files = max_t(unsigned long, n, NR_FILE); |
334 | percpu_counter_init(&nr_files, 0); | 334 | percpu_counter_init(&nr_files, 0, GFP_KERNEL); |
335 | } | 335 | } |
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f2d0eee9d1f1..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2725,7 +2725,7 @@ static int __init dquot_init(void) | |||
2725 | panic("Cannot create dquot hash table"); | 2725 | panic("Cannot create dquot hash table"); |
2726 | 2726 | ||
2727 | for (i = 0; i < _DQST_DQSTAT_LAST; i++) { | 2727 | for (i = 0; i < _DQST_DQSTAT_LAST; i++) { |
2728 | ret = percpu_counter_init(&dqstats.counter[i], 0); | 2728 | ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); |
2729 | if (ret) | 2729 | if (ret) |
2730 | panic("Cannot create dquot stat counters"); | 2730 | panic("Cannot create dquot stat counters"); |
2731 | } | 2731 | } |
diff --git a/fs/super.c b/fs/super.c
index b9a214d2fe98..1b836107acee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) | |||
175 | goto fail; | 175 | goto fail; |
176 | 176 | ||
177 | for (i = 0; i < SB_FREEZE_LEVELS; i++) { | 177 | for (i = 0; i < SB_FREEZE_LEVELS; i++) { |
178 | if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) | 178 | if (percpu_counter_init(&s->s_writers.counter[i], 0, |
179 | GFP_KERNEL) < 0) | ||
179 | goto fail; | 180 | goto fail; |
180 | lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], | 181 | lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], |
181 | &type->s_writers_key[i], 0); | 182 | &type->s_writers_key[i], 0); |
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a1e31f274fcd..c13a0c09faea 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,7 @@ enum { | |||
140 | }; | 140 | }; |
141 | 141 | ||
142 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); | 142 | struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); |
143 | void blk_mq_finish_init(struct request_queue *q); | ||
143 | int blk_mq_register_disk(struct gendisk *); | 144 | int blk_mq_register_disk(struct gendisk *); |
144 | void blk_mq_unregister_disk(struct gendisk *); | 145 | void blk_mq_unregister_disk(struct gendisk *); |
145 | 146 | ||
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
index 4ebc49fae391..0d348e011a6e 100644
--- a/include/linux/flex_proportions.h
+++ b/include/linux/flex_proportions.h
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/percpu_counter.h> | 10 | #include <linux/percpu_counter.h> |
11 | #include <linux/spinlock.h> | 11 | #include <linux/spinlock.h> |
12 | #include <linux/seqlock.h> | 12 | #include <linux/seqlock.h> |
13 | #include <linux/gfp.h> | ||
13 | 14 | ||
14 | /* | 15 | /* |
15 | * When maximum proportion of some event type is specified, this is the | 16 | * When maximum proportion of some event type is specified, this is the |
@@ -32,7 +33,7 @@ struct fprop_global { | |||
32 | seqcount_t sequence; | 33 | seqcount_t sequence; |
33 | }; | 34 | }; |
34 | 35 | ||
35 | int fprop_global_init(struct fprop_global *p); | 36 | int fprop_global_init(struct fprop_global *p, gfp_t gfp); |
36 | void fprop_global_destroy(struct fprop_global *p); | 37 | void fprop_global_destroy(struct fprop_global *p); |
37 | bool fprop_new_period(struct fprop_global *p, int periods); | 38 | bool fprop_new_period(struct fprop_global *p, int periods); |
38 | 39 | ||
@@ -79,7 +80,7 @@ struct fprop_local_percpu { | |||
79 | raw_spinlock_t lock; /* Protect period and numerator */ | 80 | raw_spinlock_t lock; /* Protect period and numerator */ |
80 | }; | 81 | }; |
81 | 82 | ||
82 | int fprop_local_init_percpu(struct fprop_local_percpu *pl); | 83 | int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp); |
83 | void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); | 84 | void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); |
84 | void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); | 85 | void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); |
85 | void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, | 86 | void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, |
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 68a64f11ce02..d5c89e0dd0e6 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -13,7 +13,7 @@ | |||
13 | * | 13 | * |
14 | * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less | 14 | * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less |
15 | * than an atomic_t - this is because of the way shutdown works, see | 15 | * than an atomic_t - this is because of the way shutdown works, see |
16 | * percpu_ref_kill()/PCPU_COUNT_BIAS. | 16 | * percpu_ref_kill()/PERCPU_COUNT_BIAS. |
17 | * | 17 | * |
18 | * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the | 18 | * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the |
19 | * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() | 19 | * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() |
@@ -49,29 +49,60 @@ | |||
49 | #include <linux/kernel.h> | 49 | #include <linux/kernel.h> |
50 | #include <linux/percpu.h> | 50 | #include <linux/percpu.h> |
51 | #include <linux/rcupdate.h> | 51 | #include <linux/rcupdate.h> |
52 | #include <linux/gfp.h> | ||
52 | 53 | ||
53 | struct percpu_ref; | 54 | struct percpu_ref; |
54 | typedef void (percpu_ref_func_t)(struct percpu_ref *); | 55 | typedef void (percpu_ref_func_t)(struct percpu_ref *); |
55 | 56 | ||
57 | /* flags set in the lower bits of percpu_ref->percpu_count_ptr */ | ||
58 | enum { | ||
59 | __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */ | ||
60 | __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */ | ||
61 | __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD, | ||
62 | |||
63 | __PERCPU_REF_FLAG_BITS = 2, | ||
64 | }; | ||
65 | |||
66 | /* @flags for percpu_ref_init() */ | ||
67 | enum { | ||
68 | /* | ||
69 | * Start w/ ref == 1 in atomic mode. Can be switched to percpu | ||
70 | * operation using percpu_ref_switch_to_percpu(). If initialized | ||
71 | * with this flag, the ref will stay in atomic mode until | ||
72 | * percpu_ref_switch_to_percpu() is invoked on it. | ||
73 | */ | ||
74 | PERCPU_REF_INIT_ATOMIC = 1 << 0, | ||
75 | |||
76 | /* | ||
77 | * Start dead w/ ref == 0 in atomic mode. Must be revived with | ||
78 | * percpu_ref_reinit() before used. Implies INIT_ATOMIC. | ||
79 | */ | ||
80 | PERCPU_REF_INIT_DEAD = 1 << 1, | ||
81 | }; | ||
82 | |||
56 | struct percpu_ref { | 83 | struct percpu_ref { |
57 | atomic_t count; | 84 | atomic_long_t count; |
58 | /* | 85 | /* |
59 | * The low bit of the pointer indicates whether the ref is in percpu | 86 | * The low bit of the pointer indicates whether the ref is in percpu |
60 | * mode; if set, then get/put will manipulate the atomic_t. | 87 | * mode; if set, then get/put will manipulate the atomic_t. |
61 | */ | 88 | */ |
62 | unsigned long pcpu_count_ptr; | 89 | unsigned long percpu_count_ptr; |
63 | percpu_ref_func_t *release; | 90 | percpu_ref_func_t *release; |
64 | percpu_ref_func_t *confirm_kill; | 91 | percpu_ref_func_t *confirm_switch; |
92 | bool force_atomic:1; | ||
65 | struct rcu_head rcu; | 93 | struct rcu_head rcu; |
66 | }; | 94 | }; |
67 | 95 | ||
68 | int __must_check percpu_ref_init(struct percpu_ref *ref, | 96 | int __must_check percpu_ref_init(struct percpu_ref *ref, |
69 | percpu_ref_func_t *release); | 97 | percpu_ref_func_t *release, unsigned int flags, |
70 | void percpu_ref_reinit(struct percpu_ref *ref); | 98 | gfp_t gfp); |
71 | void percpu_ref_exit(struct percpu_ref *ref); | 99 | void percpu_ref_exit(struct percpu_ref *ref); |
100 | void percpu_ref_switch_to_atomic(struct percpu_ref *ref, | ||
101 | percpu_ref_func_t *confirm_switch); | ||
102 | void percpu_ref_switch_to_percpu(struct percpu_ref *ref); | ||
72 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, | 103 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, |
73 | percpu_ref_func_t *confirm_kill); | 104 | percpu_ref_func_t *confirm_kill); |
74 | void __percpu_ref_kill_expedited(struct percpu_ref *ref); | 105 | void percpu_ref_reinit(struct percpu_ref *ref); |
75 | 106 | ||
76 | /** | 107 | /** |
77 | * percpu_ref_kill - drop the initial ref | 108 | * percpu_ref_kill - drop the initial ref |
@@ -88,26 +119,24 @@ static inline void percpu_ref_kill(struct percpu_ref *ref) | |||
88 | return percpu_ref_kill_and_confirm(ref, NULL); | 119 | return percpu_ref_kill_and_confirm(ref, NULL); |
89 | } | 120 | } |
90 | 121 | ||
91 | #define PCPU_REF_DEAD 1 | ||
92 | |||
93 | /* | 122 | /* |
94 | * Internal helper. Don't use outside percpu-refcount proper. The | 123 | * Internal helper. Don't use outside percpu-refcount proper. The |
95 | * function doesn't return the pointer and let the caller test it for NULL | 124 | * function doesn't return the pointer and let the caller test it for NULL |
96 | * because doing so forces the compiler to generate two conditional | 125 | * because doing so forces the compiler to generate two conditional |
97 | * branches as it can't assume that @ref->pcpu_count is not NULL. | 126 | * branches as it can't assume that @ref->percpu_count is not NULL. |
98 | */ | 127 | */ |
99 | static inline bool __pcpu_ref_alive(struct percpu_ref *ref, | 128 | static inline bool __ref_is_percpu(struct percpu_ref *ref, |
100 | unsigned __percpu **pcpu_countp) | 129 | unsigned long __percpu **percpu_countp) |
101 | { | 130 | { |
102 | unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); | 131 | unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr); |
103 | 132 | ||
104 | /* paired with smp_store_release() in percpu_ref_reinit() */ | 133 | /* paired with smp_store_release() in percpu_ref_reinit() */ |
105 | smp_read_barrier_depends(); | 134 | smp_read_barrier_depends(); |
106 | 135 | ||
107 | if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) | 136 | if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC)) |
108 | return false; | 137 | return false; |
109 | 138 | ||
110 | *pcpu_countp = (unsigned __percpu *)pcpu_ptr; | 139 | *percpu_countp = (unsigned long __percpu *)percpu_ptr; |
111 | return true; | 140 | return true; |
112 | } | 141 | } |
113 | 142 | ||
@@ -115,18 +144,20 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref, | |||
115 | * percpu_ref_get - increment a percpu refcount | 144 | * percpu_ref_get - increment a percpu refcount |
116 | * @ref: percpu_ref to get | 145 | * @ref: percpu_ref to get |
117 | * | 146 | * |
118 | * Analagous to atomic_inc(). | 147 | * Analagous to atomic_long_inc(). |
119 | */ | 148 | * |
149 | * This function is safe to call as long as @ref is between init and exit. | ||
150 | */ | ||
120 | static inline void percpu_ref_get(struct percpu_ref *ref) | 151 | static inline void percpu_ref_get(struct percpu_ref *ref) |
121 | { | 152 | { |
122 | unsigned __percpu *pcpu_count; | 153 | unsigned long __percpu *percpu_count; |
123 | 154 | ||
124 | rcu_read_lock_sched(); | 155 | rcu_read_lock_sched(); |
125 | 156 | ||
126 | if (__pcpu_ref_alive(ref, &pcpu_count)) | 157 | if (__ref_is_percpu(ref, &percpu_count)) |
127 | this_cpu_inc(*pcpu_count); | 158 | this_cpu_inc(*percpu_count); |
128 | else | 159 | else |
129 | atomic_inc(&ref->count); | 160 | atomic_long_inc(&ref->count); |
130 | 161 | ||
131 | rcu_read_unlock_sched(); | 162 | rcu_read_unlock_sched(); |
132 | } | 163 | } |
@@ -138,20 +169,20 @@ static inline void percpu_ref_get(struct percpu_ref *ref) | |||
138 | * Increment a percpu refcount unless its count already reached zero. | 169 | * Increment a percpu refcount unless its count already reached zero. |
139 | * Returns %true on success; %false on failure. | 170 | * Returns %true on success; %false on failure. |
140 | * | 171 | * |
141 | * The caller is responsible for ensuring that @ref stays accessible. | 172 | * This function is safe to call as long as @ref is between init and exit. |
142 | */ | 173 | */ |
143 | static inline bool percpu_ref_tryget(struct percpu_ref *ref) | 174 | static inline bool percpu_ref_tryget(struct percpu_ref *ref) |
144 | { | 175 | { |
145 | unsigned __percpu *pcpu_count; | 176 | unsigned long __percpu *percpu_count; |
146 | int ret = false; | 177 | int ret; |
147 | 178 | ||
148 | rcu_read_lock_sched(); | 179 | rcu_read_lock_sched(); |
149 | 180 | ||
150 | if (__pcpu_ref_alive(ref, &pcpu_count)) { | 181 | if (__ref_is_percpu(ref, &percpu_count)) { |
151 | this_cpu_inc(*pcpu_count); | 182 | this_cpu_inc(*percpu_count); |
152 | ret = true; | 183 | ret = true; |
153 | } else { | 184 | } else { |
154 | ret = atomic_inc_not_zero(&ref->count); | 185 | ret = atomic_long_inc_not_zero(&ref->count); |
155 | } | 186 | } |
156 | 187 | ||
157 | rcu_read_unlock_sched(); | 188 | rcu_read_unlock_sched(); |
@@ -166,23 +197,26 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref) | |||
166 | * Increment a percpu refcount unless it has already been killed. Returns | 197 | * Increment a percpu refcount unless it has already been killed. Returns |
167 | * %true on success; %false on failure. | 198 | * %true on success; %false on failure. |
168 | * | 199 | * |
169 | * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget | 200 | * Completion of percpu_ref_kill() in itself doesn't guarantee that this |
170 | * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be | 201 | * function will fail. For such guarantee, percpu_ref_kill_and_confirm() |
171 | * used. After the confirm_kill callback is invoked, it's guaranteed that | 202 | * should be used. After the confirm_kill callback is invoked, it's |
172 | * no new reference will be given out by percpu_ref_tryget(). | 203 | * guaranteed that no new reference will be given out by |
204 | * percpu_ref_tryget_live(). | ||
173 | * | 205 | * |
174 | * The caller is responsible for ensuring that @ref stays accessible. | 206 | * This function is safe to call as long as @ref is between init and exit. |
175 | */ | 207 | */ |
176 | static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) | 208 | static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) |
177 | { | 209 | { |
178 | unsigned __percpu *pcpu_count; | 210 | unsigned long __percpu *percpu_count; |
179 | int ret = false; | 211 | int ret = false; |
180 | 212 | ||
181 | rcu_read_lock_sched(); | 213 | rcu_read_lock_sched(); |
182 | 214 | ||
183 | if (__pcpu_ref_alive(ref, &pcpu_count)) { | 215 | if (__ref_is_percpu(ref, &percpu_count)) { |
184 | this_cpu_inc(*pcpu_count); | 216 | this_cpu_inc(*percpu_count); |
185 | ret = true; | 217 | ret = true; |
218 | } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) { | ||
219 | ret = atomic_long_inc_not_zero(&ref->count); | ||
186 | } | 220 | } |
187 | 221 | ||
188 | rcu_read_unlock_sched(); | 222 | rcu_read_unlock_sched(); |
@@ -196,16 +230,18 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) | |||
196 | * | 230 | * |
197 | * Decrement the refcount, and if 0, call the release function (which was passed | 231 | * Decrement the refcount, and if 0, call the release function (which was passed |
198 | * to percpu_ref_init()) | 232 | * to percpu_ref_init()) |
233 | * | ||
234 | * This function is safe to call as long as @ref is between init and exit. | ||
199 | */ | 235 | */ |
200 | static inline void percpu_ref_put(struct percpu_ref *ref) | 236 | static inline void percpu_ref_put(struct percpu_ref *ref) |
201 | { | 237 | { |
202 | unsigned __percpu *pcpu_count; | 238 | unsigned long __percpu *percpu_count; |
203 | 239 | ||
204 | rcu_read_lock_sched(); | 240 | rcu_read_lock_sched(); |
205 | 241 | ||
206 | if (__pcpu_ref_alive(ref, &pcpu_count)) | 242 | if (__ref_is_percpu(ref, &percpu_count)) |
207 | this_cpu_dec(*pcpu_count); | 243 | this_cpu_dec(*percpu_count); |
208 | else if (unlikely(atomic_dec_and_test(&ref->count))) | 244 | else if (unlikely(atomic_long_dec_and_test(&ref->count))) |
209 | ref->release(ref); | 245 | ref->release(ref); |
210 | 246 | ||
211 | rcu_read_unlock_sched(); | 247 | rcu_read_unlock_sched(); |
@@ -216,14 +252,16 @@ static inline void percpu_ref_put(struct percpu_ref *ref) | |||
216 | * @ref: percpu_ref to test | 252 | * @ref: percpu_ref to test |
217 | * | 253 | * |
218 | * Returns %true if @ref reached zero. | 254 | * Returns %true if @ref reached zero. |
255 | * | ||
256 | * This function is safe to call as long as @ref is between init and exit. | ||
219 | */ | 257 | */ |
220 | static inline bool percpu_ref_is_zero(struct percpu_ref *ref) | 258 | static inline bool percpu_ref_is_zero(struct percpu_ref *ref) |
221 | { | 259 | { |
222 | unsigned __percpu *pcpu_count; | 260 | unsigned long __percpu *percpu_count; |
223 | 261 | ||
224 | if (__pcpu_ref_alive(ref, &pcpu_count)) | 262 | if (__ref_is_percpu(ref, &percpu_count)) |
225 | return false; | 263 | return false; |
226 | return !atomic_read(&ref->count); | 264 | return !atomic_long_read(&ref->count); |
227 | } | 265 | } |
228 | 266 | ||
229 | #endif | 267 | #endif |
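
[Editorial note] One practical consequence of the reworked header above: percpu_ref_tryget() keeps succeeding until the count actually reaches zero, while percpu_ref_tryget_live() starts failing as soon as the ref has been marked dead, which is what lookup paths usually want. A hedged sketch of such a lookup follows; struct my_obj, my_obj_lookup() and the radix tree are illustrative assumptions, not code from this patch.

#include <linux/percpu-refcount.h>
#include <linux/radix-tree.h>
#include <linux/rcupdate.h>

struct my_obj {
        struct percpu_ref ref;
        /* ... payload ... */
};

static struct my_obj *my_obj_lookup(struct radix_tree_root *objs,
                                    unsigned long id)
{
        struct my_obj *obj;

        rcu_read_lock();
        obj = radix_tree_lookup(objs, id);
        if (obj && !percpu_ref_tryget_live(&obj->ref))
                obj = NULL;     /* found, but already being torn down */
        rcu_read_unlock();

        return obj;             /* caller drops it with percpu_ref_put() */
}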
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 6f61b61b7996..a3aa63e47637 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -48,9 +48,9 @@ | |||
48 | * intelligent way to determine this would be nice. | 48 | * intelligent way to determine this would be nice. |
49 | */ | 49 | */ |
50 | #if BITS_PER_LONG > 32 | 50 | #if BITS_PER_LONG > 32 |
51 | #define PERCPU_DYNAMIC_RESERVE (20 << 10) | 51 | #define PERCPU_DYNAMIC_RESERVE (28 << 10) |
52 | #else | 52 | #else |
53 | #define PERCPU_DYNAMIC_RESERVE (12 << 10) | 53 | #define PERCPU_DYNAMIC_RESERVE (20 << 10) |
54 | #endif | 54 | #endif |
55 | 55 | ||
56 | extern void *pcpu_base_addr; | 56 | extern void *pcpu_base_addr; |
@@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void); | |||
122 | #endif | 122 | #endif |
123 | extern void __init percpu_init_late(void); | 123 | extern void __init percpu_init_late(void); |
124 | 124 | ||
125 | extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); | ||
125 | extern void __percpu *__alloc_percpu(size_t size, size_t align); | 126 | extern void __percpu *__alloc_percpu(size_t size, size_t align); |
126 | extern void free_percpu(void __percpu *__pdata); | 127 | extern void free_percpu(void __percpu *__pdata); |
127 | extern phys_addr_t per_cpu_ptr_to_phys(void *addr); | 128 | extern phys_addr_t per_cpu_ptr_to_phys(void *addr); |
128 | 129 | ||
129 | #define alloc_percpu(type) \ | 130 | #define alloc_percpu_gfp(type, gfp) \ |
130 | (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) | 131 | (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \ |
132 | __alignof__(type), gfp) | ||
133 | #define alloc_percpu(type) \ | ||
134 | (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ | ||
135 | __alignof__(type)) | ||
131 | 136 | ||
132 | #endif /* __LINUX_PERCPU_H */ | 137 | #endif /* __LINUX_PERCPU_H */ |
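
[Editorial note] The new alloc_percpu_gfp() above is what the rest of the series builds on: percpu memory can now be requested with a caller-supplied gfp mask instead of always implying GFP_KERNEL. Roughly speaking, requests that cannot sleep are served from already-populated percpu space, which is presumably also why PERCPU_DYNAMIC_RESERVE is raised in this hunk. A small illustrative sketch; struct my_stats and the helpers are hypothetical.

#include <linux/percpu.h>
#include <linux/gfp.h>
#include <linux/types.h>

struct my_stats {
        u64 hits;
        u64 misses;
};

/* Caller must not sleep: ask only for percpu memory that is already populated. */
static struct my_stats __percpu *my_stats_alloc_atomic(void)
{
        return alloc_percpu_gfp(struct my_stats, GFP_NOWAIT);
}

/* Process context: behaves like the old alloc_percpu(). */
static struct my_stats __percpu *my_stats_alloc(void)
{
        return alloc_percpu(struct my_stats);
}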
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d5dd4657c8d6..50e50095c8d1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/threads.h> | 12 | #include <linux/threads.h> |
13 | #include <linux/percpu.h> | 13 | #include <linux/percpu.h> |
14 | #include <linux/types.h> | 14 | #include <linux/types.h> |
15 | #include <linux/gfp.h> | ||
15 | 16 | ||
16 | #ifdef CONFIG_SMP | 17 | #ifdef CONFIG_SMP |
17 | 18 | ||
@@ -26,14 +27,14 @@ struct percpu_counter { | |||
26 | 27 | ||
27 | extern int percpu_counter_batch; | 28 | extern int percpu_counter_batch; |
28 | 29 | ||
29 | int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, | 30 | int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, |
30 | struct lock_class_key *key); | 31 | struct lock_class_key *key); |
31 | 32 | ||
32 | #define percpu_counter_init(fbc, value) \ | 33 | #define percpu_counter_init(fbc, value, gfp) \ |
33 | ({ \ | 34 | ({ \ |
34 | static struct lock_class_key __key; \ | 35 | static struct lock_class_key __key; \ |
35 | \ | 36 | \ |
36 | __percpu_counter_init(fbc, value, &__key); \ | 37 | __percpu_counter_init(fbc, value, gfp, &__key); \ |
37 | }) | 38 | }) |
38 | 39 | ||
39 | void percpu_counter_destroy(struct percpu_counter *fbc); | 40 | void percpu_counter_destroy(struct percpu_counter *fbc); |
@@ -89,7 +90,8 @@ struct percpu_counter { | |||
89 | s64 count; | 90 | s64 count; |
90 | }; | 91 | }; |
91 | 92 | ||
92 | static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) | 93 | static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, |
94 | gfp_t gfp) | ||
93 | { | 95 | { |
94 | fbc->count = amount; | 96 | fbc->count = amount; |
95 | return 0; | 97 | return 0; |
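
[Editorial note] The filesystem and networking hunks in this patch are mechanical conversions to this new prototype; they pass GFP_KERNEL because they run in process context where the allocation may sleep (the UP stub above simply ignores the gfp argument). A minimal usage sketch with a hypothetical counter:

#include <linux/percpu_counter.h>
#include <linux/gfp.h>
#include <linux/init.h>

static struct percpu_counter nr_widgets;

static int __init widgets_init(void)
{
        int err;

        /* process context: the per-CPU storage allocation may sleep */
        err = percpu_counter_init(&nr_widgets, 0, GFP_KERNEL);
        if (err)
                return err;

        percpu_counter_inc(&nr_widgets);
        return 0;
}

static void widgets_exit(void)
{
        percpu_counter_destroy(&nr_widgets);
}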
diff --git a/include/linux/proportions.h b/include/linux/proportions.h
index 26a8a4ed9b07..00e8e8fa7358 100644
--- a/include/linux/proportions.h
+++ b/include/linux/proportions.h
@@ -12,6 +12,7 @@ | |||
12 | #include <linux/percpu_counter.h> | 12 | #include <linux/percpu_counter.h> |
13 | #include <linux/spinlock.h> | 13 | #include <linux/spinlock.h> |
14 | #include <linux/mutex.h> | 14 | #include <linux/mutex.h> |
15 | #include <linux/gfp.h> | ||
15 | 16 | ||
16 | struct prop_global { | 17 | struct prop_global { |
17 | /* | 18 | /* |
@@ -40,7 +41,7 @@ struct prop_descriptor { | |||
40 | struct mutex mutex; /* serialize the prop_global switch */ | 41 | struct mutex mutex; /* serialize the prop_global switch */ |
41 | }; | 42 | }; |
42 | 43 | ||
43 | int prop_descriptor_init(struct prop_descriptor *pd, int shift); | 44 | int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp); |
44 | void prop_change_shift(struct prop_descriptor *pd, int new_shift); | 45 | void prop_change_shift(struct prop_descriptor *pd, int new_shift); |
45 | 46 | ||
46 | /* | 47 | /* |
@@ -61,7 +62,7 @@ struct prop_local_percpu { | |||
61 | raw_spinlock_t lock; /* protect the snapshot state */ | 62 | raw_spinlock_t lock; /* protect the snapshot state */ |
62 | }; | 63 | }; |
63 | 64 | ||
64 | int prop_local_init_percpu(struct prop_local_percpu *pl); | 65 | int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp); |
65 | void prop_local_destroy_percpu(struct prop_local_percpu *pl); | 66 | void prop_local_destroy_percpu(struct prop_local_percpu *pl); |
66 | void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); | 67 | void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); |
67 | void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, | 68 | void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, |
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 2f26dfb8450e..1f99a1de0e4f 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val) | |||
63 | 63 | ||
64 | static inline int dst_entries_init(struct dst_ops *dst) | 64 | static inline int dst_entries_init(struct dst_ops *dst) |
65 | { | 65 | { |
66 | return percpu_counter_init(&dst->pcpuc_entries, 0); | 66 | return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL); |
67 | } | 67 | } |
68 | 68 | ||
69 | static inline void dst_entries_destroy(struct dst_ops *dst) | 69 | static inline void dst_entries_destroy(struct dst_ops *dst) |
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 65a8855e99fe..8d1765577acc 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i) | |||
151 | 151 | ||
152 | static inline void init_frag_mem_limit(struct netns_frags *nf) | 152 | static inline void init_frag_mem_limit(struct netns_frags *nf) |
153 | { | 153 | { |
154 | percpu_counter_init(&nf->mem, 0); | 154 | percpu_counter_init(&nf->mem, 0, GFP_KERNEL); |
155 | } | 155 | } |
156 | 156 | ||
157 | static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) | 157 | static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) |
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cab7dc4284dc..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1607,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask) | |||
1607 | goto out; | 1607 | goto out; |
1608 | root_cgrp->id = ret; | 1608 | root_cgrp->id = ret; |
1609 | 1609 | ||
1610 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); | 1610 | ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, |
1611 | GFP_KERNEL); | ||
1611 | if (ret) | 1612 | if (ret) |
1612 | goto out; | 1613 | goto out; |
1613 | 1614 | ||
@@ -4482,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, | |||
4482 | 4483 | ||
4483 | init_and_link_css(css, ss, cgrp); | 4484 | init_and_link_css(css, ss, cgrp); |
4484 | 4485 | ||
4485 | err = percpu_ref_init(&css->refcnt, css_release); | 4486 | err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL); |
4486 | if (err) | 4487 | if (err) |
4487 | goto err_free_css; | 4488 | goto err_free_css; |
4488 | 4489 | ||
@@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, | |||
4555 | goto out_unlock; | 4556 | goto out_unlock; |
4556 | } | 4557 | } |
4557 | 4558 | ||
4558 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release); | 4559 | ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); |
4559 | if (ret) | 4560 | if (ret) |
4560 | goto out_free_cgrp; | 4561 | goto out_free_cgrp; |
4561 | 4562 | ||
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index ebf3bac460b0..8f25652f40d4 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -34,13 +34,13 @@ | |||
34 | */ | 34 | */ |
35 | #include <linux/flex_proportions.h> | 35 | #include <linux/flex_proportions.h> |
36 | 36 | ||
37 | int fprop_global_init(struct fprop_global *p) | 37 | int fprop_global_init(struct fprop_global *p, gfp_t gfp) |
38 | { | 38 | { |
39 | int err; | 39 | int err; |
40 | 40 | ||
41 | p->period = 0; | 41 | p->period = 0; |
42 | /* Use 1 to avoid dealing with periods with 0 events... */ | 42 | /* Use 1 to avoid dealing with periods with 0 events... */ |
43 | err = percpu_counter_init(&p->events, 1); | 43 | err = percpu_counter_init(&p->events, 1, gfp); |
44 | if (err) | 44 | if (err) |
45 | return err; | 45 | return err; |
46 | seqcount_init(&p->sequence); | 46 | seqcount_init(&p->sequence); |
@@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p, | |||
168 | */ | 168 | */ |
169 | #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) | 169 | #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) |
170 | 170 | ||
171 | int fprop_local_init_percpu(struct fprop_local_percpu *pl) | 171 | int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp) |
172 | { | 172 | { |
173 | int err; | 173 | int err; |
174 | 174 | ||
175 | err = percpu_counter_init(&pl->events, 0); | 175 | err = percpu_counter_init(&pl->events, 0, gfp); |
176 | if (err) | 176 | if (err) |
177 | return err; | 177 | return err; |
178 | pl->period = 0; | 178 | pl->period = 0; |
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index a89cf09a8268..6111bcb28376 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -1,6 +1,8 @@ | |||
1 | #define pr_fmt(fmt) "%s: " fmt "\n", __func__ | 1 | #define pr_fmt(fmt) "%s: " fmt "\n", __func__ |
2 | 2 | ||
3 | #include <linux/kernel.h> | 3 | #include <linux/kernel.h> |
4 | #include <linux/sched.h> | ||
5 | #include <linux/wait.h> | ||
4 | #include <linux/percpu-refcount.h> | 6 | #include <linux/percpu-refcount.h> |
5 | 7 | ||
6 | /* | 8 | /* |
@@ -11,8 +13,8 @@ | |||
11 | * percpu counters will all sum to the correct value | 13 | * percpu counters will all sum to the correct value |
12 | * | 14 | * |
13 | * (More precisely: because moduler arithmatic is commutative the sum of all the | 15 | * (More precisely: because moduler arithmatic is commutative the sum of all the |
14 | * pcpu_count vars will be equal to what it would have been if all the gets and | 16 | * percpu_count vars will be equal to what it would have been if all the gets |
15 | * puts were done to a single integer, even if some of the percpu integers | 17 | * and puts were done to a single integer, even if some of the percpu integers |
16 | * overflow or underflow). | 18 | * overflow or underflow). |
17 | * | 19 | * |
18 | * The real trick to implementing percpu refcounts is shutdown. We can't detect | 20 | * The real trick to implementing percpu refcounts is shutdown. We can't detect |
@@ -25,75 +27,64 @@ | |||
25 | * works. | 27 | * works. |
26 | * | 28 | * |
27 | * Converting to non percpu mode is done with some RCUish stuff in | 29 | * Converting to non percpu mode is done with some RCUish stuff in |
28 | * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t | 30 | * percpu_ref_kill. Additionally, we need a bias value so that the |
29 | * can't hit 0 before we've added up all the percpu refs. | 31 | * atomic_long_t can't hit 0 before we've added up all the percpu refs. |
30 | */ | 32 | */ |
31 | 33 | ||
32 | #define PCPU_COUNT_BIAS (1U << 31) | 34 | #define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1)) |
33 | 35 | ||
34 | static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) | 36 | static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq); |
37 | |||
38 | static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref) | ||
35 | { | 39 | { |
36 | return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); | 40 | return (unsigned long __percpu *) |
41 | (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD); | ||
37 | } | 42 | } |
38 | 43 | ||
39 | /** | 44 | /** |
40 | * percpu_ref_init - initialize a percpu refcount | 45 | * percpu_ref_init - initialize a percpu refcount |
41 | * @ref: percpu_ref to initialize | 46 | * @ref: percpu_ref to initialize |
42 | * @release: function which will be called when refcount hits 0 | 47 | * @release: function which will be called when refcount hits 0 |
48 | * @flags: PERCPU_REF_INIT_* flags | ||
49 | * @gfp: allocation mask to use | ||
43 | * | 50 | * |
44 | * Initializes the refcount in single atomic counter mode with a refcount of 1; | 51 | * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a |
45 | * analagous to atomic_set(ref, 1). | 52 | * refcount of 1; analagous to atomic_long_set(ref, 1). See the |
53 | * definitions of PERCPU_REF_INIT_* flags for flag behaviors. | ||
46 | * | 54 | * |
47 | * Note that @release must not sleep - it may potentially be called from RCU | 55 | * Note that @release must not sleep - it may potentially be called from RCU |
48 | * callback context by percpu_ref_kill(). | 56 | * callback context by percpu_ref_kill(). |
49 | */ | 57 | */ |
50 | int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) | 58 | int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release, |
59 | unsigned int flags, gfp_t gfp) | ||
51 | { | 60 | { |
52 | atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); | 61 | size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS, |
62 | __alignof__(unsigned long)); | ||
63 | unsigned long start_count = 0; | ||
53 | 64 | ||
54 | ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); | 65 | ref->percpu_count_ptr = (unsigned long) |
55 | if (!ref->pcpu_count_ptr) | 66 | __alloc_percpu_gfp(sizeof(unsigned long), align, gfp); |
67 | if (!ref->percpu_count_ptr) | ||
56 | return -ENOMEM; | 68 | return -ENOMEM; |
57 | 69 | ||
58 | ref->release = release; | 70 | ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC; |
59 | return 0; | ||
60 | } | ||
61 | EXPORT_SYMBOL_GPL(percpu_ref_init); | ||
62 | |||
63 | /** | ||
64 | * percpu_ref_reinit - re-initialize a percpu refcount | ||
65 | * @ref: perpcu_ref to re-initialize | ||
66 | * | ||
67 | * Re-initialize @ref so that it's in the same state as when it finished | ||
68 | * percpu_ref_init(). @ref must have been initialized successfully, killed | ||
69 | * and reached 0 but not exited. | ||
70 | * | ||
71 | * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while | ||
72 | * this function is in progress. | ||
73 | */ | ||
74 | void percpu_ref_reinit(struct percpu_ref *ref) | ||
75 | { | ||
76 | unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); | ||
77 | int cpu; | ||
78 | 71 | ||
79 | BUG_ON(!pcpu_count); | 72 | if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD)) |
80 | WARN_ON(!percpu_ref_is_zero(ref)); | 73 | ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; |
74 | else | ||
75 | start_count += PERCPU_COUNT_BIAS; | ||
81 | 76 | ||
82 | atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); | 77 | if (flags & PERCPU_REF_INIT_DEAD) |
78 | ref->percpu_count_ptr |= __PERCPU_REF_DEAD; | ||
79 | else | ||
80 | start_count++; | ||
83 | 81 | ||
84 | /* | 82 | atomic_long_set(&ref->count, start_count); |
85 | * Restore per-cpu operation. smp_store_release() is paired with | ||
86 | * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees | ||
87 | * that the zeroing is visible to all percpu accesses which can see | ||
88 | * the following PCPU_REF_DEAD clearing. | ||
89 | */ | ||
90 | for_each_possible_cpu(cpu) | ||
91 | *per_cpu_ptr(pcpu_count, cpu) = 0; | ||
92 | 83 | ||
93 | smp_store_release(&ref->pcpu_count_ptr, | 84 | ref->release = release; |
94 | ref->pcpu_count_ptr & ~PCPU_REF_DEAD); | 85 | return 0; |
95 | } | 86 | } |
96 | EXPORT_SYMBOL_GPL(percpu_ref_reinit); | 87 | EXPORT_SYMBOL_GPL(percpu_ref_init); |
97 | 88 | ||
98 | /** | 89 | /** |
99 | * percpu_ref_exit - undo percpu_ref_init() | 90 | * percpu_ref_exit - undo percpu_ref_init() |
@@ -107,26 +98,39 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit); | |||
107 | */ | 98 | */ |
108 | void percpu_ref_exit(struct percpu_ref *ref) | 99 | void percpu_ref_exit(struct percpu_ref *ref) |
109 | { | 100 | { |
110 | unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); | 101 | unsigned long __percpu *percpu_count = percpu_count_ptr(ref); |
111 | 102 | ||
112 | if (pcpu_count) { | 103 | if (percpu_count) { |
113 | free_percpu(pcpu_count); | 104 | free_percpu(percpu_count); |
114 | ref->pcpu_count_ptr = PCPU_REF_DEAD; | 105 | ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD; |
115 | } | 106 | } |
116 | } | 107 | } |
117 | EXPORT_SYMBOL_GPL(percpu_ref_exit); | 108 | EXPORT_SYMBOL_GPL(percpu_ref_exit); |
118 | 109 | ||
119 | static void percpu_ref_kill_rcu(struct rcu_head *rcu) | 110 | static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu) |
111 | { | ||
112 | struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); | ||
113 | |||
114 | ref->confirm_switch(ref); | ||
115 | ref->confirm_switch = NULL; | ||
116 | wake_up_all(&percpu_ref_switch_waitq); | ||
117 | |||
118 | /* drop ref from percpu_ref_switch_to_atomic() */ | ||
119 | percpu_ref_put(ref); | ||
120 | } | ||
121 | |||
122 | static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu) | ||
120 | { | 123 | { |
121 | struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); | 124 | struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); |
122 | unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); | 125 | unsigned long __percpu *percpu_count = percpu_count_ptr(ref); |
123 | unsigned count = 0; | 126 | unsigned long count = 0; |
124 | int cpu; | 127 | int cpu; |
125 | 128 | ||
126 | for_each_possible_cpu(cpu) | 129 | for_each_possible_cpu(cpu) |
127 | count += *per_cpu_ptr(pcpu_count, cpu); | 130 | count += *per_cpu_ptr(percpu_count, cpu); |
128 | 131 | ||
129 | pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); | 132 | pr_debug("global %ld percpu %ld", |
133 | atomic_long_read(&ref->count), (long)count); | ||
130 | 134 | ||
131 | /* | 135 | /* |
132 | * It's crucial that we sum the percpu counters _before_ adding the sum | 136 | * It's crucial that we sum the percpu counters _before_ adding the sum |
@@ -140,21 +144,137 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) | |||
140 | * reaching 0 before we add the percpu counts. But doing it at the same | 144 | * reaching 0 before we add the percpu counts. But doing it at the same |
141 | * time is equivalent and saves us atomic operations: | 145 | * time is equivalent and saves us atomic operations: |
142 | */ | 146 | */ |
147 | atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count); | ||
148 | |||
149 | WARN_ONCE(atomic_long_read(&ref->count) <= 0, | ||
150 | "percpu ref (%pf) <= 0 (%ld) after switching to atomic", | ||
151 | ref->release, atomic_long_read(&ref->count)); | ||
152 | |||
153 | /* @ref is viewed as dead on all CPUs, send out switch confirmation */ | ||
154 | percpu_ref_call_confirm_rcu(rcu); | ||
155 | } | ||
156 | |||
157 | static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref) | ||
158 | { | ||
159 | } | ||
160 | |||
161 | static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref, | ||
162 | percpu_ref_func_t *confirm_switch) | ||
163 | { | ||
164 | if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) { | ||
165 | /* switching from percpu to atomic */ | ||
166 | ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC; | ||
167 | |||
168 | /* | ||
169 | * Non-NULL ->confirm_switch is used to indicate that | ||
170 | * switching is in progress. Use noop one if unspecified. | ||
171 | */ | ||
172 | WARN_ON_ONCE(ref->confirm_switch); | ||
173 | ref->confirm_switch = | ||
174 | confirm_switch ?: percpu_ref_noop_confirm_switch; | ||
175 | |||
176 | percpu_ref_get(ref); /* put after confirmation */ | ||
177 | call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu); | ||
178 | } else if (confirm_switch) { | ||
179 | /* | ||
180 | * Somebody already set ATOMIC. Switching may still be in | ||
181 | * progress. @confirm_switch must be invoked after the | ||
182 | * switching is complete and a full sched RCU grace period | ||
183 | * has passed. Wait synchronously for the previous | ||
184 | * switching and schedule @confirm_switch invocation. | ||
185 | */ | ||
186 | wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); | ||
187 | ref->confirm_switch = confirm_switch; | ||
143 | 188 | ||
144 | atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); | 189 | percpu_ref_get(ref); /* put after confirmation */ |
190 | call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu); | ||
191 | } | ||
192 | } | ||
193 | |||
194 | /** | ||
195 | * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode | ||
196 | * @ref: percpu_ref to switch to atomic mode | ||
197 | * @confirm_switch: optional confirmation callback | ||
198 | * | ||
199 | * There's no reason to use this function for the usual reference counting. | ||
200 | * Use percpu_ref_kill[_and_confirm](). | ||
201 | * | ||
202 | * Schedule switching of @ref to atomic mode. All its percpu counts will | ||
203 | * be collected to the main atomic counter. On completion, when all CPUs | ||
204 | * are guaraneed to be in atomic mode, @confirm_switch, which may not | ||
205 | * block, is invoked. This function may be invoked concurrently with all | ||
206 | * the get/put operations and can safely be mixed with kill and reinit | ||
207 | * operations. Note that @ref will stay in atomic mode across kill/reinit | ||
208 | * cycles until percpu_ref_switch_to_percpu() is called. | ||
209 | * | ||
210 | * This function normally doesn't block and can be called from any context | ||
211 | * but it may block if @confirm_kill is specified and @ref is already in | ||
212 | * the process of switching to atomic mode. In such cases, @confirm_switch | ||
213 | * will be invoked after the switching is complete. | ||
214 | * | ||
215 | * Due to the way percpu_ref is implemented, @confirm_switch will be called | ||
216 | * after at least one full sched RCU grace period has passed but this is an | ||
217 | * implementation detail and must not be depended upon. | ||
218 | */ | ||
219 | void percpu_ref_switch_to_atomic(struct percpu_ref *ref, | ||
220 | percpu_ref_func_t *confirm_switch) | ||
221 | { | ||
222 | ref->force_atomic = true; | ||
223 | __percpu_ref_switch_to_atomic(ref, confirm_switch); | ||
224 | } | ||
145 | 225 | ||
146 | WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", | 226 | static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref) |
147 | atomic_read(&ref->count)); | 227 | { |
228 | unsigned long __percpu *percpu_count = percpu_count_ptr(ref); | ||
229 | int cpu; | ||
230 | |||
231 | BUG_ON(!percpu_count); | ||
148 | 232 | ||
149 | /* @ref is viewed as dead on all CPUs, send out kill confirmation */ | 233 | if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) |
150 | if (ref->confirm_kill) | 234 | return; |
151 | ref->confirm_kill(ref); | 235 | |
236 | wait_event(percpu_ref_switch_waitq, !ref->confirm_switch); | ||
237 | |||
238 | atomic_long_add(PERCPU_COUNT_BIAS, &ref->count); | ||
152 | 239 | ||
153 | /* | 240 | /* |
154 | * Now we're in single atomic_t mode with a consistent refcount, so it's | 241 | * Restore per-cpu operation. smp_store_release() is paired with |
155 | * safe to drop our initial ref: | 242 | * smp_read_barrier_depends() in __ref_is_percpu() and guarantees |
243 | * that the zeroing is visible to all percpu accesses which can see | ||
244 | * the following __PERCPU_REF_ATOMIC clearing. | ||
156 | */ | 245 | */ |
157 | percpu_ref_put(ref); | 246 | for_each_possible_cpu(cpu) |
247 | *per_cpu_ptr(percpu_count, cpu) = 0; | ||
248 | |||
249 | smp_store_release(&ref->percpu_count_ptr, | ||
250 | ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC); | ||
251 | } | ||
252 | |||
253 | /** | ||
254 | * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode | ||
255 | * @ref: percpu_ref to switch to percpu mode | ||
256 | * | ||
257 | * There's no reason to use this function for the usual reference counting. | ||
258 | * To re-use an expired ref, use percpu_ref_reinit(). | ||
259 | * | ||
260 | * Switch @ref to percpu mode. This function may be invoked concurrently | ||
261 | * with all the get/put operations and can safely be mixed with kill and | ||
262 | * reinit operations. This function reverses the sticky atomic state set | ||
263 | * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is | ||
264 | * dying or dead, the actual switching takes place on the following | ||
265 | * percpu_ref_reinit(). | ||
266 | * | ||
267 | * This function normally doesn't block and can be called from any context | ||
268 | * but it may block if @ref is in the process of switching to atomic mode | ||
269 | * by percpu_ref_switch_to_atomic(). | ||
270 | */ | ||
271 | void percpu_ref_switch_to_percpu(struct percpu_ref *ref) | ||
272 | { | ||
273 | ref->force_atomic = false; | ||
274 | |||
275 | /* a dying or dead ref can't be switched to percpu mode w/o reinit */ | ||
276 | if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD)) | ||
277 | __percpu_ref_switch_to_percpu(ref); | ||
158 | } | 278 | } |
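A hedged sketch of the intended init-atomic-then-go-percpu pattern (names are illustrative, error handling trimmed): start the ref in atomic mode so an early teardown is cheap, and flip it to percpu mode only once the object is fully set up.

static int my_setup(void)
{
	int err = percpu_ref_init(&my_ref, my_release,
				  PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
	if (err)
		return err;

	/* ... registration steps that may still fail and bail out ... */

	/* object is now long-lived; enable the percpu fast path */
	percpu_ref_switch_to_percpu(&my_ref);
	return 0;
}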
159 | 279 | ||
160 | /** | 280 | /** |
@@ -164,39 +284,48 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu) | |||
164 | * | 284 | * |
165 | * Equivalent to percpu_ref_kill() but also schedules kill confirmation if | 285 | * Equivalent to percpu_ref_kill() but also schedules kill confirmation if |
166 | * @confirm_kill is not NULL. @confirm_kill, which may not block, will be | 286 | * @confirm_kill is not NULL. @confirm_kill, which may not block, will be |
167 | * called after @ref is seen as dead from all CPUs - all further | 287 | * called after @ref is seen as dead from all CPUs at which point all |
168 | * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() | 288 | * further invocations of percpu_ref_tryget_live() will fail. See |
169 | * for more details. | 289 | * percpu_ref_tryget_live() for details. |
290 | * | ||
291 | * This function normally doesn't block and can be called from any context | ||
292 | * but it may block if @confirm_kill is specified and @ref is in the | ||
293 | * process of switching to atomic mode by percpu_ref_switch_to_atomic(). | ||
170 | * | 294 | * |
171 | * Due to the way percpu_ref is implemented, @confirm_kill will be called | 295 | * Due to the way percpu_ref is implemented, @confirm_switch will be called |
172 | * after at least one full RCU grace period has passed but this is an | 296 | * after at least one full sched RCU grace period has passed but this is an |
173 | * implementation detail and callers must not depend on it. | 297 | * implementation detail and must not be depended upon. |
174 | */ | 298 | */ |
175 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, | 299 | void percpu_ref_kill_and_confirm(struct percpu_ref *ref, |
176 | percpu_ref_func_t *confirm_kill) | 300 | percpu_ref_func_t *confirm_kill) |
177 | { | 301 | { |
178 | WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, | 302 | WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD, |
179 | "percpu_ref_kill() called more than once!\n"); | 303 | "%s called more than once on %pf!", __func__, ref->release); |
180 | 304 | ||
181 | ref->pcpu_count_ptr |= PCPU_REF_DEAD; | 305 | ref->percpu_count_ptr |= __PERCPU_REF_DEAD; |
182 | ref->confirm_kill = confirm_kill; | 306 | __percpu_ref_switch_to_atomic(ref, confirm_kill); |
183 | 307 | percpu_ref_put(ref); | |
184 | call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); | ||
185 | } | 308 | } |
186 | EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); | 309 | EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); |
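For reference, a minimal hypothetical teardown using the confirmation callback (my_confirm_kill is illustrative): after it runs, percpu_ref_tryget_live() fails on every CPU, while existing holders keep the ref alive until the final put.

static void my_confirm_kill(struct percpu_ref *ref)
{
	/* no new tryget_live() can succeed past this point; must not block */
}

static void my_shutdown(void)
{
	percpu_ref_kill_and_confirm(&my_ref, my_confirm_kill);
	/* my_release() runs when the last holder calls percpu_ref_put() */
}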
187 | 310 | ||
188 | /* | 311 | /** |
189 | * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by | 312 | * percpu_ref_reinit - re-initialize a percpu refcount |
190 | * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 | 314 | * @ref: percpu_ref to re-initialize |
191 | * devel cycle. Do not use anywhere else. | 314 | * |
315 | * Re-initialize @ref so that it's in the same state as when it finished | ||
316 | * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been | ||
317 | * initialized successfully and reached 0 but not exited. | ||
318 | * | ||
319 | * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while | ||
320 | * this function is in progress. | ||
192 | */ | 321 | */ |
193 | void __percpu_ref_kill_expedited(struct percpu_ref *ref) | 322 | void percpu_ref_reinit(struct percpu_ref *ref) |
194 | { | 323 | { |
195 | WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, | 324 | WARN_ON_ONCE(!percpu_ref_is_zero(ref)); |
196 | "percpu_ref_kill() called more than once on %pf!", | ||
197 | ref->release); | ||
198 | 325 | ||
199 | ref->pcpu_count_ptr |= PCPU_REF_DEAD; | 326 | ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD; |
200 | synchronize_sched_expedited(); | 327 | percpu_ref_get(ref); |
201 | percpu_ref_kill_rcu(&ref->rcu); | 328 | if (!ref->force_atomic) |
329 | __percpu_ref_switch_to_percpu(ref); | ||
202 | } | 330 | } |
331 | EXPORT_SYMBOL_GPL(percpu_ref_reinit); | ||
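A sketch of a full kill/reinit cycle (my_waitq and the wakeup in the release callback are assumed, as in the usual freeze/unfreeze pattern; all names are illustrative):

#include <linux/wait.h>

static DECLARE_WAIT_QUEUE_HEAD(my_waitq);

static void my_freeze(void)
{
	percpu_ref_kill(&my_ref);
	/* my_release() is expected to wake_up(&my_waitq) */
	wait_event(my_waitq, percpu_ref_is_zero(&my_ref));
}

static void my_unfreeze(void)
{
	/* back to the post-init state; percpu mode unless force_atomic is set */
	percpu_ref_reinit(&my_ref);
}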
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 7dd33577b905..48144cdae819 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c | |||
@@ -112,13 +112,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) | |||
112 | } | 112 | } |
113 | EXPORT_SYMBOL(__percpu_counter_sum); | 113 | EXPORT_SYMBOL(__percpu_counter_sum); |
114 | 114 | ||
115 | int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, | 115 | int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, |
116 | struct lock_class_key *key) | 116 | struct lock_class_key *key) |
117 | { | 117 | { |
118 | unsigned long flags __maybe_unused; | ||
119 | |||
118 | raw_spin_lock_init(&fbc->lock); | 120 | raw_spin_lock_init(&fbc->lock); |
119 | lockdep_set_class(&fbc->lock, key); | 121 | lockdep_set_class(&fbc->lock, key); |
120 | fbc->count = amount; | 122 | fbc->count = amount; |
121 | fbc->counters = alloc_percpu(s32); | 123 | fbc->counters = alloc_percpu_gfp(s32, gfp); |
122 | if (!fbc->counters) | 124 | if (!fbc->counters) |
123 | return -ENOMEM; | 125 | return -ENOMEM; |
124 | 126 | ||
@@ -126,9 +128,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, | |||
126 | 128 | ||
127 | #ifdef CONFIG_HOTPLUG_CPU | 129 | #ifdef CONFIG_HOTPLUG_CPU |
128 | INIT_LIST_HEAD(&fbc->list); | 130 | INIT_LIST_HEAD(&fbc->list); |
129 | spin_lock(&percpu_counters_lock); | 131 | spin_lock_irqsave(&percpu_counters_lock, flags); |
130 | list_add(&fbc->list, &percpu_counters); | 132 | list_add(&fbc->list, &percpu_counters); |
131 | spin_unlock(&percpu_counters_lock); | 133 | spin_unlock_irqrestore(&percpu_counters_lock, flags); |
132 | #endif | 134 | #endif |
133 | return 0; | 135 | return 0; |
134 | } | 136 | } |
@@ -136,15 +138,17 @@ EXPORT_SYMBOL(__percpu_counter_init); | |||
136 | 138 | ||
137 | void percpu_counter_destroy(struct percpu_counter *fbc) | 139 | void percpu_counter_destroy(struct percpu_counter *fbc) |
138 | { | 140 | { |
141 | unsigned long flags __maybe_unused; | ||
142 | |||
139 | if (!fbc->counters) | 143 | if (!fbc->counters) |
140 | return; | 144 | return; |
141 | 145 | ||
142 | debug_percpu_counter_deactivate(fbc); | 146 | debug_percpu_counter_deactivate(fbc); |
143 | 147 | ||
144 | #ifdef CONFIG_HOTPLUG_CPU | 148 | #ifdef CONFIG_HOTPLUG_CPU |
145 | spin_lock(&percpu_counters_lock); | 149 | spin_lock_irqsave(&percpu_counters_lock, flags); |
146 | list_del(&fbc->list); | 150 | list_del(&fbc->list); |
147 | spin_unlock(&percpu_counters_lock); | 151 | spin_unlock_irqrestore(&percpu_counters_lock, flags); |
148 | #endif | 152 | #endif |
149 | free_percpu(fbc->counters); | 153 | free_percpu(fbc->counters); |
150 | fbc->counters = NULL; | 154 | fbc->counters = NULL; |
@@ -173,7 +177,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, | |||
173 | return NOTIFY_OK; | 177 | return NOTIFY_OK; |
174 | 178 | ||
175 | cpu = (unsigned long)hcpu; | 179 | cpu = (unsigned long)hcpu; |
176 | spin_lock(&percpu_counters_lock); | 180 | spin_lock_irq(&percpu_counters_lock); |
177 | list_for_each_entry(fbc, &percpu_counters, list) { | 181 | list_for_each_entry(fbc, &percpu_counters, list) { |
178 | s32 *pcount; | 182 | s32 *pcount; |
179 | unsigned long flags; | 183 | unsigned long flags; |
@@ -184,7 +188,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb, | |||
184 | *pcount = 0; | 188 | *pcount = 0; |
185 | raw_spin_unlock_irqrestore(&fbc->lock, flags); | 189 | raw_spin_unlock_irqrestore(&fbc->lock, flags); |
186 | } | 190 | } |
187 | spin_unlock(&percpu_counters_lock); | 191 | spin_unlock_irq(&percpu_counters_lock); |
188 | #endif | 192 | #endif |
189 | return NOTIFY_OK; | 193 | return NOTIFY_OK; |
190 | } | 194 | } |
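The percpu_counter API change is mechanical: callers now pass the GFP flags used for the underlying percpu allocation. A minimal hypothetical user (my_counter is an illustrative name):

#include <linux/percpu_counter.h>

static struct percpu_counter my_counter;

static int my_counter_setup(void)
{
	/* as in the call sites below, existing users simply pass GFP_KERNEL */
	return percpu_counter_init(&my_counter, 0, GFP_KERNEL);
}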
diff --git a/lib/proportions.c b/lib/proportions.c index 05df84801b56..6f724298f67a 100644 --- a/lib/proportions.c +++ b/lib/proportions.c | |||
@@ -73,7 +73,7 @@ | |||
73 | #include <linux/proportions.h> | 73 | #include <linux/proportions.h> |
74 | #include <linux/rcupdate.h> | 74 | #include <linux/rcupdate.h> |
75 | 75 | ||
76 | int prop_descriptor_init(struct prop_descriptor *pd, int shift) | 76 | int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp) |
77 | { | 77 | { |
78 | int err; | 78 | int err; |
79 | 79 | ||
@@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift) | |||
83 | pd->index = 0; | 83 | pd->index = 0; |
84 | pd->pg[0].shift = shift; | 84 | pd->pg[0].shift = shift; |
85 | mutex_init(&pd->mutex); | 85 | mutex_init(&pd->mutex); |
86 | err = percpu_counter_init(&pd->pg[0].events, 0); | 86 | err = percpu_counter_init(&pd->pg[0].events, 0, gfp); |
87 | if (err) | 87 | if (err) |
88 | goto out; | 88 | goto out; |
89 | 89 | ||
90 | err = percpu_counter_init(&pd->pg[1].events, 0); | 90 | err = percpu_counter_init(&pd->pg[1].events, 0, gfp); |
91 | if (err) | 91 | if (err) |
92 | percpu_counter_destroy(&pd->pg[0].events); | 92 | percpu_counter_destroy(&pd->pg[0].events); |
93 | 93 | ||
@@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift) | |||
188 | 188 | ||
189 | #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) | 189 | #define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) |
190 | 190 | ||
191 | int prop_local_init_percpu(struct prop_local_percpu *pl) | 191 | int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp) |
192 | { | 192 | { |
193 | raw_spin_lock_init(&pl->lock); | 193 | raw_spin_lock_init(&pl->lock); |
194 | pl->shift = 0; | 194 | pl->shift = 0; |
195 | pl->period = 0; | 195 | pl->period = 0; |
196 | return percpu_counter_init(&pl->events, 0); | 196 | return percpu_counter_init(&pl->events, 0, gfp); |
197 | } | 197 | } |
198 | 198 | ||
199 | void prop_local_destroy_percpu(struct prop_local_percpu *pl) | 199 | void prop_local_destroy_percpu(struct prop_local_percpu *pl) |
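Likewise for the proportions code: the new gfp_t parameter is only forwarded to percpu_counter_init(). A hypothetical caller (shift value and names are illustrative):

#include <linux/proportions.h>

static int my_prop_setup(struct prop_descriptor *pd,
			 struct prop_local_percpu *pl, gfp_t gfp)
{
	int err = prop_descriptor_init(pd, 8, gfp);

	if (err)
		return err;
	return prop_local_init_percpu(pl, gfp);
}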
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b27714f1b40f..12a992b62576 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c | |||
@@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
455 | bdi_wb_init(&bdi->wb, bdi); | 455 | bdi_wb_init(&bdi->wb, bdi); |
456 | 456 | ||
457 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { | 457 | for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { |
458 | err = percpu_counter_init(&bdi->bdi_stat[i], 0); | 458 | err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL); |
459 | if (err) | 459 | if (err) |
460 | goto err; | 460 | goto err; |
461 | } | 461 | } |
@@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi) | |||
470 | bdi->write_bandwidth = INIT_BW; | 470 | bdi->write_bandwidth = INIT_BW; |
471 | bdi->avg_write_bandwidth = INIT_BW; | 471 | bdi->avg_write_bandwidth = INIT_BW; |
472 | 472 | ||
473 | err = fprop_local_init_percpu(&bdi->completions); | 473 | err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL); |
474 | 474 | ||
475 | if (err) { | 475 | if (err) { |
476 | err: | 476 | err: |
@@ -3202,7 +3202,7 @@ void __init mmap_init(void) | |||
3202 | { | 3202 | { |
3203 | int ret; | 3203 | int ret; |
3204 | 3204 | ||
3205 | ret = percpu_counter_init(&vm_committed_as, 0); | 3205 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); |
3206 | VM_BUG_ON(ret); | 3206 | VM_BUG_ON(ret); |
3207 | } | 3207 | } |
3208 | 3208 | ||
diff --git a/mm/nommu.c b/mm/nommu.c index a881d9673c6b..bd1808e194a7 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -539,7 +539,7 @@ void __init mmap_init(void) | |||
539 | { | 539 | { |
540 | int ret; | 540 | int ret; |
541 | 541 | ||
542 | ret = percpu_counter_init(&vm_committed_as, 0); | 542 | ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL); |
543 | VM_BUG_ON(ret); | 543 | VM_BUG_ON(ret); |
544 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); | 544 | vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); |
545 | } | 545 | } |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 35ca7102d421..ff24c9d83112 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void) | |||
1777 | writeback_set_ratelimit(); | 1777 | writeback_set_ratelimit(); |
1778 | register_cpu_notifier(&ratelimit_nb); | 1778 | register_cpu_notifier(&ratelimit_nb); |
1779 | 1779 | ||
1780 | fprop_global_init(&writeout_completions); | 1780 | fprop_global_init(&writeout_completions, GFP_KERNEL); |
1781 | } | 1781 | } |
1782 | 1782 | ||
1783 | /** | 1783 | /** |
diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 89633fefc6a2..10e3d0b8a86d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c | |||
@@ -33,17 +33,14 @@ | |||
33 | 33 | ||
34 | #include <linux/log2.h> | 34 | #include <linux/log2.h> |
35 | 35 | ||
36 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | 36 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, |
37 | int page_start, int page_end) | ||
37 | { | 38 | { |
38 | unsigned int cpu; | ||
39 | |||
40 | for_each_possible_cpu(cpu) | ||
41 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
42 | |||
43 | return 0; | 39 | return 0; |
44 | } | 40 | } |
45 | 41 | ||
46 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | 42 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, |
43 | int page_start, int page_end) | ||
47 | { | 44 | { |
48 | /* nada */ | 45 | /* nada */ |
49 | } | 46 | } |
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void) | |||
70 | 67 | ||
71 | chunk->data = pages; | 68 | chunk->data = pages; |
72 | chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; | 69 | chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; |
70 | |||
71 | spin_lock_irq(&pcpu_lock); | ||
72 | pcpu_chunk_populated(chunk, 0, nr_pages); | ||
73 | spin_unlock_irq(&pcpu_lock); | ||
74 | |||
73 | return chunk; | 75 | return chunk; |
74 | } | 76 | } |
75 | 77 | ||
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 51108165f829..538998a137d2 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c | |||
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, | |||
20 | } | 20 | } |
21 | 21 | ||
22 | /** | 22 | /** |
23 | * pcpu_get_pages_and_bitmap - get temp pages array and bitmap | 23 | * pcpu_get_pages - get temp pages array |
24 | * @chunk: chunk of interest | 24 | * @chunk: chunk of interest |
25 | * @bitmapp: output parameter for bitmap | ||
26 | * @may_alloc: may allocate the array | ||
27 | * | 25 | * |
28 | * Returns pointer to array of pointers to struct page and bitmap, | 26 | * Returns pointer to array of pointers to struct page which can be indexed |
29 | * both of which can be indexed with pcpu_page_idx(). The returned | 27 | * with pcpu_page_idx(). Note that there is only one array and accesses |
30 | * array is cleared to zero and *@bitmapp is copied from | 28 | * should be serialized by pcpu_alloc_mutex. |
31 | * @chunk->populated. Note that there is only one array and bitmap | ||
32 | * and access exclusion is the caller's responsibility. | ||
33 | * | ||
34 | * CONTEXT: | ||
35 | * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. | ||
36 | * Otherwise, don't care. | ||
37 | * | 29 | * |
38 | * RETURNS: | 30 | * RETURNS: |
39 | * Pointer to temp pages array on success, NULL on failure. | 31 | * Pointer to temp pages array on success. |
40 | */ | 32 | */ |
41 | static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | 33 | static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc) |
42 | unsigned long **bitmapp, | ||
43 | bool may_alloc) | ||
44 | { | 34 | { |
45 | static struct page **pages; | 35 | static struct page **pages; |
46 | static unsigned long *bitmap; | ||
47 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); | 36 | size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); |
48 | size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * | ||
49 | sizeof(unsigned long); | ||
50 | |||
51 | if (!pages || !bitmap) { | ||
52 | if (may_alloc && !pages) | ||
53 | pages = pcpu_mem_zalloc(pages_size); | ||
54 | if (may_alloc && !bitmap) | ||
55 | bitmap = pcpu_mem_zalloc(bitmap_size); | ||
56 | if (!pages || !bitmap) | ||
57 | return NULL; | ||
58 | } | ||
59 | 37 | ||
60 | bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); | 38 | lockdep_assert_held(&pcpu_alloc_mutex); |
61 | 39 | ||
62 | *bitmapp = bitmap; | 40 | if (!pages) |
41 | pages = pcpu_mem_zalloc(pages_size); | ||
63 | return pages; | 42 | return pages; |
64 | } | 43 | } |
65 | 44 | ||
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |||
67 | * pcpu_free_pages - free pages which were allocated for @chunk | 46 | * pcpu_free_pages - free pages which were allocated for @chunk |
68 | * @chunk: chunk pages were allocated for | 47 | * @chunk: chunk pages were allocated for |
69 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() | 48 | * @pages: array of pages to be freed, indexed by pcpu_page_idx() |
70 | * @populated: populated bitmap | ||
71 | * @page_start: page index of the first page to be freed | 49 | * @page_start: page index of the first page to be freed |
72 | * @page_end: page index of the last page to be freed + 1 | 50 | * @page_end: page index of the last page to be freed + 1 |
73 | * | 51 | * |
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, | |||
75 | * The pages were allocated for @chunk. | 53 | * The pages were allocated for @chunk. |
76 | */ | 54 | */ |
77 | static void pcpu_free_pages(struct pcpu_chunk *chunk, | 55 | static void pcpu_free_pages(struct pcpu_chunk *chunk, |
78 | struct page **pages, unsigned long *populated, | 56 | struct page **pages, int page_start, int page_end) |
79 | int page_start, int page_end) | ||
80 | { | 57 | { |
81 | unsigned int cpu; | 58 | unsigned int cpu; |
82 | int i; | 59 | int i; |
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, | |||
95 | * pcpu_alloc_pages - allocates pages for @chunk | 72 | * pcpu_alloc_pages - allocates pages for @chunk |
96 | * @chunk: target chunk | 73 | * @chunk: target chunk |
97 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() | 74 | * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() |
98 | * @populated: populated bitmap | ||
99 | * @page_start: page index of the first page to be allocated | 75 | * @page_start: page index of the first page to be allocated |
100 | * @page_end: page index of the last page to be allocated + 1 | 76 | * @page_end: page index of the last page to be allocated + 1 |
101 | * | 77 | * |
@@ -104,8 +80,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, | |||
104 | * content of @pages and will pass it verbatim to pcpu_map_pages(). | 80 | * content of @pages and will pass it verbatim to pcpu_map_pages(). |
105 | */ | 81 | */ |
106 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, | 82 | static int pcpu_alloc_pages(struct pcpu_chunk *chunk, |
107 | struct page **pages, unsigned long *populated, | 83 | struct page **pages, int page_start, int page_end) |
108 | int page_start, int page_end) | ||
109 | { | 84 | { |
110 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; | 85 | const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; |
111 | unsigned int cpu, tcpu; | 86 | unsigned int cpu, tcpu; |
@@ -164,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | |||
164 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk | 139 | * pcpu_unmap_pages - unmap pages out of a pcpu_chunk |
165 | * @chunk: chunk of interest | 140 | * @chunk: chunk of interest |
166 | * @pages: pages array which can be used to pass information to free | 141 | * @pages: pages array which can be used to pass information to free |
167 | * @populated: populated bitmap | ||
168 | * @page_start: page index of the first page to unmap | 142 | * @page_start: page index of the first page to unmap |
169 | * @page_end: page index of the last page to unmap + 1 | 143 | * @page_end: page index of the last page to unmap + 1 |
170 | * | 144 | * |
@@ -175,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) | |||
175 | * proper pre/post flush functions. | 149 | * proper pre/post flush functions. |
176 | */ | 150 | */ |
177 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | 151 | static void pcpu_unmap_pages(struct pcpu_chunk *chunk, |
178 | struct page **pages, unsigned long *populated, | 152 | struct page **pages, int page_start, int page_end) |
179 | int page_start, int page_end) | ||
180 | { | 153 | { |
181 | unsigned int cpu; | 154 | unsigned int cpu; |
182 | int i; | 155 | int i; |
@@ -192,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk, | |||
192 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), | 165 | __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), |
193 | page_end - page_start); | 166 | page_end - page_start); |
194 | } | 167 | } |
195 | |||
196 | bitmap_clear(populated, page_start, page_end - page_start); | ||
197 | } | 168 | } |
198 | 169 | ||
199 | /** | 170 | /** |
@@ -228,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, | |||
228 | * pcpu_map_pages - map pages into a pcpu_chunk | 199 | * pcpu_map_pages - map pages into a pcpu_chunk |
229 | * @chunk: chunk of interest | 200 | * @chunk: chunk of interest |
230 | * @pages: pages array containing pages to be mapped | 201 | * @pages: pages array containing pages to be mapped |
231 | * @populated: populated bitmap | ||
232 | * @page_start: page index of the first page to map | 202 | * @page_start: page index of the first page to map |
233 | * @page_end: page index of the last page to map + 1 | 203 | * @page_end: page index of the last page to map + 1 |
234 | * | 204 | * |
@@ -236,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages, | |||
236 | * caller is responsible for calling pcpu_post_map_flush() after all | 206 | * caller is responsible for calling pcpu_post_map_flush() after all |
237 | * mappings are complete. | 207 | * mappings are complete. |
238 | * | 208 | * |
239 | * This function is responsible for setting corresponding bits in | 209 | * This function is responsible for setting up whatever is necessary for |
240 | * @chunk->populated bitmap and whatever is necessary for reverse | 210 | * reverse lookup (addr -> chunk). |
241 | * lookup (addr -> chunk). | ||
242 | */ | 211 | */ |
243 | static int pcpu_map_pages(struct pcpu_chunk *chunk, | 212 | static int pcpu_map_pages(struct pcpu_chunk *chunk, |
244 | struct page **pages, unsigned long *populated, | 213 | struct page **pages, int page_start, int page_end) |
245 | int page_start, int page_end) | ||
246 | { | 214 | { |
247 | unsigned int cpu, tcpu; | 215 | unsigned int cpu, tcpu; |
248 | int i, err; | 216 | int i, err; |
@@ -253,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk, | |||
253 | page_end - page_start); | 221 | page_end - page_start); |
254 | if (err < 0) | 222 | if (err < 0) |
255 | goto err; | 223 | goto err; |
256 | } | ||
257 | 224 | ||
258 | /* mapping successful, link chunk and mark populated */ | 225 | for (i = page_start; i < page_end; i++) |
259 | for (i = page_start; i < page_end; i++) { | ||
260 | for_each_possible_cpu(cpu) | ||
261 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], | 226 | pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], |
262 | chunk); | 227 | chunk); |
263 | __set_bit(i, populated); | ||
264 | } | 228 | } |
265 | |||
266 | return 0; | 229 | return 0; |
267 | |||
268 | err: | 230 | err: |
269 | for_each_possible_cpu(tcpu) { | 231 | for_each_possible_cpu(tcpu) { |
270 | if (tcpu == cpu) | 232 | if (tcpu == cpu) |
@@ -299,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, | |||
299 | /** | 261 | /** |
300 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk | 262 | * pcpu_populate_chunk - populate and map an area of a pcpu_chunk |
301 | * @chunk: chunk of interest | 263 | * @chunk: chunk of interest |
302 | * @off: offset to the area to populate | 264 | * @page_start: the start page |
303 | * @size: size of the area to populate in bytes | 265 | * @page_end: the end page |
304 | * | 266 | * |
305 | * For each cpu, populate and map pages [@page_start,@page_end) into | 267 | * For each cpu, populate and map pages [@page_start,@page_end) into |
306 | * @chunk. The area is cleared on return. | 268 | * @chunk. |
307 | * | 269 | * |
308 | * CONTEXT: | 270 | * CONTEXT: |
309 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. | 271 | * pcpu_alloc_mutex, does GFP_KERNEL allocation. |
310 | */ | 272 | */ |
311 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) | 273 | static int pcpu_populate_chunk(struct pcpu_chunk *chunk, |
274 | int page_start, int page_end) | ||
312 | { | 275 | { |
313 | int page_start = PFN_DOWN(off); | ||
314 | int page_end = PFN_UP(off + size); | ||
315 | int free_end = page_start, unmap_end = page_start; | ||
316 | struct page **pages; | 276 | struct page **pages; |
317 | unsigned long *populated; | ||
318 | unsigned int cpu; | ||
319 | int rs, re, rc; | ||
320 | |||
321 | /* quick path, check whether all pages are already there */ | ||
322 | rs = page_start; | ||
323 | pcpu_next_pop(chunk, &rs, &re, page_end); | ||
324 | if (rs == page_start && re == page_end) | ||
325 | goto clear; | ||
326 | 277 | ||
327 | /* need to allocate and map pages, this chunk can't be immutable */ | 278 | pages = pcpu_get_pages(chunk); |
328 | WARN_ON(chunk->immutable); | ||
329 | |||
330 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); | ||
331 | if (!pages) | 279 | if (!pages) |
332 | return -ENOMEM; | 280 | return -ENOMEM; |
333 | 281 | ||
334 | /* alloc and map */ | 282 | if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) |
335 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 283 | return -ENOMEM; |
336 | rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); | ||
337 | if (rc) | ||
338 | goto err_free; | ||
339 | free_end = re; | ||
340 | } | ||
341 | 284 | ||
342 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | 285 | if (pcpu_map_pages(chunk, pages, page_start, page_end)) { |
343 | rc = pcpu_map_pages(chunk, pages, populated, rs, re); | 286 | pcpu_free_pages(chunk, pages, page_start, page_end); |
344 | if (rc) | 287 | return -ENOMEM; |
345 | goto err_unmap; | ||
346 | unmap_end = re; | ||
347 | } | 288 | } |
348 | pcpu_post_map_flush(chunk, page_start, page_end); | 289 | pcpu_post_map_flush(chunk, page_start, page_end); |
349 | 290 | ||
350 | /* commit new bitmap */ | ||
351 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
352 | clear: | ||
353 | for_each_possible_cpu(cpu) | ||
354 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
355 | return 0; | 291 | return 0; |
356 | |||
357 | err_unmap: | ||
358 | pcpu_pre_unmap_flush(chunk, page_start, unmap_end); | ||
359 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) | ||
360 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
361 | pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); | ||
362 | err_free: | ||
363 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) | ||
364 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
365 | return rc; | ||
366 | } | 292 | } |
367 | 293 | ||
368 | /** | 294 | /** |
369 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk | 295 | * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk |
370 | * @chunk: chunk to depopulate | 296 | * @chunk: chunk to depopulate |
371 | * @off: offset to the area to depopulate | 297 | * @page_start: the start page |
372 | * @size: size of the area to depopulate in bytes | 298 | * @page_end: the end page |
373 | * | 299 | * |
374 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) | 300 | * For each cpu, depopulate and unmap pages [@page_start,@page_end) |
375 | * from @chunk. If @flush is true, vcache is flushed before unmapping | 301 | * from @chunk. |
376 | * and tlb after. | ||
377 | * | 302 | * |
378 | * CONTEXT: | 303 | * CONTEXT: |
379 | * pcpu_alloc_mutex. | 304 | * pcpu_alloc_mutex. |
380 | */ | 305 | */ |
381 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) | 306 | static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, |
307 | int page_start, int page_end) | ||
382 | { | 308 | { |
383 | int page_start = PFN_DOWN(off); | ||
384 | int page_end = PFN_UP(off + size); | ||
385 | struct page **pages; | 309 | struct page **pages; |
386 | unsigned long *populated; | ||
387 | int rs, re; | ||
388 | |||
389 | /* quick path, check whether it's empty already */ | ||
390 | rs = page_start; | ||
391 | pcpu_next_unpop(chunk, &rs, &re, page_end); | ||
392 | if (rs == page_start && re == page_end) | ||
393 | return; | ||
394 | |||
395 | /* immutable chunks can't be depopulated */ | ||
396 | WARN_ON(chunk->immutable); | ||
397 | 310 | ||
398 | /* | 311 | /* |
399 | * If control reaches here, there must have been at least one | 312 | * If control reaches here, there must have been at least one |
400 | * successful population attempt so the temp pages array must | 313 | * successful population attempt so the temp pages array must |
401 | * be available now. | 314 | * be available now. |
402 | */ | 315 | */ |
403 | pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); | 316 | pages = pcpu_get_pages(chunk); |
404 | BUG_ON(!pages); | 317 | BUG_ON(!pages); |
405 | 318 | ||
406 | /* unmap and free */ | 319 | /* unmap and free */ |
407 | pcpu_pre_unmap_flush(chunk, page_start, page_end); | 320 | pcpu_pre_unmap_flush(chunk, page_start, page_end); |
408 | 321 | ||
409 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | 322 | pcpu_unmap_pages(chunk, pages, page_start, page_end); |
410 | pcpu_unmap_pages(chunk, pages, populated, rs, re); | ||
411 | 323 | ||
412 | /* no need to flush tlb, vmalloc will handle it lazily */ | 324 | /* no need to flush tlb, vmalloc will handle it lazily */ |
413 | 325 | ||
414 | pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) | 326 | pcpu_free_pages(chunk, pages, page_start, page_end); |
415 | pcpu_free_pages(chunk, pages, populated, rs, re); | ||
416 | |||
417 | /* commit new bitmap */ | ||
418 | bitmap_copy(chunk->populated, populated, pcpu_unit_pages); | ||
419 | } | 327 | } |
420 | 328 | ||
421 | static struct pcpu_chunk *pcpu_create_chunk(void) | 329 | static struct pcpu_chunk *pcpu_create_chunk(void) |
diff --git a/mm/percpu.c b/mm/percpu.c index da997f9800bd..014bab65e0ff 100644 --- a/mm/percpu.c +++ b/mm/percpu.c | |||
@@ -76,6 +76,10 @@ | |||
76 | 76 | ||
77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ | 77 | #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ |
78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ | 78 | #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ |
79 | #define PCPU_ATOMIC_MAP_MARGIN_LOW 32 | ||
80 | #define PCPU_ATOMIC_MAP_MARGIN_HIGH 64 | ||
81 | #define PCPU_EMPTY_POP_PAGES_LOW 2 | ||
82 | #define PCPU_EMPTY_POP_PAGES_HIGH 4 | ||
79 | 83 | ||
80 | #ifdef CONFIG_SMP | 84 | #ifdef CONFIG_SMP |
81 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ | 85 | /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ |
@@ -102,12 +106,16 @@ struct pcpu_chunk { | |||
102 | int free_size; /* free bytes in the chunk */ | 106 | int free_size; /* free bytes in the chunk */ |
103 | int contig_hint; /* max contiguous size hint */ | 107 | int contig_hint; /* max contiguous size hint */ |
104 | void *base_addr; /* base address of this chunk */ | 108 | void *base_addr; /* base address of this chunk */ |
109 | |||
105 | int map_used; /* # of map entries used before the sentry */ | 110 | int map_used; /* # of map entries used before the sentry */ |
106 | int map_alloc; /* # of map entries allocated */ | 111 | int map_alloc; /* # of map entries allocated */ |
107 | int *map; /* allocation map */ | 112 | int *map; /* allocation map */ |
113 | struct work_struct map_extend_work;/* async ->map[] extension */ | ||
114 | |||
108 | void *data; /* chunk data */ | 115 | void *data; /* chunk data */ |
109 | int first_free; /* no free below this */ | 116 | int first_free; /* no free below this */ |
110 | bool immutable; /* no [de]population allowed */ | 117 | bool immutable; /* no [de]population allowed */ |
118 | int nr_populated; /* # of populated pages */ | ||
111 | unsigned long populated[]; /* populated bitmap */ | 119 | unsigned long populated[]; /* populated bitmap */ |
112 | }; | 120 | }; |
113 | 121 | ||
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk; | |||
151 | static struct pcpu_chunk *pcpu_reserved_chunk; | 159 | static struct pcpu_chunk *pcpu_reserved_chunk; |
152 | static int pcpu_reserved_chunk_limit; | 160 | static int pcpu_reserved_chunk_limit; |
153 | 161 | ||
162 | static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ | ||
163 | static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ | ||
164 | |||
165 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | ||
166 | |||
154 | /* | 167 | /* |
155 | * Synchronization rules. | 168 | * The number of empty populated pages, protected by pcpu_lock. The |
156 | * | 169 | * reserved chunk doesn't contribute to the count. |
157 | * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former | ||
158 | * protects allocation/reclaim paths, chunks, populated bitmap and | ||
159 | * vmalloc mapping. The latter is a spinlock and protects the index | ||
160 | * data structures - chunk slots, chunks and area maps in chunks. | ||
161 | * | ||
162 | * During allocation, pcpu_alloc_mutex is kept locked all the time and | ||
163 | * pcpu_lock is grabbed and released as necessary. All actual memory | ||
164 | * allocations are done using GFP_KERNEL with pcpu_lock released. In | ||
165 | * general, percpu memory can't be allocated with irq off but | ||
166 | * irqsave/restore are still used in alloc path so that it can be used | ||
167 | * from early init path - sched_init() specifically. | ||
168 | * | ||
169 | * Free path accesses and alters only the index data structures, so it | ||
170 | * can be safely called from atomic context. When memory needs to be | ||
171 | * returned to the system, free path schedules reclaim_work which | ||
172 | * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be | ||
173 | * reclaimed, release both locks and frees the chunks. Note that it's | ||
174 | * necessary to grab both locks to remove a chunk from circulation as | ||
175 | * allocation path might be referencing the chunk with only | ||
176 | * pcpu_alloc_mutex locked. | ||
177 | */ | 170 | */ |
178 | static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ | 171 | static int pcpu_nr_empty_pop_pages; |
179 | static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ | ||
180 | 172 | ||
181 | static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ | 173 | /* |
174 | * Balance work is used to populate or destroy chunks asynchronously. We | ||
175 | * try to keep the number of populated free pages between | ||
176 | * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one | ||
177 | * empty chunk. | ||
178 | */ | ||
179 | static void pcpu_balance_workfn(struct work_struct *work); | ||
180 | static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); | ||
181 | static bool pcpu_async_enabled __read_mostly; | ||
182 | static bool pcpu_atomic_alloc_failed; | ||
182 | 183 | ||
183 | /* reclaim work to release fully free chunks, scheduled from free path */ | 184 | static void pcpu_schedule_balance_work(void) |
184 | static void pcpu_reclaim(struct work_struct *work); | 185 | { |
185 | static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); | 186 | if (pcpu_async_enabled) |
187 | schedule_work(&pcpu_balance_work); | ||
188 | } | ||
186 | 189 | ||
187 | static bool pcpu_addr_in_first_chunk(void *addr) | 190 | static bool pcpu_addr_in_first_chunk(void *addr) |
188 | { | 191 | { |
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size) | |||
315 | } | 318 | } |
316 | 319 | ||
317 | /** | 320 | /** |
321 | * pcpu_count_occupied_pages - count the number of pages an area occupies | ||
322 | * @chunk: chunk of interest | ||
323 | * @i: index of the area in question | ||
324 | * | ||
325 | * Count the number of pages chunk's @i'th area occupies. When the area's | ||
326 | * start and/or end address isn't aligned to page boundary, the straddled | ||
327 | * page is included in the count iff the rest of the page is free. | ||
328 | */ | ||
329 | static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i) | ||
330 | { | ||
331 | int off = chunk->map[i] & ~1; | ||
332 | int end = chunk->map[i + 1] & ~1; | ||
333 | |||
334 | if (!PAGE_ALIGNED(off) && i > 0) { | ||
335 | int prev = chunk->map[i - 1]; | ||
336 | |||
337 | if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE)) | ||
338 | off = round_down(off, PAGE_SIZE); | ||
339 | } | ||
340 | |||
341 | if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) { | ||
342 | int next = chunk->map[i + 1]; | ||
343 | int nend = chunk->map[i + 2] & ~1; | ||
344 | |||
345 | if (!(next & 1) && nend >= round_up(end, PAGE_SIZE)) | ||
346 | end = round_up(end, PAGE_SIZE); | ||
347 | } | ||
348 | |||
349 | return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0); | ||
350 | } | ||
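A worked example under the assumption PAGE_SIZE = 4096 (offsets illustrative): for an area [1024, 12288), the end is page aligned but the start is not. If the previous area is free and reaches back to at least offset 0, off is rounded down to 0 and the area is credited with PFN_DOWN(12288) - PFN_UP(0) = 3 pages. If the previous area is allocated, off stays at 1024, PFN_UP(1024) = 1, and only 2 pages are counted; the straddled first page is not counted because part of it is still in use.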
351 | |||
352 | /** | ||
318 | * pcpu_chunk_relocate - put chunk in the appropriate chunk slot | 353 | * pcpu_chunk_relocate - put chunk in the appropriate chunk slot |
319 | * @chunk: chunk of interest | 354 | * @chunk: chunk of interest |
320 | * @oslot: the previous slot it was on | 355 | * @oslot: the previous slot it was on |
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
342 | /** | 377 | /** |
343 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended | 378 | * pcpu_need_to_extend - determine whether chunk area map needs to be extended |
344 | * @chunk: chunk of interest | 379 | * @chunk: chunk of interest |
380 | * @is_atomic: the allocation context | ||
345 | * | 381 | * |
346 | * Determine whether area map of @chunk needs to be extended to | 382 | * Determine whether area map of @chunk needs to be extended. If |
347 | * accommodate a new allocation. | 383 | * @is_atomic, only the amount necessary for a new allocation is |
384 | * considered; however, async extension is scheduled if the amount left is | ||
385 | * low. If !@is_atomic, it aims for more empty space. Combined, this | ||
386 | * ensures that the map is likely to have enough available space to | ||
387 | * accommodate atomic allocations which can't extend maps directly. | ||
348 | * | 388 | * |
349 | * CONTEXT: | 389 | * CONTEXT: |
350 | * pcpu_lock. | 390 | * pcpu_lock. |
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) | |||
353 | * New target map allocation length if extension is necessary, 0 | 393 | * New target map allocation length if extension is necessary, 0 |
354 | * otherwise. | 394 | * otherwise. |
355 | */ | 395 | */ |
356 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk) | 396 | static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) |
357 | { | 397 | { |
358 | int new_alloc; | 398 | int margin, new_alloc; |
399 | |||
400 | if (is_atomic) { | ||
401 | margin = 3; | ||
402 | |||
403 | if (chunk->map_alloc < | ||
404 | chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && | ||
405 | pcpu_async_enabled) | ||
406 | schedule_work(&chunk->map_extend_work); | ||
407 | } else { | ||
408 | margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; | ||
409 | } | ||
359 | 410 | ||
360 | if (chunk->map_alloc >= chunk->map_used + 3) | 411 | if (chunk->map_alloc >= chunk->map_used + margin) |
361 | return 0; | 412 | return 0; |
362 | 413 | ||
363 | new_alloc = PCPU_DFL_MAP_ALLOC; | 414 | new_alloc = PCPU_DFL_MAP_ALLOC; |
364 | while (new_alloc < chunk->map_used + 3) | 415 | while (new_alloc < chunk->map_used + margin) |
365 | new_alloc *= 2; | 416 | new_alloc *= 2; |
366 | 417 | ||
367 | return new_alloc; | 418 | return new_alloc; |
@@ -418,11 +469,76 @@ out_unlock: | |||
418 | return 0; | 469 | return 0; |
419 | } | 470 | } |
420 | 471 | ||
472 | static void pcpu_map_extend_workfn(struct work_struct *work) | ||
473 | { | ||
474 | struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, | ||
475 | map_extend_work); | ||
476 | int new_alloc; | ||
477 | |||
478 | spin_lock_irq(&pcpu_lock); | ||
479 | new_alloc = pcpu_need_to_extend(chunk, false); | ||
480 | spin_unlock_irq(&pcpu_lock); | ||
481 | |||
482 | if (new_alloc) | ||
483 | pcpu_extend_area_map(chunk, new_alloc); | ||
484 | } | ||
485 | |||
486 | /** | ||
487 | * pcpu_fit_in_area - try to fit the requested allocation in a candidate area | ||
488 | * @chunk: chunk the candidate area belongs to | ||
489 | * @off: the offset to the start of the candidate area | ||
490 | * @this_size: the size of the candidate area | ||
491 | * @size: the size of the target allocation | ||
492 | * @align: the alignment of the target allocation | ||
493 | * @pop_only: only allocate from already populated region | ||
494 | * | ||
495 | * We're trying to allocate @size bytes aligned at @align. @chunk's area | ||
496 | * at @off sized @this_size is a candidate. This function determines | ||
497 | * whether the target allocation fits in the candidate area and returns the | ||
498 | * number of bytes to pad after @off. If the target area doesn't fit, -1 | ||
499 | * is returned. | ||
500 | * | ||
501 | * If @pop_only is %true, this function only considers the already | ||
502 | * populated part of the candidate area. | ||
503 | */ | ||
504 | static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size, | ||
505 | int size, int align, bool pop_only) | ||
506 | { | ||
507 | int cand_off = off; | ||
508 | |||
509 | while (true) { | ||
510 | int head = ALIGN(cand_off, align) - off; | ||
511 | int page_start, page_end, rs, re; | ||
512 | |||
513 | if (this_size < head + size) | ||
514 | return -1; | ||
515 | |||
516 | if (!pop_only) | ||
517 | return head; | ||
518 | |||
519 | /* | ||
520 | * If the first unpopulated page is beyond the end of the | ||
521 | * allocation, the whole allocation is populated; | ||
522 | * otherwise, retry from the end of the unpopulated area. | ||
523 | */ | ||
524 | page_start = PFN_DOWN(head + off); | ||
525 | page_end = PFN_UP(head + off + size); | ||
526 | |||
527 | rs = page_start; | ||
528 | pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size)); | ||
529 | if (rs >= page_end) | ||
530 | return head; | ||
531 | cand_off = re * PAGE_SIZE; | ||
532 | } | ||
533 | } | ||
534 | |||
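An illustrative walk-through of the pop_only retry (PAGE_SIZE = 4096, offsets hypothetical): suppose a free area covers [8192, 24576), i.e. pages 2-5, page 3 is unpopulated, and an atomic allocation asks for 8192 bytes aligned to 4096. The first candidate offset 8192 would span pages 2-3 and fails the populated check, so cand_off is advanced to 16384, just past the unpopulated run. The second pass gives head = 8192, the request now covers the populated pages 4-5, and 8192 is returned as the padding before the allocation.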
421 | /** | 535 | /** |
422 | * pcpu_alloc_area - allocate area from a pcpu_chunk | 536 | * pcpu_alloc_area - allocate area from a pcpu_chunk |
423 | * @chunk: chunk of interest | 537 | * @chunk: chunk of interest |
424 | * @size: wanted size in bytes | 538 | * @size: wanted size in bytes |
425 | * @align: wanted align | 539 | * @align: wanted align |
540 | * @pop_only: allocate only from the populated area | ||
541 | * @occ_pages_p: out param for the number of pages the area occupies | ||
426 | * | 542 | * |
427 | * Try to allocate @size bytes area aligned at @align from @chunk. | 543 | * Try to allocate @size bytes area aligned at @align from @chunk. |
428 | * Note that this function only allocates the offset. It doesn't | 544 | * Note that this function only allocates the offset. It doesn't |
@@ -437,7 +553,8 @@ out_unlock: | |||
437 | * Allocated offset in @chunk on success, -1 if no matching area is | 553 | * Allocated offset in @chunk on success, -1 if no matching area is |
438 | * found. | 554 | * found. |
439 | */ | 555 | */ |
440 | static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | 556 | static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align, |
557 | bool pop_only, int *occ_pages_p) | ||
441 | { | 558 | { |
442 | int oslot = pcpu_chunk_slot(chunk); | 559 | int oslot = pcpu_chunk_slot(chunk); |
443 | int max_contig = 0; | 560 | int max_contig = 0; |
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
453 | if (off & 1) | 570 | if (off & 1) |
454 | continue; | 571 | continue; |
455 | 572 | ||
456 | /* extra for alignment requirement */ | ||
457 | head = ALIGN(off, align) - off; | ||
458 | |||
459 | this_size = (p[1] & ~1) - off; | 573 | this_size = (p[1] & ~1) - off; |
460 | if (this_size < head + size) { | 574 | |
575 | head = pcpu_fit_in_area(chunk, off, this_size, size, align, | ||
576 | pop_only); | ||
577 | if (head < 0) { | ||
461 | if (!seen_free) { | 578 | if (!seen_free) { |
462 | chunk->first_free = i; | 579 | chunk->first_free = i; |
463 | seen_free = true; | 580 | seen_free = true; |
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
526 | chunk->free_size -= size; | 643 | chunk->free_size -= size; |
527 | *p |= 1; | 644 | *p |= 1; |
528 | 645 | ||
646 | *occ_pages_p = pcpu_count_occupied_pages(chunk, i); | ||
529 | pcpu_chunk_relocate(chunk, oslot); | 647 | pcpu_chunk_relocate(chunk, oslot); |
530 | return off; | 648 | return off; |
531 | } | 649 | } |
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
541 | * pcpu_free_area - free area to a pcpu_chunk | 659 | * pcpu_free_area - free area to a pcpu_chunk |
542 | * @chunk: chunk of interest | 660 | * @chunk: chunk of interest |
543 | * @freeme: offset of area to free | 661 | * @freeme: offset of area to free |
662 | * @occ_pages_p: out param for the number of pages the area occupies | ||
544 | * | 663 | * |
545 | * Free area starting from @freeme to @chunk. Note that this function | 664 | * Free area starting from @freeme to @chunk. Note that this function |
546 | * only modifies the allocation map. It doesn't depopulate or unmap | 665 | * only modifies the allocation map. It doesn't depopulate or unmap |
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) | |||
549 | * CONTEXT: | 668 | * CONTEXT: |
550 | * pcpu_lock. | 669 | * pcpu_lock. |
551 | */ | 670 | */ |
552 | static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | 671 | static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme, |
672 | int *occ_pages_p) | ||
553 | { | 673 | { |
554 | int oslot = pcpu_chunk_slot(chunk); | 674 | int oslot = pcpu_chunk_slot(chunk); |
555 | int off = 0; | 675 | int off = 0; |
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) | |||
580 | *p = off &= ~1; | 700 | *p = off &= ~1; |
581 | chunk->free_size += (p[1] & ~1) - off; | 701 | chunk->free_size += (p[1] & ~1) - off; |
582 | 702 | ||
703 | *occ_pages_p = pcpu_count_occupied_pages(chunk, i); | ||
704 | |||
583 | /* merge with next? */ | 705 | /* merge with next? */ |
584 | if (!(p[1] & 1)) | 706 | if (!(p[1] & 1)) |
585 | to_free++; | 707 | to_free++; |
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) | |||
620 | chunk->map_used = 1; | 742 | chunk->map_used = 1; |
621 | 743 | ||
622 | INIT_LIST_HEAD(&chunk->list); | 744 | INIT_LIST_HEAD(&chunk->list); |
745 | INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); | ||
623 | chunk->free_size = pcpu_unit_size; | 746 | chunk->free_size = pcpu_unit_size; |
624 | chunk->contig_hint = pcpu_unit_size; | 747 | chunk->contig_hint = pcpu_unit_size; |
625 | 748 | ||
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) | |||
634 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); | 757 | pcpu_mem_free(chunk, pcpu_chunk_struct_size); |
635 | } | 758 | } |
636 | 759 | ||
760 | /** | ||
761 | * pcpu_chunk_populated - post-population bookkeeping | ||
762 | * @chunk: pcpu_chunk which got populated | ||
763 | * @page_start: the start page | ||
764 | * @page_end: the end page | ||
765 | * | ||
766 | * Pages in [@page_start,@page_end) have been populated to @chunk. Update | ||
767 | * the bookkeeping information accordingly. Must be called after each | ||
768 | * successful population. | ||
769 | */ | ||
770 | static void pcpu_chunk_populated(struct pcpu_chunk *chunk, | ||
771 | int page_start, int page_end) | ||
772 | { | ||
773 | int nr = page_end - page_start; | ||
774 | |||
775 | lockdep_assert_held(&pcpu_lock); | ||
776 | |||
777 | bitmap_set(chunk->populated, page_start, nr); | ||
778 | chunk->nr_populated += nr; | ||
779 | pcpu_nr_empty_pop_pages += nr; | ||
780 | } | ||
781 | |||
782 | /** | ||
783 | * pcpu_chunk_depopulated - post-depopulation bookkeeping | ||
784 | * @chunk: pcpu_chunk which got depopulated | ||
785 | * @page_start: the start page | ||
786 | * @page_end: the end page | ||
787 | * | ||
788 | * Pages in [@page_start,@page_end) have been depopulated from @chunk. | ||
789 | * Update the bookkeeping information accordingly. Must be called after | ||
790 | * each successful depopulation. | ||
791 | */ | ||
792 | static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, | ||
793 | int page_start, int page_end) | ||
794 | { | ||
795 | int nr = page_end - page_start; | ||
796 | |||
797 | lockdep_assert_held(&pcpu_lock); | ||
798 | |||
799 | bitmap_clear(chunk->populated, page_start, nr); | ||
800 | chunk->nr_populated -= nr; | ||
801 | pcpu_nr_empty_pop_pages -= nr; | ||
802 | } | ||
803 | |||
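A sketch of how these helpers are meant to be paired with population (compare the percpu-km hunk above and the pcpu_alloc() hunk below); the wrapper name is hypothetical and pcpu_alloc_mutex is assumed to be held by the caller:

static int my_populate_range(struct pcpu_chunk *chunk, int rs, int re)
{
	int ret = pcpu_populate_chunk(chunk, rs, re);

	if (ret)
		return ret;

	/* record the new pages so nr_populated and
	 * pcpu_nr_empty_pop_pages stay in sync */
	spin_lock_irq(&pcpu_lock);
	pcpu_chunk_populated(chunk, rs, re);
	spin_unlock_irq(&pcpu_lock);
	return 0;
}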
637 | /* | 804 | /* |
638 | * Chunk management implementation. | 805 | * Chunk management implementation. |
639 | * | 806 | * |
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) | |||
695 | * @size: size of area to allocate in bytes | 862 | * @size: size of area to allocate in bytes |
696 | * @align: alignment of area (max PAGE_SIZE) | 863 | * @align: alignment of area (max PAGE_SIZE) |
697 | * @reserved: allocate from the reserved chunk if available | 864 | * @reserved: allocate from the reserved chunk if available |
865 | * @gfp: allocation flags | ||
698 | * | 866 | * |
699 | * Allocate percpu area of @size bytes aligned at @align. | 867 | * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't |
700 | * | 868 | * contain %GFP_KERNEL, the allocation is atomic. |
701 | * CONTEXT: | ||
702 | * Does GFP_KERNEL allocation. | ||
703 | * | 869 | * |
704 | * RETURNS: | 870 | * RETURNS: |
705 | * Percpu pointer to the allocated area on success, NULL on failure. | 871 | * Percpu pointer to the allocated area on success, NULL on failure. |
706 | */ | 872 | */ |
707 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | 873 | static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, |
874 | gfp_t gfp) | ||
708 | { | 875 | { |
709 | static int warn_limit = 10; | 876 | static int warn_limit = 10; |
710 | struct pcpu_chunk *chunk; | 877 | struct pcpu_chunk *chunk; |
711 | const char *err; | 878 | const char *err; |
712 | int slot, off, new_alloc; | 879 | bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; |
880 | int occ_pages = 0; | ||
881 | int slot, off, new_alloc, cpu, ret; | ||
713 | unsigned long flags; | 882 | unsigned long flags; |
714 | void __percpu *ptr; | 883 | void __percpu *ptr; |
715 | 884 | ||
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
728 | return NULL; | 897 | return NULL; |
729 | } | 898 | } |
730 | 899 | ||
731 | mutex_lock(&pcpu_alloc_mutex); | ||
732 | spin_lock_irqsave(&pcpu_lock, flags); | 900 | spin_lock_irqsave(&pcpu_lock, flags); |
733 | 901 | ||
734 | /* serve reserved allocations from the reserved chunk if available */ | 902 | /* serve reserved allocations from the reserved chunk if available */ |
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) | |||
740 | goto fail_unlock; | 908 | goto fail_unlock; |
741 | } | 909 | } |
742 | 910 | ||
743 | while ((new_alloc = pcpu_need_to_extend(chunk))) { | 911 | while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) { |
744 | spin_unlock_irqrestore(&pcpu_lock, flags); | 912 | spin_unlock_irqrestore(&pcpu_lock, flags); |
745 | if (pcpu_extend_area_map(chunk, new_alloc) < 0) { | 913 | if (is_atomic || |
914 | pcpu_extend_area_map(chunk, new_alloc) < 0) { | ||
746 | err = "failed to extend area map of reserved chunk"; | 915 | err = "failed to extend area map of reserved chunk"; |
747 | goto fail_unlock_mutex; | 916 | goto fail; |
748 | } | 917 | } |
749 | spin_lock_irqsave(&pcpu_lock, flags); | 918 | spin_lock_irqsave(&pcpu_lock, flags); |
750 | } | 919 | } |
751 | 920 | ||
752 | off = pcpu_alloc_area(chunk, size, align); | 921 | off = pcpu_alloc_area(chunk, size, align, is_atomic, |
922 | &occ_pages); | ||
753 | if (off >= 0) | 923 | if (off >= 0) |
754 | goto area_found; | 924 | goto area_found; |
755 | 925 | ||
@@ -764,13 +934,15 @@ restart: | |||
764 | if (size > chunk->contig_hint) | 934 | if (size > chunk->contig_hint) |
765 | continue; | 935 | continue; |
766 | 936 | ||
767 | new_alloc = pcpu_need_to_extend(chunk); | 937 | new_alloc = pcpu_need_to_extend(chunk, is_atomic); |
768 | if (new_alloc) { | 938 | if (new_alloc) { |
939 | if (is_atomic) | ||
940 | continue; | ||
769 | spin_unlock_irqrestore(&pcpu_lock, flags); | 941 | spin_unlock_irqrestore(&pcpu_lock, flags); |
770 | if (pcpu_extend_area_map(chunk, | 942 | if (pcpu_extend_area_map(chunk, |
771 | new_alloc) < 0) { | 943 | new_alloc) < 0) { |
772 | err = "failed to extend area map"; | 944 | err = "failed to extend area map"; |
773 | goto fail_unlock_mutex; | 945 | goto fail; |
774 | } | 946 | } |
775 | spin_lock_irqsave(&pcpu_lock, flags); | 947 | spin_lock_irqsave(&pcpu_lock, flags); |
776 | /* | 948 | /* |
@@ -780,74 +952,134 @@ restart: | |||
780 | goto restart; | 952 | goto restart; |
781 | } | 953 | } |
782 | 954 | ||
783 | off = pcpu_alloc_area(chunk, size, align); | 955 | off = pcpu_alloc_area(chunk, size, align, is_atomic, |
956 | &occ_pages); | ||
784 | if (off >= 0) | 957 | if (off >= 0) |
785 | goto area_found; | 958 | goto area_found; |
786 | } | 959 | } |
787 | } | 960 | } |
788 | 961 | ||
789 | /* hmmm... no space left, create a new chunk */ | ||
790 | spin_unlock_irqrestore(&pcpu_lock, flags); | 962 | spin_unlock_irqrestore(&pcpu_lock, flags); |
791 | 963 | ||
792 | chunk = pcpu_create_chunk(); | 964 | /* |
793 | if (!chunk) { | 965 | * No space left. Create a new chunk. We don't want multiple |
794 | err = "failed to allocate new chunk"; | 966 | * tasks to create chunks simultaneously. Serialize and create iff |
795 | goto fail_unlock_mutex; | 967 | * there's still no empty chunk after grabbing the mutex. |
968 | */ | ||
969 | if (is_atomic) | ||
970 | goto fail; | ||
971 | |||
972 | mutex_lock(&pcpu_alloc_mutex); | ||
973 | |||
974 | if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { | ||
975 | chunk = pcpu_create_chunk(); | ||
976 | if (!chunk) { | ||
977 | mutex_unlock(&pcpu_alloc_mutex); | ||
978 | err = "failed to allocate new chunk"; | ||
979 | goto fail; | ||
980 | } | ||
981 | |||
982 | spin_lock_irqsave(&pcpu_lock, flags); | ||
983 | pcpu_chunk_relocate(chunk, -1); | ||
984 | } else { | ||
985 | spin_lock_irqsave(&pcpu_lock, flags); | ||
796 | } | 986 | } |
797 | 987 | ||
798 | spin_lock_irqsave(&pcpu_lock, flags); | 988 | mutex_unlock(&pcpu_alloc_mutex); |
799 | pcpu_chunk_relocate(chunk, -1); | ||
800 | goto restart; | 989 | goto restart; |
801 | 990 | ||
802 | area_found: | 991 | area_found: |
803 | spin_unlock_irqrestore(&pcpu_lock, flags); | 992 | spin_unlock_irqrestore(&pcpu_lock, flags); |
804 | 993 | ||
805 | /* populate, map and clear the area */ | 994 | /* populate if not all pages are already there */ |
806 | if (pcpu_populate_chunk(chunk, off, size)) { | 995 | if (!is_atomic) { |
807 | spin_lock_irqsave(&pcpu_lock, flags); | 996 | int page_start, page_end, rs, re; |
808 | pcpu_free_area(chunk, off); | 997 | |
809 | err = "failed to populate"; | 998 | mutex_lock(&pcpu_alloc_mutex); |
810 | goto fail_unlock; | 999 | |
1000 | page_start = PFN_DOWN(off); | ||
1001 | page_end = PFN_UP(off + size); | ||
1002 | |||
1003 | pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { | ||
1004 | WARN_ON(chunk->immutable); | ||
1005 | |||
1006 | ret = pcpu_populate_chunk(chunk, rs, re); | ||
1007 | |||
1008 | spin_lock_irqsave(&pcpu_lock, flags); | ||
1009 | if (ret) { | ||
1010 | mutex_unlock(&pcpu_alloc_mutex); | ||
1011 | pcpu_free_area(chunk, off, &occ_pages); | ||
1012 | err = "failed to populate"; | ||
1013 | goto fail_unlock; | ||
1014 | } | ||
1015 | pcpu_chunk_populated(chunk, rs, re); | ||
1016 | spin_unlock_irqrestore(&pcpu_lock, flags); | ||
1017 | } | ||
1018 | |||
1019 | mutex_unlock(&pcpu_alloc_mutex); | ||
811 | } | 1020 | } |
812 | 1021 | ||
813 | mutex_unlock(&pcpu_alloc_mutex); | 1022 | if (chunk != pcpu_reserved_chunk) |
1023 | pcpu_nr_empty_pop_pages -= occ_pages; | ||
1024 | |||
1025 | if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) | ||
1026 | pcpu_schedule_balance_work(); | ||
1027 | |||
1028 | /* clear the areas and return address relative to base address */ | ||
1029 | for_each_possible_cpu(cpu) | ||
1030 | memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); | ||
814 | 1031 | ||
815 | /* return address relative to base address */ | ||
816 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); | 1032 | ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); |
817 | kmemleak_alloc_percpu(ptr, size); | 1033 | kmemleak_alloc_percpu(ptr, size); |
818 | return ptr; | 1034 | return ptr; |
819 | 1035 | ||
820 | fail_unlock: | 1036 | fail_unlock: |
821 | spin_unlock_irqrestore(&pcpu_lock, flags); | 1037 | spin_unlock_irqrestore(&pcpu_lock, flags); |
822 | fail_unlock_mutex: | 1038 | fail: |
823 | mutex_unlock(&pcpu_alloc_mutex); | 1039 | if (!is_atomic && warn_limit) { |
824 | if (warn_limit) { | 1040 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n", |
825 | pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " | 1041 | size, align, is_atomic, err); |
826 | "%s\n", size, align, err); | ||
827 | dump_stack(); | 1042 | dump_stack(); |
828 | if (!--warn_limit) | 1043 | if (!--warn_limit) |
829 | pr_info("PERCPU: limit reached, disable warning\n"); | 1044 | pr_info("PERCPU: limit reached, disable warning\n"); |
830 | } | 1045 | } |
1046 | if (is_atomic) { | ||
1047 | /* see the flag handling in pcpu_balance_workfn() */ | ||
1048 | pcpu_atomic_alloc_failed = true; | ||
1049 | pcpu_schedule_balance_work(); | ||
1050 | } | ||
831 | return NULL; | 1051 | return NULL; |
832 | } | 1052 | } |
833 | 1053 | ||
834 | /** | 1054 | /** |
835 | * __alloc_percpu - allocate dynamic percpu area | 1055 | * __alloc_percpu_gfp - allocate dynamic percpu area |
836 | * @size: size of area to allocate in bytes | 1056 | * @size: size of area to allocate in bytes |
837 | * @align: alignment of area (max PAGE_SIZE) | 1057 | * @align: alignment of area (max PAGE_SIZE) |
1058 | * @gfp: allocation flags | ||
838 | * | 1059 | * |
839 | * Allocate zero-filled percpu area of @size bytes aligned at @align. | 1060 | * Allocate zero-filled percpu area of @size bytes aligned at @align. If |
840 | * Might sleep. Might trigger writeouts. | 1061 | * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can |
841 | * | 1062 | * be called from any context but is a lot more likely to fail. |
842 | * CONTEXT: | ||
843 | * Does GFP_KERNEL allocation. | ||
844 | * | 1063 | * |
845 | * RETURNS: | 1064 | * RETURNS: |
846 | * Percpu pointer to the allocated area on success, NULL on failure. | 1065 | * Percpu pointer to the allocated area on success, NULL on failure. |
847 | */ | 1066 | */ |
1067 | void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) | ||
1068 | { | ||
1069 | return pcpu_alloc(size, align, false, gfp); | ||
1070 | } | ||
1071 | EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); | ||
1072 | |||
1073 | /** | ||
1074 | * __alloc_percpu - allocate dynamic percpu area | ||
1075 | * @size: size of area to allocate in bytes | ||
1076 | * @align: alignment of area (max PAGE_SIZE) | ||
1077 | * | ||
1078 | * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). | ||
1079 | */ | ||
848 | void __percpu *__alloc_percpu(size_t size, size_t align) | 1080 | void __percpu *__alloc_percpu(size_t size, size_t align) |
849 | { | 1081 | { |
850 | return pcpu_alloc(size, align, false); | 1082 | return pcpu_alloc(size, align, false, GFP_KERNEL); |
851 | } | 1083 | } |
852 | EXPORT_SYMBOL_GPL(__alloc_percpu); | 1084 | EXPORT_SYMBOL_GPL(__alloc_percpu); |
853 | 1085 | ||
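For context, a minimal caller-side sketch of the new GFP-aware entry point (not part of this patch; it assumes the alloc_percpu_gfp() type wrapper introduced alongside __alloc_percpu_gfp()): a context that cannot sleep passes a gfp mask without GFP_KERNEL and must be prepared for the allocation to fail.

#include <linux/gfp.h>
#include <linux/percpu.h>

/*
 * Sketch only: allocate a per-CPU counter from a context that cannot
 * sleep.  Without GFP_KERNEL in the gfp mask the allocator only serves
 * the request from already-populated pages and never blocks, so NULL is
 * a normal outcome the caller has to handle (e.g. retry later from
 * process context).
 */
static unsigned long __percpu *stats_alloc_nowait(void)
{
	return alloc_percpu_gfp(unsigned long, GFP_NOWAIT);
}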
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); | |||
869 | */ | 1101 | */ |
870 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) | 1102 | void __percpu *__alloc_reserved_percpu(size_t size, size_t align) |
871 | { | 1103 | { |
872 | return pcpu_alloc(size, align, true); | 1104 | return pcpu_alloc(size, align, true, GFP_KERNEL); |
873 | } | 1105 | } |
874 | 1106 | ||
875 | /** | 1107 | /** |
876 | * pcpu_reclaim - reclaim fully free chunks, workqueue function | 1108 | * pcpu_balance_workfn - manage the amount of free chunks and populated pages |
877 | * @work: unused | 1109 | * @work: unused |
878 | * | 1110 | * |
879 | * Reclaim all fully free chunks except for the first one. | 1111 | * Reclaim all fully free chunks except for the first one. |
880 | * | ||
881 | * CONTEXT: | ||
882 | * workqueue context. | ||
883 | */ | 1112 | */ |
884 | static void pcpu_reclaim(struct work_struct *work) | 1113 | static void pcpu_balance_workfn(struct work_struct *work) |
885 | { | 1114 | { |
886 | LIST_HEAD(todo); | 1115 | LIST_HEAD(to_free); |
887 | struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; | 1116 | struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; |
888 | struct pcpu_chunk *chunk, *next; | 1117 | struct pcpu_chunk *chunk, *next; |
1118 | int slot, nr_to_pop, ret; | ||
889 | 1119 | ||
1120 | /* | ||
1121 | * There's no reason to keep around multiple unused chunks and VM | ||
1122 | * areas can be scarce. Destroy all free chunks except for one. | ||
1123 | */ | ||
890 | mutex_lock(&pcpu_alloc_mutex); | 1124 | mutex_lock(&pcpu_alloc_mutex); |
891 | spin_lock_irq(&pcpu_lock); | 1125 | spin_lock_irq(&pcpu_lock); |
892 | 1126 | ||
893 | list_for_each_entry_safe(chunk, next, head, list) { | 1127 | list_for_each_entry_safe(chunk, next, free_head, list) { |
894 | WARN_ON(chunk->immutable); | 1128 | WARN_ON(chunk->immutable); |
895 | 1129 | ||
896 | /* spare the first one */ | 1130 | /* spare the first one */ |
897 | if (chunk == list_first_entry(head, struct pcpu_chunk, list)) | 1131 | if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) |
898 | continue; | 1132 | continue; |
899 | 1133 | ||
900 | list_move(&chunk->list, &todo); | 1134 | list_move(&chunk->list, &to_free); |
901 | } | 1135 | } |
902 | 1136 | ||
903 | spin_unlock_irq(&pcpu_lock); | 1137 | spin_unlock_irq(&pcpu_lock); |
904 | 1138 | ||
905 | list_for_each_entry_safe(chunk, next, &todo, list) { | 1139 | list_for_each_entry_safe(chunk, next, &to_free, list) { |
906 | pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); | 1140 | int rs, re; |
1141 | |||
1142 | pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) { | ||
1143 | pcpu_depopulate_chunk(chunk, rs, re); | ||
1144 | spin_lock_irq(&pcpu_lock); | ||
1145 | pcpu_chunk_depopulated(chunk, rs, re); | ||
1146 | spin_unlock_irq(&pcpu_lock); | ||
1147 | } | ||
907 | pcpu_destroy_chunk(chunk); | 1148 | pcpu_destroy_chunk(chunk); |
908 | } | 1149 | } |
909 | 1150 | ||
1151 | /* | ||
1152 | * Ensure there are certain number of free populated pages for | ||
1153 | * atomic allocs. Fill up from the most packed so that atomic | ||
1154 | * allocs don't increase fragmentation. If atomic allocation | ||
1155 | * failed previously, always populate the maximum amount. This | ||
1156 | * should prevent atomic allocs larger than PAGE_SIZE from keeping | ||
1157 | * failing indefinitely; however, large atomic allocs are not | ||
1158 | * something we support properly and can be highly unreliable and | ||
1159 | * inefficient. | ||
1160 | */ | ||
1161 | retry_pop: | ||
1162 | if (pcpu_atomic_alloc_failed) { | ||
1163 | nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; | ||
1164 | /* best effort anyway, don't worry about synchronization */ | ||
1165 | pcpu_atomic_alloc_failed = false; | ||
1166 | } else { | ||
1167 | nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - | ||
1168 | pcpu_nr_empty_pop_pages, | ||
1169 | 0, PCPU_EMPTY_POP_PAGES_HIGH); | ||
1170 | } | ||
1171 | |||
1172 | for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { | ||
1173 | int nr_unpop = 0, rs, re; | ||
1174 | |||
1175 | if (!nr_to_pop) | ||
1176 | break; | ||
1177 | |||
1178 | spin_lock_irq(&pcpu_lock); | ||
1179 | list_for_each_entry(chunk, &pcpu_slot[slot], list) { | ||
1180 | nr_unpop = pcpu_unit_pages - chunk->nr_populated; | ||
1181 | if (nr_unpop) | ||
1182 | break; | ||
1183 | } | ||
1184 | spin_unlock_irq(&pcpu_lock); | ||
1185 | |||
1186 | if (!nr_unpop) | ||
1187 | continue; | ||
1188 | |||
1189 | /* @chunk can't go away while pcpu_alloc_mutex is held */ | ||
1190 | pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) { | ||
1191 | int nr = min(re - rs, nr_to_pop); | ||
1192 | |||
1193 | ret = pcpu_populate_chunk(chunk, rs, rs + nr); | ||
1194 | if (!ret) { | ||
1195 | nr_to_pop -= nr; | ||
1196 | spin_lock_irq(&pcpu_lock); | ||
1197 | pcpu_chunk_populated(chunk, rs, rs + nr); | ||
1198 | spin_unlock_irq(&pcpu_lock); | ||
1199 | } else { | ||
1200 | nr_to_pop = 0; | ||
1201 | } | ||
1202 | |||
1203 | if (!nr_to_pop) | ||
1204 | break; | ||
1205 | } | ||
1206 | } | ||
1207 | |||
1208 | if (nr_to_pop) { | ||
1209 | /* ran out of chunks to populate, create a new one and retry */ | ||
1210 | chunk = pcpu_create_chunk(); | ||
1211 | if (chunk) { | ||
1212 | spin_lock_irq(&pcpu_lock); | ||
1213 | pcpu_chunk_relocate(chunk, -1); | ||
1214 | spin_unlock_irq(&pcpu_lock); | ||
1215 | goto retry_pop; | ||
1216 | } | ||
1217 | } | ||
1218 | |||
910 | mutex_unlock(&pcpu_alloc_mutex); | 1219 | mutex_unlock(&pcpu_alloc_mutex); |
911 | } | 1220 | } |
912 | 1221 | ||
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr) | |||
924 | void *addr; | 1233 | void *addr; |
925 | struct pcpu_chunk *chunk; | 1234 | struct pcpu_chunk *chunk; |
926 | unsigned long flags; | 1235 | unsigned long flags; |
927 | int off; | 1236 | int off, occ_pages; |
928 | 1237 | ||
929 | if (!ptr) | 1238 | if (!ptr) |
930 | return; | 1239 | return; |
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr) | |||
938 | chunk = pcpu_chunk_addr_search(addr); | 1247 | chunk = pcpu_chunk_addr_search(addr); |
939 | off = addr - chunk->base_addr; | 1248 | off = addr - chunk->base_addr; |
940 | 1249 | ||
941 | pcpu_free_area(chunk, off); | 1250 | pcpu_free_area(chunk, off, &occ_pages); |
1251 | |||
1252 | if (chunk != pcpu_reserved_chunk) | ||
1253 | pcpu_nr_empty_pop_pages += occ_pages; | ||
942 | 1254 | ||
943 | /* if there are more than one fully free chunks, wake up grim reaper */ | 1255 | /* if there are more than one fully free chunks, wake up grim reaper */ |
944 | if (chunk->free_size == pcpu_unit_size) { | 1256 | if (chunk->free_size == pcpu_unit_size) { |
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr) | |||
946 | 1258 | ||
947 | list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) | 1259 | list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) |
948 | if (pos != chunk) { | 1260 | if (pos != chunk) { |
949 | schedule_work(&pcpu_reclaim_work); | 1261 | pcpu_schedule_balance_work(); |
950 | break; | 1262 | break; |
951 | } | 1263 | } |
952 | } | 1264 | } |
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1336 | */ | 1648 | */ |
1337 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); | 1649 | schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1338 | INIT_LIST_HEAD(&schunk->list); | 1650 | INIT_LIST_HEAD(&schunk->list); |
1651 | INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); | ||
1339 | schunk->base_addr = base_addr; | 1652 | schunk->base_addr = base_addr; |
1340 | schunk->map = smap; | 1653 | schunk->map = smap; |
1341 | schunk->map_alloc = ARRAY_SIZE(smap); | 1654 | schunk->map_alloc = ARRAY_SIZE(smap); |
1342 | schunk->immutable = true; | 1655 | schunk->immutable = true; |
1343 | bitmap_fill(schunk->populated, pcpu_unit_pages); | 1656 | bitmap_fill(schunk->populated, pcpu_unit_pages); |
1657 | schunk->nr_populated = pcpu_unit_pages; | ||
1344 | 1658 | ||
1345 | if (ai->reserved_size) { | 1659 | if (ai->reserved_size) { |
1346 | schunk->free_size = ai->reserved_size; | 1660 | schunk->free_size = ai->reserved_size; |
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1364 | if (dyn_size) { | 1678 | if (dyn_size) { |
1365 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); | 1679 | dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); |
1366 | INIT_LIST_HEAD(&dchunk->list); | 1680 | INIT_LIST_HEAD(&dchunk->list); |
1681 | INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); | ||
1367 | dchunk->base_addr = base_addr; | 1682 | dchunk->base_addr = base_addr; |
1368 | dchunk->map = dmap; | 1683 | dchunk->map = dmap; |
1369 | dchunk->map_alloc = ARRAY_SIZE(dmap); | 1684 | dchunk->map_alloc = ARRAY_SIZE(dmap); |
1370 | dchunk->immutable = true; | 1685 | dchunk->immutable = true; |
1371 | bitmap_fill(dchunk->populated, pcpu_unit_pages); | 1686 | bitmap_fill(dchunk->populated, pcpu_unit_pages); |
1687 | dchunk->nr_populated = pcpu_unit_pages; | ||
1372 | 1688 | ||
1373 | dchunk->contig_hint = dchunk->free_size = dyn_size; | 1689 | dchunk->contig_hint = dchunk->free_size = dyn_size; |
1374 | dchunk->map[0] = 1; | 1690 | dchunk->map[0] = 1; |
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, | |||
1379 | 1695 | ||
1380 | /* link the first chunk in */ | 1696 | /* link the first chunk in */ |
1381 | pcpu_first_chunk = dchunk ?: schunk; | 1697 | pcpu_first_chunk = dchunk ?: schunk; |
1698 | pcpu_nr_empty_pop_pages += | ||
1699 | pcpu_count_occupied_pages(pcpu_first_chunk, 1); | ||
1382 | pcpu_chunk_relocate(pcpu_first_chunk, -1); | 1700 | pcpu_chunk_relocate(pcpu_first_chunk, -1); |
1383 | 1701 | ||
1384 | /* we're done */ | 1702 | /* we're done */ |
@@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void) | |||
1932 | 2250 | ||
1933 | if (pcpu_setup_first_chunk(ai, fc) < 0) | 2251 | if (pcpu_setup_first_chunk(ai, fc) < 0) |
1934 | panic("Failed to initialize percpu areas."); | 2252 | panic("Failed to initialize percpu areas."); |
1935 | |||
1936 | pcpu_free_alloc_info(ai); | ||
1937 | } | 2253 | } |
1938 | 2254 | ||
1939 | #endif /* CONFIG_SMP */ | 2255 | #endif /* CONFIG_SMP */ |
@@ -1967,3 +2283,15 @@ void __init percpu_init_late(void) | |||
1967 | spin_unlock_irqrestore(&pcpu_lock, flags); | 2283 | spin_unlock_irqrestore(&pcpu_lock, flags); |
1968 | } | 2284 | } |
1969 | } | 2285 | } |
2286 | |||
2287 | /* | ||
2288 | * Percpu allocator is initialized early during boot when neither slab or | ||
2289 | * workqueue is available. Plug async management until everything is up | ||
2290 | * and running. | ||
2291 | */ | ||
2292 | static int __init percpu_enable_async(void) | ||
2293 | { | ||
2294 | pcpu_async_enabled = true; | ||
2295 | return 0; | ||
2296 | } | ||
2297 | subsys_initcall(percpu_enable_async); | ||
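The initcall above is effective because every asynchronous kick in this patch goes through pcpu_schedule_balance_work(); its body is not part of this hunk, but it is assumed to look roughly like the sketch below, so nothing is queued before workqueues exist.

/* assumed shape of the helper used by pcpu_alloc() and free_percpu() */
static void pcpu_schedule_balance_work(void)
{
	if (pcpu_async_enabled)
		schedule_work(&pcpu_balance_work);
}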
diff --git a/mm/shmem.c b/mm/shmem.c index 4fad61bb41e5..cd6fc7590e54 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -2995,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) | |||
2995 | #endif | 2995 | #endif |
2996 | 2996 | ||
2997 | spin_lock_init(&sbinfo->stat_lock); | 2997 | spin_lock_init(&sbinfo->stat_lock); |
2998 | if (percpu_counter_init(&sbinfo->used_blocks, 0)) | 2998 | if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) |
2999 | goto failed; | 2999 | goto failed; |
3000 | sbinfo->free_inodes = sbinfo->max_inodes; | 3000 | sbinfo->free_inodes = sbinfo->max_inodes; |
3001 | 3001 | ||
diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 97b0fcc79547..5ab6627cf370 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c | |||
@@ -1115,7 +1115,7 @@ static int __init dccp_init(void) | |||
1115 | 1115 | ||
1116 | BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > | 1116 | BUILD_BUG_ON(sizeof(struct dccp_skb_cb) > |
1117 | FIELD_SIZEOF(struct sk_buff, cb)); | 1117 | FIELD_SIZEOF(struct sk_buff, cb)); |
1118 | rc = percpu_counter_init(&dccp_orphan_count, 0); | 1118 | rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL); |
1119 | if (rc) | 1119 | if (rc) |
1120 | goto out_fail; | 1120 | goto out_fail; |
1121 | rc = -ENOBUFS; | 1121 | rc = -ENOBUFS; |
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 461003d258ba..86023b9be47f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c | |||
@@ -3071,8 +3071,8 @@ void __init tcp_init(void) | |||
3071 | 3071 | ||
3072 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); | 3072 | BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); |
3073 | 3073 | ||
3074 | percpu_counter_init(&tcp_sockets_allocated, 0); | 3074 | percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL); |
3075 | percpu_counter_init(&tcp_orphan_count, 0); | 3075 | percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL); |
3076 | tcp_hashinfo.bind_bucket_cachep = | 3076 | tcp_hashinfo.bind_bucket_cachep = |
3077 | kmem_cache_create("tcp_bind_bucket", | 3077 | kmem_cache_create("tcp_bind_bucket", |
3078 | sizeof(struct inet_bind_bucket), 0, | 3078 | sizeof(struct inet_bind_bucket), 0, |
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c index 3af522622fad..1d191357bf88 100644 --- a/net/ipv4/tcp_memcontrol.c +++ b/net/ipv4/tcp_memcontrol.c | |||
@@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss) | |||
32 | res_parent = &parent_cg->memory_allocated; | 32 | res_parent = &parent_cg->memory_allocated; |
33 | 33 | ||
34 | res_counter_init(&cg_proto->memory_allocated, res_parent); | 34 | res_counter_init(&cg_proto->memory_allocated, res_parent); |
35 | percpu_counter_init(&cg_proto->sockets_allocated, 0); | 35 | percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL); |
36 | 36 | ||
37 | return 0; | 37 | return 0; |
38 | } | 38 | } |
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 9d2c6c9facb6..8f34b27d5775 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c | |||
@@ -1341,7 +1341,7 @@ static __init int sctp_init(void) | |||
1341 | if (!sctp_chunk_cachep) | 1341 | if (!sctp_chunk_cachep) |
1342 | goto err_chunk_cachep; | 1342 | goto err_chunk_cachep; |
1343 | 1343 | ||
1344 | status = percpu_counter_init(&sctp_sockets_allocated, 0); | 1344 | status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL); |
1345 | if (status) | 1345 | if (status) |
1346 | goto err_percpu_counter_init; | 1346 | goto err_percpu_counter_init; |
1347 | 1347 | ||
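The remaining call-site changes all follow one pattern: percpu_counter_init() now takes the gfp mask used for its internal per-CPU allocation, and callers that can sleep simply pass GFP_KERNEL. A sketch of the updated usage with an illustrative counter (my_used_blocks and the init/exit functions are not from this patch):

#include <linux/gfp.h>
#include <linux/percpu_counter.h>

static struct percpu_counter my_used_blocks;	/* illustrative only */

static int __init my_counter_init(void)
{
	int err;

	/* the third argument is the new gfp parameter for the internal allocation */
	err = percpu_counter_init(&my_used_blocks, 0, GFP_KERNEL);
	if (err)
		return err;

	percpu_counter_inc(&my_used_blocks);
	return 0;
}

static void my_counter_exit(void)
{
	percpu_counter_destroy(&my_used_blocks);
}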