author	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-10 07:26:02 -0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2014-10-10 07:26:02 -0400
commit	c798360cd1438090d51eeaa8e67985da11362eba (patch)
tree	0107d3b9ee7476264c3357287787d393545bd2d9
parent	b211e9d7c861bdb37b86d6384da9edfb80949ceb (diff)
parent	6ae833c7fe0c6ef1f0ab13cc775da230d6f4c256 (diff)
Merge branch 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu
Pull percpu updates from Tejun Heo:
 "A lot of activities on percpu front.  Notable changes are...

  - percpu allocator now can take @gfp.  If @gfp doesn't contain
    GFP_KERNEL, it tries to allocate from what's already available to
    the allocator and a work item tries to keep the reserve around
    certain level so that these atomic allocations usually succeed.

    This will replace the ad-hoc percpu memory pool used by
    blk-throttle and also be used by the planned blkcg support for
    writeback IOs.

    Please note that I noticed a bug in how @gfp is interpreted while
    preparing this pull request and applied the fix 6ae833c7fe0c
    ("percpu: fix how @gfp is interpreted by the percpu allocator")
    just now.

  - percpu_ref now uses longs for percpu and global counters instead
    of ints.  It leads to more sparse packing of the percpu counters
    on 64bit machines but the overhead should be negligible and this
    allows using percpu_ref for refcnting pages and in-memory objects
    directly.

  - The switching between percpu and single counter modes of a
    percpu_ref is made independent of putting the base ref and a
    percpu_ref can now optionally be initialized in single or killed
    mode.  This allows avoiding percpu shutdown latency for cases
    where the refcounted objects may be synchronously created and
    destroyed in rapid succession with only a fraction of them
    reaching fully operational status (SCSI probing does this when
    combined with blk-mq support).

    It's also planned to be used to implement forced single mode to
    detect underflow more timely for debugging.

  There's a separate branch percpu/for-3.18-consistent-ops which
  cleans up the duplicate percpu accessors.  That branch causes a
  number of conflicts with s390 and other trees.  I'll send a
  separate pull request w/ resolutions once other branches are
  merged"

* 'for-3.18' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/percpu: (33 commits)
  percpu: fix how @gfp is interpreted by the percpu allocator
  blk-mq, percpu_ref: start q->mq_usage_counter in atomic mode
  percpu_ref: make INIT_ATOMIC and switch_to_atomic() sticky
  percpu_ref: add PERCPU_REF_INIT_* flags
  percpu_ref: decouple switching to percpu mode and reinit
  percpu_ref: decouple switching to atomic mode and killing
  percpu_ref: add PCPU_REF_DEAD
  percpu_ref: rename things to prepare for decoupling percpu/atomic mode switch
  percpu_ref: replace pcpu_ prefix with percpu_
  percpu_ref: minor code and comment updates
  percpu_ref: relocate percpu_ref_reinit()
  Revert "blk-mq, percpu_ref: implement a kludge for SCSI blk-mq stall during probe"
  Revert "percpu: free percpu allocation info for uniprocessor system"
  percpu-refcount: make percpu_ref based on longs instead of ints
  percpu-refcount: improve WARN messages
  percpu: fix locking regression in the failure path of pcpu_alloc()
  percpu-refcount: add @gfp to percpu_ref_init()
  proportions: add @gfp to init functions
  percpu_counter: add @gfp to percpu_counter_init()
  percpu_counter: make percpu_counters_lock irq-safe
  ...
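For illustration, below is a minimal sketch of how a caller adapts to the interfaces updated by this merge: percpu_ref_init() now takes PERCPU_REF_INIT_* flags and a gfp mask, percpu_counter_init() takes a gfp mask, and percpu_ref_switch_to_percpu() promotes an atomic-mode ref to percpu operation. "struct foo", foo_release() and foo_init() are hypothetical names for the example, not code from this tree.

	/*
	 * Hypothetical caller of the updated percpu_ref / percpu_counter
	 * interfaces; only the function signatures come from this merge.
	 */
	#include <linux/percpu-refcount.h>
	#include <linux/percpu_counter.h>
	#include <linux/gfp.h>

	struct foo {
		struct percpu_ref	ref;
		struct percpu_counter	nr_items;
	};

	static void foo_release(struct percpu_ref *ref)
	{
		/* called once the refcount reaches zero; must not sleep */
	}

	static int foo_init(struct foo *f)
	{
		int ret;

		/*
		 * Start in atomic mode so that tearing the object down
		 * shortly after creation doesn't pay the percpu shutdown
		 * (RCU) latency; cf. the blk-mq change in this merge.
		 */
		ret = percpu_ref_init(&f->ref, foo_release,
				      PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
		if (ret)
			return ret;

		/* percpu_counter_init() now also takes a gfp mask */
		ret = percpu_counter_init(&f->nr_items, 0, GFP_KERNEL);
		if (ret) {
			percpu_ref_exit(&f->ref);
			return ret;
		}

		return 0;
	}

	static void foo_make_fully_operational(struct foo *f)
	{
		/* switch to cheap percpu operation once the object is long-lived */
		percpu_ref_switch_to_percpu(&f->ref);
	}

Initializing in atomic mode keeps teardown of short-lived objects cheap; objects that become long-lived switch to percpu mode afterwards, which mirrors how blk-mq handles q->mq_usage_counter in the diff below.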
-rw-r--r--	arch/x86/kvm/mmu.c	2
-rw-r--r--	block/blk-mq-sysfs.c	6
-rw-r--r--	block/blk-mq.c	18
-rw-r--r--	block/blk-sysfs.c	11
-rw-r--r--	drivers/target/target_core_tpg.c	3
-rw-r--r--	fs/aio.c	4
-rw-r--r--	fs/btrfs/disk-io.c	8
-rw-r--r--	fs/btrfs/extent-tree.c	2
-rw-r--r--	fs/ext2/super.c	6
-rw-r--r--	fs/ext3/super.c	6
-rw-r--r--	fs/ext4/super.c	14
-rw-r--r--	fs/file_table.c	2
-rw-r--r--	fs/quota/dquot.c	2
-rw-r--r--	fs/super.c	3
-rw-r--r--	include/linux/blk-mq.h	1
-rw-r--r--	include/linux/flex_proportions.h	5
-rw-r--r--	include/linux/percpu-refcount.h	122
-rw-r--r--	include/linux/percpu.h	13
-rw-r--r--	include/linux/percpu_counter.h	10
-rw-r--r--	include/linux/proportions.h	5
-rw-r--r--	include/net/dst_ops.h	2
-rw-r--r--	include/net/inet_frag.h	2
-rw-r--r--	kernel/cgroup.c	7
-rw-r--r--	lib/flex_proportions.c	8
-rw-r--r--	lib/percpu-refcount.c	305
-rw-r--r--	lib/percpu_counter.c	20
-rw-r--r--	lib/proportions.c	10
-rw-r--r--	mm/backing-dev.c	4
-rw-r--r--	mm/mmap.c	2
-rw-r--r--	mm/nommu.c	2
-rw-r--r--	mm/page-writeback.c	2
-rw-r--r--	mm/percpu-km.c	16
-rw-r--r--	mm/percpu-vm.c	162
-rw-r--r--	mm/percpu.c	526
-rw-r--r--	mm/shmem.c	2
-rw-r--r--	net/dccp/proto.c	2
-rw-r--r--	net/ipv4/tcp.c	4
-rw-r--r--	net/ipv4/tcp_memcontrol.c	2
-rw-r--r--	net/sctp/protocol.c	2
39 files changed, 879 insertions(+), 444 deletions(-)
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 3201e93ebd07..ac1c4de3a484 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4549,7 +4549,7 @@ int kvm_mmu_module_init(void)
4549 if (!mmu_page_header_cache) 4549 if (!mmu_page_header_cache)
4550 goto nomem; 4550 goto nomem;
4551 4551
4552 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0)) 4552 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
4553 goto nomem; 4553 goto nomem;
4554 4554
4555 register_shrinker(&mmu_shrinker); 4555 register_shrinker(&mmu_shrinker);
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index ed5217867555..371d8800b48a 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -402,6 +402,12 @@ static void blk_mq_sysfs_init(struct request_queue *q)
402 } 402 }
403} 403}
404 404
405/* see blk_register_queue() */
406void blk_mq_finish_init(struct request_queue *q)
407{
408 percpu_ref_switch_to_percpu(&q->mq_usage_counter);
409}
410
405int blk_mq_register_disk(struct gendisk *disk) 411int blk_mq_register_disk(struct gendisk *disk)
406{ 412{
407 struct device *dev = disk_to_dev(disk); 413 struct device *dev = disk_to_dev(disk);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index df8e1e09dd17..38f4a165640d 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -119,16 +119,7 @@ void blk_mq_freeze_queue(struct request_queue *q)
119 spin_unlock_irq(q->queue_lock); 119 spin_unlock_irq(q->queue_lock);
120 120
121 if (freeze) { 121 if (freeze) {
122 /* 122 percpu_ref_kill(&q->mq_usage_counter);
123 * XXX: Temporary kludge to work around SCSI blk-mq stall.
124 * SCSI synchronously creates and destroys many queues
125 * back-to-back during probe leading to lengthy stalls.
126 * This will be fixed by keeping ->mq_usage_counter in
127 * atomic mode until genhd registration, but, for now,
128 * let's work around using expedited synchronization.
129 */
130 __percpu_ref_kill_expedited(&q->mq_usage_counter);
131
132 blk_mq_run_queues(q, false); 123 blk_mq_run_queues(q, false);
133 } 124 }
134 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); 125 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
@@ -1804,7 +1795,12 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1804 if (!q) 1795 if (!q)
1805 goto err_hctxs; 1796 goto err_hctxs;
1806 1797
1807 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release)) 1798 /*
1799 * Init percpu_ref in atomic mode so that it's faster to shutdown.
1800 * See blk_register_queue() for details.
1801 */
1802 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1803 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1808 goto err_map; 1804 goto err_map;
1809 1805
1810 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1806 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 17f5c84ce7bf..521ae9089c50 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -551,12 +551,19 @@ int blk_register_queue(struct gendisk *disk)
551 return -ENXIO; 551 return -ENXIO;
552 552
553 /* 553 /*
554 * Initialization must be complete by now. Finish the initial 554 * SCSI probing may synchronously create and destroy a lot of
555 * bypass from queue allocation. 555 * request_queues for non-existent devices. Shutting down a fully
556 * functional queue takes measureable wallclock time as RCU grace
557 * periods are involved. To avoid excessive latency in these
558 * cases, a request_queue starts out in a degraded mode which is
559 * faster to shut down and is made fully functional here as
560 * request_queues for non-existent devices never get registered.
556 */ 561 */
557 if (!blk_queue_init_done(q)) { 562 if (!blk_queue_init_done(q)) {
558 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); 563 queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q);
559 blk_queue_bypass_end(q); 564 blk_queue_bypass_end(q);
565 if (q->mq_ops)
566 blk_mq_finish_init(q);
560 } 567 }
561 568
562 ret = blk_trace_init_sysfs(dev); 569 ret = blk_trace_init_sysfs(dev);
diff --git a/drivers/target/target_core_tpg.c b/drivers/target/target_core_tpg.c
index fddfae61222f..be783f717f19 100644
--- a/drivers/target/target_core_tpg.c
+++ b/drivers/target/target_core_tpg.c
@@ -819,7 +819,8 @@ int core_tpg_add_lun(
819{ 819{
820 int ret; 820 int ret;
821 821
822 ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release); 822 ret = percpu_ref_init(&lun->lun_ref, core_tpg_lun_ref_release, 0,
823 GFP_KERNEL);
823 if (ret < 0) 824 if (ret < 0)
824 return ret; 825 return ret;
825 826
diff --git a/fs/aio.c b/fs/aio.c
index 733750096b71..84a751005f5b 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
661 661
662 INIT_LIST_HEAD(&ctx->active_reqs); 662 INIT_LIST_HEAD(&ctx->active_reqs);
663 663
664 if (percpu_ref_init(&ctx->users, free_ioctx_users)) 664 if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL))
665 goto err; 665 goto err;
666 666
667 if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) 667 if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL))
668 goto err; 668 goto err;
669 669
670 ctx->cpu = alloc_percpu(struct kioctx_cpu); 670 ctx->cpu = alloc_percpu(struct kioctx_cpu);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a1d36e62179c..d0d78dc07792 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1183,7 +1183,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void)
1183 if (!writers) 1183 if (!writers)
1184 return ERR_PTR(-ENOMEM); 1184 return ERR_PTR(-ENOMEM);
1185 1185
1186 ret = percpu_counter_init(&writers->counter, 0); 1186 ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL);
1187 if (ret < 0) { 1187 if (ret < 0) {
1188 kfree(writers); 1188 kfree(writers);
1189 return ERR_PTR(ret); 1189 return ERR_PTR(ret);
@@ -2188,7 +2188,7 @@ int open_ctree(struct super_block *sb,
2188 goto fail_srcu; 2188 goto fail_srcu;
2189 } 2189 }
2190 2190
2191 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); 2191 ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL);
2192 if (ret) { 2192 if (ret) {
2193 err = ret; 2193 err = ret;
2194 goto fail_bdi; 2194 goto fail_bdi;
@@ -2196,13 +2196,13 @@ int open_ctree(struct super_block *sb,
2196 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * 2196 fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE *
2197 (1 + ilog2(nr_cpu_ids)); 2197 (1 + ilog2(nr_cpu_ids));
2198 2198
2199 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); 2199 ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL);
2200 if (ret) { 2200 if (ret) {
2201 err = ret; 2201 err = ret;
2202 goto fail_dirty_metadata_bytes; 2202 goto fail_dirty_metadata_bytes;
2203 } 2203 }
2204 2204
2205 ret = percpu_counter_init(&fs_info->bio_counter, 0); 2205 ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
2206 if (ret) { 2206 if (ret) {
2207 err = ret; 2207 err = ret;
2208 goto fail_delalloc_bytes; 2208 goto fail_delalloc_bytes;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3efe1c3877bf..caaf015d6e4b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3494 if (!found) 3494 if (!found)
3495 return -ENOMEM; 3495 return -ENOMEM;
3496 3496
3497 ret = percpu_counter_init(&found->total_bytes_pinned, 0); 3497 ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL);
3498 if (ret) { 3498 if (ret) {
3499 kfree(found); 3499 kfree(found);
3500 return ret; 3500 return ret;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index b88edc05c230..170dc41e8bf4 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); 1067 ext2_rsv_window_add(sb, &sbi->s_rsv_window_head);
1068 1068
1069 err = percpu_counter_init(&sbi->s_freeblocks_counter, 1069 err = percpu_counter_init(&sbi->s_freeblocks_counter,
1070 ext2_count_free_blocks(sb)); 1070 ext2_count_free_blocks(sb), GFP_KERNEL);
1071 if (!err) { 1071 if (!err) {
1072 err = percpu_counter_init(&sbi->s_freeinodes_counter, 1072 err = percpu_counter_init(&sbi->s_freeinodes_counter,
1073 ext2_count_free_inodes(sb)); 1073 ext2_count_free_inodes(sb), GFP_KERNEL);
1074 } 1074 }
1075 if (!err) { 1075 if (!err) {
1076 err = percpu_counter_init(&sbi->s_dirs_counter, 1076 err = percpu_counter_init(&sbi->s_dirs_counter,
1077 ext2_count_dirs(sb)); 1077 ext2_count_dirs(sb), GFP_KERNEL);
1078 } 1078 }
1079 if (err) { 1079 if (err) {
1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory"); 1080 ext2_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 622e88249024..bb0fdacad058 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
2039 goto failed_mount2; 2039 goto failed_mount2;
2040 } 2040 }
2041 err = percpu_counter_init(&sbi->s_freeblocks_counter, 2041 err = percpu_counter_init(&sbi->s_freeblocks_counter,
2042 ext3_count_free_blocks(sb)); 2042 ext3_count_free_blocks(sb), GFP_KERNEL);
2043 if (!err) { 2043 if (!err) {
2044 err = percpu_counter_init(&sbi->s_freeinodes_counter, 2044 err = percpu_counter_init(&sbi->s_freeinodes_counter,
2045 ext3_count_free_inodes(sb)); 2045 ext3_count_free_inodes(sb), GFP_KERNEL);
2046 } 2046 }
2047 if (!err) { 2047 if (!err) {
2048 err = percpu_counter_init(&sbi->s_dirs_counter, 2048 err = percpu_counter_init(&sbi->s_dirs_counter,
2049 ext3_count_dirs(sb)); 2049 ext3_count_dirs(sb), GFP_KERNEL);
2050 } 2050 }
2051 if (err) { 2051 if (err) {
2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory"); 2052 ext3_msg(sb, KERN_ERR, "error: insufficient memory");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 0b28b36e7915..05c159218bc2 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3892,7 +3892,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3892 /* Register extent status tree shrinker */ 3892 /* Register extent status tree shrinker */
3893 ext4_es_register_shrinker(sbi); 3893 ext4_es_register_shrinker(sbi);
3894 3894
3895 if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { 3895 err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
3896 if (err) {
3896 ext4_msg(sb, KERN_ERR, "insufficient memory"); 3897 ext4_msg(sb, KERN_ERR, "insufficient memory");
3897 goto failed_mount3; 3898 goto failed_mount3;
3898 } 3899 }
@@ -4106,17 +4107,20 @@ no_journal:
4106 block = ext4_count_free_clusters(sb); 4107 block = ext4_count_free_clusters(sb);
4107 ext4_free_blocks_count_set(sbi->s_es, 4108 ext4_free_blocks_count_set(sbi->s_es,
4108 EXT4_C2B(sbi, block)); 4109 EXT4_C2B(sbi, block));
4109 err = percpu_counter_init(&sbi->s_freeclusters_counter, block); 4110 err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
4111 GFP_KERNEL);
4110 if (!err) { 4112 if (!err) {
4111 unsigned long freei = ext4_count_free_inodes(sb); 4113 unsigned long freei = ext4_count_free_inodes(sb);
4112 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); 4114 sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
4113 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); 4115 err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
4116 GFP_KERNEL);
4114 } 4117 }
4115 if (!err) 4118 if (!err)
4116 err = percpu_counter_init(&sbi->s_dirs_counter, 4119 err = percpu_counter_init(&sbi->s_dirs_counter,
4117 ext4_count_dirs(sb)); 4120 ext4_count_dirs(sb), GFP_KERNEL);
4118 if (!err) 4121 if (!err)
4119 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); 4122 err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0,
4123 GFP_KERNEL);
4120 if (err) { 4124 if (err) {
4121 ext4_msg(sb, KERN_ERR, "insufficient memory"); 4125 ext4_msg(sb, KERN_ERR, "insufficient memory");
4122 goto failed_mount6; 4126 goto failed_mount6;
diff --git a/fs/file_table.c b/fs/file_table.c
index 385bfd31512a..0bab12b20460 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -331,5 +331,5 @@ void __init files_init(unsigned long mempages)
331 331
332 n = (mempages * (PAGE_SIZE / 1024)) / 10; 332 n = (mempages * (PAGE_SIZE / 1024)) / 10;
333 files_stat.max_files = max_t(unsigned long, n, NR_FILE); 333 files_stat.max_files = max_t(unsigned long, n, NR_FILE);
334 percpu_counter_init(&nr_files, 0); 334 percpu_counter_init(&nr_files, 0, GFP_KERNEL);
335} 335}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index f2d0eee9d1f1..8b663b2d9562 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -2725,7 +2725,7 @@ static int __init dquot_init(void)
2725 panic("Cannot create dquot hash table"); 2725 panic("Cannot create dquot hash table");
2726 2726
2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) { 2727 for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
2728 ret = percpu_counter_init(&dqstats.counter[i], 0); 2728 ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL);
2729 if (ret) 2729 if (ret)
2730 panic("Cannot create dquot stat counters"); 2730 panic("Cannot create dquot stat counters");
2731 } 2731 }
diff --git a/fs/super.c b/fs/super.c
index b9a214d2fe98..1b836107acee 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -175,7 +175,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
175 goto fail; 175 goto fail;
176 176
177 for (i = 0; i < SB_FREEZE_LEVELS; i++) { 177 for (i = 0; i < SB_FREEZE_LEVELS; i++) {
178 if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) 178 if (percpu_counter_init(&s->s_writers.counter[i], 0,
179 GFP_KERNEL) < 0)
179 goto fail; 180 goto fail;
180 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], 181 lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i],
181 &type->s_writers_key[i], 0); 182 &type->s_writers_key[i], 0);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index a1e31f274fcd..c13a0c09faea 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -140,6 +140,7 @@ enum {
140}; 140};
141 141
142struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); 142struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
143void blk_mq_finish_init(struct request_queue *q);
143int blk_mq_register_disk(struct gendisk *); 144int blk_mq_register_disk(struct gendisk *);
144void blk_mq_unregister_disk(struct gendisk *); 145void blk_mq_unregister_disk(struct gendisk *);
145 146
diff --git a/include/linux/flex_proportions.h b/include/linux/flex_proportions.h
index 4ebc49fae391..0d348e011a6e 100644
--- a/include/linux/flex_proportions.h
+++ b/include/linux/flex_proportions.h
@@ -10,6 +10,7 @@
10#include <linux/percpu_counter.h> 10#include <linux/percpu_counter.h>
11#include <linux/spinlock.h> 11#include <linux/spinlock.h>
12#include <linux/seqlock.h> 12#include <linux/seqlock.h>
13#include <linux/gfp.h>
13 14
14/* 15/*
15 * When maximum proportion of some event type is specified, this is the 16 * When maximum proportion of some event type is specified, this is the
@@ -32,7 +33,7 @@ struct fprop_global {
32 seqcount_t sequence; 33 seqcount_t sequence;
33}; 34};
34 35
35int fprop_global_init(struct fprop_global *p); 36int fprop_global_init(struct fprop_global *p, gfp_t gfp);
36void fprop_global_destroy(struct fprop_global *p); 37void fprop_global_destroy(struct fprop_global *p);
37bool fprop_new_period(struct fprop_global *p, int periods); 38bool fprop_new_period(struct fprop_global *p, int periods);
38 39
@@ -79,7 +80,7 @@ struct fprop_local_percpu {
79 raw_spinlock_t lock; /* Protect period and numerator */ 80 raw_spinlock_t lock; /* Protect period and numerator */
80}; 81};
81 82
82int fprop_local_init_percpu(struct fprop_local_percpu *pl); 83int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp);
83void fprop_local_destroy_percpu(struct fprop_local_percpu *pl); 84void fprop_local_destroy_percpu(struct fprop_local_percpu *pl);
84void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl); 85void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl);
85void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl, 86void __fprop_inc_percpu_max(struct fprop_global *p, struct fprop_local_percpu *pl,
diff --git a/include/linux/percpu-refcount.h b/include/linux/percpu-refcount.h
index 68a64f11ce02..d5c89e0dd0e6 100644
--- a/include/linux/percpu-refcount.h
+++ b/include/linux/percpu-refcount.h
@@ -13,7 +13,7 @@
13 * 13 *
14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less 14 * The refcount will have a range of 0 to ((1U << 31) - 1), i.e. one bit less
15 * than an atomic_t - this is because of the way shutdown works, see 15 * than an atomic_t - this is because of the way shutdown works, see
16 * percpu_ref_kill()/PCPU_COUNT_BIAS. 16 * percpu_ref_kill()/PERCPU_COUNT_BIAS.
17 * 17 *
18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the 18 * Before you call percpu_ref_kill(), percpu_ref_put() does not check for the
19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill() 19 * refcount hitting 0 - it can't, if it was in percpu mode. percpu_ref_kill()
@@ -49,29 +49,60 @@
49#include <linux/kernel.h> 49#include <linux/kernel.h>
50#include <linux/percpu.h> 50#include <linux/percpu.h>
51#include <linux/rcupdate.h> 51#include <linux/rcupdate.h>
52#include <linux/gfp.h>
52 53
53struct percpu_ref; 54struct percpu_ref;
54typedef void (percpu_ref_func_t)(struct percpu_ref *); 55typedef void (percpu_ref_func_t)(struct percpu_ref *);
55 56
57/* flags set in the lower bits of percpu_ref->percpu_count_ptr */
58enum {
59 __PERCPU_REF_ATOMIC = 1LU << 0, /* operating in atomic mode */
60 __PERCPU_REF_DEAD = 1LU << 1, /* (being) killed */
61 __PERCPU_REF_ATOMIC_DEAD = __PERCPU_REF_ATOMIC | __PERCPU_REF_DEAD,
62
63 __PERCPU_REF_FLAG_BITS = 2,
64};
65
66/* @flags for percpu_ref_init() */
67enum {
68 /*
69 * Start w/ ref == 1 in atomic mode. Can be switched to percpu
70 * operation using percpu_ref_switch_to_percpu(). If initialized
71 * with this flag, the ref will stay in atomic mode until
72 * percpu_ref_switch_to_percpu() is invoked on it.
73 */
74 PERCPU_REF_INIT_ATOMIC = 1 << 0,
75
76 /*
77 * Start dead w/ ref == 0 in atomic mode. Must be revived with
78 * percpu_ref_reinit() before used. Implies INIT_ATOMIC.
79 */
80 PERCPU_REF_INIT_DEAD = 1 << 1,
81};
82
56struct percpu_ref { 83struct percpu_ref {
57 atomic_t count; 84 atomic_long_t count;
58 /* 85 /*
59 * The low bit of the pointer indicates whether the ref is in percpu 86 * The low bit of the pointer indicates whether the ref is in percpu
60 * mode; if set, then get/put will manipulate the atomic_t. 87 * mode; if set, then get/put will manipulate the atomic_t.
61 */ 88 */
62 unsigned long pcpu_count_ptr; 89 unsigned long percpu_count_ptr;
63 percpu_ref_func_t *release; 90 percpu_ref_func_t *release;
64 percpu_ref_func_t *confirm_kill; 91 percpu_ref_func_t *confirm_switch;
92 bool force_atomic:1;
65 struct rcu_head rcu; 93 struct rcu_head rcu;
66}; 94};
67 95
68int __must_check percpu_ref_init(struct percpu_ref *ref, 96int __must_check percpu_ref_init(struct percpu_ref *ref,
69 percpu_ref_func_t *release); 97 percpu_ref_func_t *release, unsigned int flags,
70void percpu_ref_reinit(struct percpu_ref *ref); 98 gfp_t gfp);
71void percpu_ref_exit(struct percpu_ref *ref); 99void percpu_ref_exit(struct percpu_ref *ref);
100void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
101 percpu_ref_func_t *confirm_switch);
102void percpu_ref_switch_to_percpu(struct percpu_ref *ref);
72void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 103void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
73 percpu_ref_func_t *confirm_kill); 104 percpu_ref_func_t *confirm_kill);
74void __percpu_ref_kill_expedited(struct percpu_ref *ref); 105void percpu_ref_reinit(struct percpu_ref *ref);
75 106
76/** 107/**
77 * percpu_ref_kill - drop the initial ref 108 * percpu_ref_kill - drop the initial ref
@@ -88,26 +119,24 @@ static inline void percpu_ref_kill(struct percpu_ref *ref)
88 return percpu_ref_kill_and_confirm(ref, NULL); 119 return percpu_ref_kill_and_confirm(ref, NULL);
89} 120}
90 121
91#define PCPU_REF_DEAD 1
92
93/* 122/*
94 * Internal helper. Don't use outside percpu-refcount proper. The 123 * Internal helper. Don't use outside percpu-refcount proper. The
95 * function doesn't return the pointer and let the caller test it for NULL 124 * function doesn't return the pointer and let the caller test it for NULL
96 * because doing so forces the compiler to generate two conditional 125 * because doing so forces the compiler to generate two conditional
97 * branches as it can't assume that @ref->pcpu_count is not NULL. 126 * branches as it can't assume that @ref->percpu_count is not NULL.
98 */ 127 */
99static inline bool __pcpu_ref_alive(struct percpu_ref *ref, 128static inline bool __ref_is_percpu(struct percpu_ref *ref,
100 unsigned __percpu **pcpu_countp) 129 unsigned long __percpu **percpu_countp)
101{ 130{
102 unsigned long pcpu_ptr = ACCESS_ONCE(ref->pcpu_count_ptr); 131 unsigned long percpu_ptr = ACCESS_ONCE(ref->percpu_count_ptr);
103 132
104 /* paired with smp_store_release() in percpu_ref_reinit() */ 133 /* paired with smp_store_release() in percpu_ref_reinit() */
105 smp_read_barrier_depends(); 134 smp_read_barrier_depends();
106 135
107 if (unlikely(pcpu_ptr & PCPU_REF_DEAD)) 136 if (unlikely(percpu_ptr & __PERCPU_REF_ATOMIC))
108 return false; 137 return false;
109 138
110 *pcpu_countp = (unsigned __percpu *)pcpu_ptr; 139 *percpu_countp = (unsigned long __percpu *)percpu_ptr;
111 return true; 140 return true;
112} 141}
113 142
@@ -115,18 +144,20 @@ static inline bool __pcpu_ref_alive(struct percpu_ref *ref,
115 * percpu_ref_get - increment a percpu refcount 144 * percpu_ref_get - increment a percpu refcount
116 * @ref: percpu_ref to get 145 * @ref: percpu_ref to get
117 * 146 *
118 * Analagous to atomic_inc(). 147 * Analagous to atomic_long_inc().
119 */ 148 *
149 * This function is safe to call as long as @ref is between init and exit.
150 */
120static inline void percpu_ref_get(struct percpu_ref *ref) 151static inline void percpu_ref_get(struct percpu_ref *ref)
121{ 152{
122 unsigned __percpu *pcpu_count; 153 unsigned long __percpu *percpu_count;
123 154
124 rcu_read_lock_sched(); 155 rcu_read_lock_sched();
125 156
126 if (__pcpu_ref_alive(ref, &pcpu_count)) 157 if (__ref_is_percpu(ref, &percpu_count))
127 this_cpu_inc(*pcpu_count); 158 this_cpu_inc(*percpu_count);
128 else 159 else
129 atomic_inc(&ref->count); 160 atomic_long_inc(&ref->count);
130 161
131 rcu_read_unlock_sched(); 162 rcu_read_unlock_sched();
132} 163}
@@ -138,20 +169,20 @@ static inline void percpu_ref_get(struct percpu_ref *ref)
138 * Increment a percpu refcount unless its count already reached zero. 169 * Increment a percpu refcount unless its count already reached zero.
139 * Returns %true on success; %false on failure. 170 * Returns %true on success; %false on failure.
140 * 171 *
141 * The caller is responsible for ensuring that @ref stays accessible. 172 * This function is safe to call as long as @ref is between init and exit.
142 */ 173 */
143static inline bool percpu_ref_tryget(struct percpu_ref *ref) 174static inline bool percpu_ref_tryget(struct percpu_ref *ref)
144{ 175{
145 unsigned __percpu *pcpu_count; 176 unsigned long __percpu *percpu_count;
146 int ret = false; 177 int ret;
147 178
148 rcu_read_lock_sched(); 179 rcu_read_lock_sched();
149 180
150 if (__pcpu_ref_alive(ref, &pcpu_count)) { 181 if (__ref_is_percpu(ref, &percpu_count)) {
151 this_cpu_inc(*pcpu_count); 182 this_cpu_inc(*percpu_count);
152 ret = true; 183 ret = true;
153 } else { 184 } else {
154 ret = atomic_inc_not_zero(&ref->count); 185 ret = atomic_long_inc_not_zero(&ref->count);
155 } 186 }
156 187
157 rcu_read_unlock_sched(); 188 rcu_read_unlock_sched();
@@ -166,23 +197,26 @@ static inline bool percpu_ref_tryget(struct percpu_ref *ref)
166 * Increment a percpu refcount unless it has already been killed. Returns 197 * Increment a percpu refcount unless it has already been killed. Returns
167 * %true on success; %false on failure. 198 * %true on success; %false on failure.
168 * 199 *
169 * Completion of percpu_ref_kill() in itself doesn't guarantee that tryget 200 * Completion of percpu_ref_kill() in itself doesn't guarantee that this
170 * will fail. For such guarantee, percpu_ref_kill_and_confirm() should be 201 * function will fail. For such guarantee, percpu_ref_kill_and_confirm()
171 * used. After the confirm_kill callback is invoked, it's guaranteed that 202 * should be used. After the confirm_kill callback is invoked, it's
172 * no new reference will be given out by percpu_ref_tryget(). 203 * guaranteed that no new reference will be given out by
204 * percpu_ref_tryget_live().
173 * 205 *
174 * The caller is responsible for ensuring that @ref stays accessible. 206 * This function is safe to call as long as @ref is between init and exit.
175 */ 207 */
176static inline bool percpu_ref_tryget_live(struct percpu_ref *ref) 208static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
177{ 209{
178 unsigned __percpu *pcpu_count; 210 unsigned long __percpu *percpu_count;
179 int ret = false; 211 int ret = false;
180 212
181 rcu_read_lock_sched(); 213 rcu_read_lock_sched();
182 214
183 if (__pcpu_ref_alive(ref, &pcpu_count)) { 215 if (__ref_is_percpu(ref, &percpu_count)) {
184 this_cpu_inc(*pcpu_count); 216 this_cpu_inc(*percpu_count);
185 ret = true; 217 ret = true;
218 } else if (!(ACCESS_ONCE(ref->percpu_count_ptr) & __PERCPU_REF_DEAD)) {
219 ret = atomic_long_inc_not_zero(&ref->count);
186 } 220 }
187 221
188 rcu_read_unlock_sched(); 222 rcu_read_unlock_sched();
@@ -196,16 +230,18 @@ static inline bool percpu_ref_tryget_live(struct percpu_ref *ref)
196 * 230 *
197 * Decrement the refcount, and if 0, call the release function (which was passed 231 * Decrement the refcount, and if 0, call the release function (which was passed
198 * to percpu_ref_init()) 232 * to percpu_ref_init())
233 *
234 * This function is safe to call as long as @ref is between init and exit.
199 */ 235 */
200static inline void percpu_ref_put(struct percpu_ref *ref) 236static inline void percpu_ref_put(struct percpu_ref *ref)
201{ 237{
202 unsigned __percpu *pcpu_count; 238 unsigned long __percpu *percpu_count;
203 239
204 rcu_read_lock_sched(); 240 rcu_read_lock_sched();
205 241
206 if (__pcpu_ref_alive(ref, &pcpu_count)) 242 if (__ref_is_percpu(ref, &percpu_count))
207 this_cpu_dec(*pcpu_count); 243 this_cpu_dec(*percpu_count);
208 else if (unlikely(atomic_dec_and_test(&ref->count))) 244 else if (unlikely(atomic_long_dec_and_test(&ref->count)))
209 ref->release(ref); 245 ref->release(ref);
210 246
211 rcu_read_unlock_sched(); 247 rcu_read_unlock_sched();
@@ -216,14 +252,16 @@ static inline void percpu_ref_put(struct percpu_ref *ref)
216 * @ref: percpu_ref to test 252 * @ref: percpu_ref to test
217 * 253 *
218 * Returns %true if @ref reached zero. 254 * Returns %true if @ref reached zero.
255 *
256 * This function is safe to call as long as @ref is between init and exit.
219 */ 257 */
220static inline bool percpu_ref_is_zero(struct percpu_ref *ref) 258static inline bool percpu_ref_is_zero(struct percpu_ref *ref)
221{ 259{
222 unsigned __percpu *pcpu_count; 260 unsigned long __percpu *percpu_count;
223 261
224 if (__pcpu_ref_alive(ref, &pcpu_count)) 262 if (__ref_is_percpu(ref, &percpu_count))
225 return false; 263 return false;
226 return !atomic_read(&ref->count); 264 return !atomic_long_read(&ref->count);
227} 265}
228 266
229#endif 267#endif
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 6f61b61b7996..a3aa63e47637 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -48,9 +48,9 @@
48 * intelligent way to determine this would be nice. 48 * intelligent way to determine this would be nice.
49 */ 49 */
50#if BITS_PER_LONG > 32 50#if BITS_PER_LONG > 32
51#define PERCPU_DYNAMIC_RESERVE (20 << 10) 51#define PERCPU_DYNAMIC_RESERVE (28 << 10)
52#else 52#else
53#define PERCPU_DYNAMIC_RESERVE (12 << 10) 53#define PERCPU_DYNAMIC_RESERVE (20 << 10)
54#endif 54#endif
55 55
56extern void *pcpu_base_addr; 56extern void *pcpu_base_addr;
@@ -122,11 +122,16 @@ extern void __init setup_per_cpu_areas(void);
122#endif 122#endif
123extern void __init percpu_init_late(void); 123extern void __init percpu_init_late(void);
124 124
125extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp);
125extern void __percpu *__alloc_percpu(size_t size, size_t align); 126extern void __percpu *__alloc_percpu(size_t size, size_t align);
126extern void free_percpu(void __percpu *__pdata); 127extern void free_percpu(void __percpu *__pdata);
127extern phys_addr_t per_cpu_ptr_to_phys(void *addr); 128extern phys_addr_t per_cpu_ptr_to_phys(void *addr);
128 129
129#define alloc_percpu(type) \ 130#define alloc_percpu_gfp(type, gfp) \
130 (typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type)) 131 (typeof(type) __percpu *)__alloc_percpu_gfp(sizeof(type), \
132 __alignof__(type), gfp)
133#define alloc_percpu(type) \
134 (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \
135 __alignof__(type))
131 136
132#endif /* __LINUX_PERCPU_H */ 137#endif /* __LINUX_PERCPU_H */
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index d5dd4657c8d6..50e50095c8d1 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -12,6 +12,7 @@
12#include <linux/threads.h> 12#include <linux/threads.h>
13#include <linux/percpu.h> 13#include <linux/percpu.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/gfp.h>
15 16
16#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
17 18
@@ -26,14 +27,14 @@ struct percpu_counter {
26 27
27extern int percpu_counter_batch; 28extern int percpu_counter_batch;
28 29
29int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 30int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
30 struct lock_class_key *key); 31 struct lock_class_key *key);
31 32
32#define percpu_counter_init(fbc, value) \ 33#define percpu_counter_init(fbc, value, gfp) \
33 ({ \ 34 ({ \
34 static struct lock_class_key __key; \ 35 static struct lock_class_key __key; \
35 \ 36 \
36 __percpu_counter_init(fbc, value, &__key); \ 37 __percpu_counter_init(fbc, value, gfp, &__key); \
37 }) 38 })
38 39
39void percpu_counter_destroy(struct percpu_counter *fbc); 40void percpu_counter_destroy(struct percpu_counter *fbc);
@@ -89,7 +90,8 @@ struct percpu_counter {
89 s64 count; 90 s64 count;
90}; 91};
91 92
92static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount) 93static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
94 gfp_t gfp)
93{ 95{
94 fbc->count = amount; 96 fbc->count = amount;
95 return 0; 97 return 0;
diff --git a/include/linux/proportions.h b/include/linux/proportions.h
index 26a8a4ed9b07..00e8e8fa7358 100644
--- a/include/linux/proportions.h
+++ b/include/linux/proportions.h
@@ -12,6 +12,7 @@
12#include <linux/percpu_counter.h> 12#include <linux/percpu_counter.h>
13#include <linux/spinlock.h> 13#include <linux/spinlock.h>
14#include <linux/mutex.h> 14#include <linux/mutex.h>
15#include <linux/gfp.h>
15 16
16struct prop_global { 17struct prop_global {
17 /* 18 /*
@@ -40,7 +41,7 @@ struct prop_descriptor {
40 struct mutex mutex; /* serialize the prop_global switch */ 41 struct mutex mutex; /* serialize the prop_global switch */
41}; 42};
42 43
43int prop_descriptor_init(struct prop_descriptor *pd, int shift); 44int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp);
44void prop_change_shift(struct prop_descriptor *pd, int new_shift); 45void prop_change_shift(struct prop_descriptor *pd, int new_shift);
45 46
46/* 47/*
@@ -61,7 +62,7 @@ struct prop_local_percpu {
61 raw_spinlock_t lock; /* protect the snapshot state */ 62 raw_spinlock_t lock; /* protect the snapshot state */
62}; 63};
63 64
64int prop_local_init_percpu(struct prop_local_percpu *pl); 65int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp);
65void prop_local_destroy_percpu(struct prop_local_percpu *pl); 66void prop_local_destroy_percpu(struct prop_local_percpu *pl);
66void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl); 67void __prop_inc_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl);
67void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl, 68void prop_fraction_percpu(struct prop_descriptor *pd, struct prop_local_percpu *pl,
diff --git a/include/net/dst_ops.h b/include/net/dst_ops.h
index 2f26dfb8450e..1f99a1de0e4f 100644
--- a/include/net/dst_ops.h
+++ b/include/net/dst_ops.h
@@ -63,7 +63,7 @@ static inline void dst_entries_add(struct dst_ops *dst, int val)
63 63
64static inline int dst_entries_init(struct dst_ops *dst) 64static inline int dst_entries_init(struct dst_ops *dst)
65{ 65{
66 return percpu_counter_init(&dst->pcpuc_entries, 0); 66 return percpu_counter_init(&dst->pcpuc_entries, 0, GFP_KERNEL);
67} 67}
68 68
69static inline void dst_entries_destroy(struct dst_ops *dst) 69static inline void dst_entries_destroy(struct dst_ops *dst)
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 65a8855e99fe..8d1765577acc 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -151,7 +151,7 @@ static inline void add_frag_mem_limit(struct inet_frag_queue *q, int i)
151 151
152static inline void init_frag_mem_limit(struct netns_frags *nf) 152static inline void init_frag_mem_limit(struct netns_frags *nf)
153{ 153{
154 percpu_counter_init(&nf->mem, 0); 154 percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
155} 155}
156 156
157static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) 157static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index cab7dc4284dc..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1607,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
1607 goto out; 1607 goto out;
1608 root_cgrp->id = ret; 1608 root_cgrp->id = ret;
1609 1609
1610 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release); 1610 ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
1611 GFP_KERNEL);
1611 if (ret) 1612 if (ret)
1612 goto out; 1613 goto out;
1613 1614
@@ -4482,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
4482 4483
4483 init_and_link_css(css, ss, cgrp); 4484 init_and_link_css(css, ss, cgrp);
4484 4485
4485 err = percpu_ref_init(&css->refcnt, css_release); 4486 err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
4486 if (err) 4487 if (err)
4487 goto err_free_css; 4488 goto err_free_css;
4488 4489
@@ -4555,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
4555 goto out_unlock; 4556 goto out_unlock;
4556 } 4557 }
4557 4558
4558 ret = percpu_ref_init(&cgrp->self.refcnt, css_release); 4559 ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
4559 if (ret) 4560 if (ret)
4560 goto out_free_cgrp; 4561 goto out_free_cgrp;
4561 4562
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index ebf3bac460b0..8f25652f40d4 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -34,13 +34,13 @@
34 */ 34 */
35#include <linux/flex_proportions.h> 35#include <linux/flex_proportions.h>
36 36
37int fprop_global_init(struct fprop_global *p) 37int fprop_global_init(struct fprop_global *p, gfp_t gfp)
38{ 38{
39 int err; 39 int err;
40 40
41 p->period = 0; 41 p->period = 0;
42 /* Use 1 to avoid dealing with periods with 0 events... */ 42 /* Use 1 to avoid dealing with periods with 0 events... */
43 err = percpu_counter_init(&p->events, 1); 43 err = percpu_counter_init(&p->events, 1, gfp);
44 if (err) 44 if (err)
45 return err; 45 return err;
46 seqcount_init(&p->sequence); 46 seqcount_init(&p->sequence);
@@ -168,11 +168,11 @@ void fprop_fraction_single(struct fprop_global *p,
168 */ 168 */
169#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 169#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
170 170
171int fprop_local_init_percpu(struct fprop_local_percpu *pl) 171int fprop_local_init_percpu(struct fprop_local_percpu *pl, gfp_t gfp)
172{ 172{
173 int err; 173 int err;
174 174
175 err = percpu_counter_init(&pl->events, 0); 175 err = percpu_counter_init(&pl->events, 0, gfp);
176 if (err) 176 if (err)
177 return err; 177 return err;
178 pl->period = 0; 178 pl->period = 0;
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c
index a89cf09a8268..6111bcb28376 100644
--- a/lib/percpu-refcount.c
+++ b/lib/percpu-refcount.c
@@ -1,6 +1,8 @@
1#define pr_fmt(fmt) "%s: " fmt "\n", __func__ 1#define pr_fmt(fmt) "%s: " fmt "\n", __func__
2 2
3#include <linux/kernel.h> 3#include <linux/kernel.h>
4#include <linux/sched.h>
5#include <linux/wait.h>
4#include <linux/percpu-refcount.h> 6#include <linux/percpu-refcount.h>
5 7
6/* 8/*
@@ -11,8 +13,8 @@
11 * percpu counters will all sum to the correct value 13 * percpu counters will all sum to the correct value
12 * 14 *
13 * (More precisely: because moduler arithmatic is commutative the sum of all the 15 * (More precisely: because moduler arithmatic is commutative the sum of all the
14 * pcpu_count vars will be equal to what it would have been if all the gets and 16 * percpu_count vars will be equal to what it would have been if all the gets
15 * puts were done to a single integer, even if some of the percpu integers 17 * and puts were done to a single integer, even if some of the percpu integers
16 * overflow or underflow). 18 * overflow or underflow).
17 * 19 *
18 * The real trick to implementing percpu refcounts is shutdown. We can't detect 20 * The real trick to implementing percpu refcounts is shutdown. We can't detect
@@ -25,75 +27,64 @@
25 * works. 27 * works.
26 * 28 *
27 * Converting to non percpu mode is done with some RCUish stuff in 29 * Converting to non percpu mode is done with some RCUish stuff in
28 * percpu_ref_kill. Additionally, we need a bias value so that the atomic_t 30 * percpu_ref_kill. Additionally, we need a bias value so that the
29 * can't hit 0 before we've added up all the percpu refs. 31 * atomic_long_t can't hit 0 before we've added up all the percpu refs.
30 */ 32 */
31 33
32#define PCPU_COUNT_BIAS (1U << 31) 34#define PERCPU_COUNT_BIAS (1LU << (BITS_PER_LONG - 1))
33 35
34static unsigned __percpu *pcpu_count_ptr(struct percpu_ref *ref) 36static DECLARE_WAIT_QUEUE_HEAD(percpu_ref_switch_waitq);
37
38static unsigned long __percpu *percpu_count_ptr(struct percpu_ref *ref)
35{ 39{
36 return (unsigned __percpu *)(ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 40 return (unsigned long __percpu *)
41 (ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC_DEAD);
37} 42}
38 43
39/** 44/**
40 * percpu_ref_init - initialize a percpu refcount 45 * percpu_ref_init - initialize a percpu refcount
41 * @ref: percpu_ref to initialize 46 * @ref: percpu_ref to initialize
42 * @release: function which will be called when refcount hits 0 47 * @release: function which will be called when refcount hits 0
48 * @flags: PERCPU_REF_INIT_* flags
49 * @gfp: allocation mask to use
43 * 50 *
44 * Initializes the refcount in single atomic counter mode with a refcount of 1; 51 * Initializes @ref. If @flags is zero, @ref starts in percpu mode with a
45 * analagous to atomic_set(ref, 1). 52 * refcount of 1; analagous to atomic_long_set(ref, 1). See the
53 * definitions of PERCPU_REF_INIT_* flags for flag behaviors.
46 * 54 *
47 * Note that @release must not sleep - it may potentially be called from RCU 55 * Note that @release must not sleep - it may potentially be called from RCU
48 * callback context by percpu_ref_kill(). 56 * callback context by percpu_ref_kill().
49 */ 57 */
50int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) 58int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release,
59 unsigned int flags, gfp_t gfp)
51{ 60{
52 atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 61 size_t align = max_t(size_t, 1 << __PERCPU_REF_FLAG_BITS,
62 __alignof__(unsigned long));
63 unsigned long start_count = 0;
53 64
54 ref->pcpu_count_ptr = (unsigned long)alloc_percpu(unsigned); 65 ref->percpu_count_ptr = (unsigned long)
55 if (!ref->pcpu_count_ptr) 66 __alloc_percpu_gfp(sizeof(unsigned long), align, gfp);
67 if (!ref->percpu_count_ptr)
56 return -ENOMEM; 68 return -ENOMEM;
57 69
58 ref->release = release; 70 ref->force_atomic = flags & PERCPU_REF_INIT_ATOMIC;
59 return 0;
60}
61EXPORT_SYMBOL_GPL(percpu_ref_init);
62
63/**
64 * percpu_ref_reinit - re-initialize a percpu refcount
65 * @ref: perpcu_ref to re-initialize
66 *
67 * Re-initialize @ref so that it's in the same state as when it finished
68 * percpu_ref_init(). @ref must have been initialized successfully, killed
69 * and reached 0 but not exited.
70 *
71 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
72 * this function is in progress.
73 */
74void percpu_ref_reinit(struct percpu_ref *ref)
75{
76 unsigned __percpu *pcpu_count = pcpu_count_ptr(ref);
77 int cpu;
78 71
79 BUG_ON(!pcpu_count); 72 if (flags & (PERCPU_REF_INIT_ATOMIC | PERCPU_REF_INIT_DEAD))
80 WARN_ON(!percpu_ref_is_zero(ref)); 73 ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
74 else
75 start_count += PERCPU_COUNT_BIAS;
81 76
82 atomic_set(&ref->count, 1 + PCPU_COUNT_BIAS); 77 if (flags & PERCPU_REF_INIT_DEAD)
78 ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
79 else
80 start_count++;
83 81
84 /* 82 atomic_long_set(&ref->count, start_count);
85 * Restore per-cpu operation. smp_store_release() is paired with
86 * smp_read_barrier_depends() in __pcpu_ref_alive() and guarantees
87 * that the zeroing is visible to all percpu accesses which can see
88 * the following PCPU_REF_DEAD clearing.
89 */
90 for_each_possible_cpu(cpu)
91 *per_cpu_ptr(pcpu_count, cpu) = 0;
92 83
93 smp_store_release(&ref->pcpu_count_ptr, 84 ref->release = release;
94 ref->pcpu_count_ptr & ~PCPU_REF_DEAD); 85 return 0;
95} 86}
96EXPORT_SYMBOL_GPL(percpu_ref_reinit); 87EXPORT_SYMBOL_GPL(percpu_ref_init);
97 88
98/** 89/**
99 * percpu_ref_exit - undo percpu_ref_init() 90 * percpu_ref_exit - undo percpu_ref_init()
@@ -107,26 +98,39 @@ EXPORT_SYMBOL_GPL(percpu_ref_reinit);
107 */ 98 */
108void percpu_ref_exit(struct percpu_ref *ref) 99void percpu_ref_exit(struct percpu_ref *ref)
109{ 100{
110 unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 101 unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
111 102
112 if (pcpu_count) { 103 if (percpu_count) {
113 free_percpu(pcpu_count); 104 free_percpu(percpu_count);
114 ref->pcpu_count_ptr = PCPU_REF_DEAD; 105 ref->percpu_count_ptr = __PERCPU_REF_ATOMIC_DEAD;
115 } 106 }
116} 107}
117EXPORT_SYMBOL_GPL(percpu_ref_exit); 108EXPORT_SYMBOL_GPL(percpu_ref_exit);
118 109
119static void percpu_ref_kill_rcu(struct rcu_head *rcu) 110static void percpu_ref_call_confirm_rcu(struct rcu_head *rcu)
111{
112 struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
113
114 ref->confirm_switch(ref);
115 ref->confirm_switch = NULL;
116 wake_up_all(&percpu_ref_switch_waitq);
117
118 /* drop ref from percpu_ref_switch_to_atomic() */
119 percpu_ref_put(ref);
120}
121
122static void percpu_ref_switch_to_atomic_rcu(struct rcu_head *rcu)
120{ 123{
121 struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu); 124 struct percpu_ref *ref = container_of(rcu, struct percpu_ref, rcu);
122 unsigned __percpu *pcpu_count = pcpu_count_ptr(ref); 125 unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
123 unsigned count = 0; 126 unsigned long count = 0;
124 int cpu; 127 int cpu;
125 128
126 for_each_possible_cpu(cpu) 129 for_each_possible_cpu(cpu)
127 count += *per_cpu_ptr(pcpu_count, cpu); 130 count += *per_cpu_ptr(percpu_count, cpu);
128 131
129 pr_debug("global %i pcpu %i", atomic_read(&ref->count), (int) count); 132 pr_debug("global %ld percpu %ld",
133 atomic_long_read(&ref->count), (long)count);
130 134
131 /* 135 /*
132 * It's crucial that we sum the percpu counters _before_ adding the sum 136 * It's crucial that we sum the percpu counters _before_ adding the sum
@@ -140,21 +144,137 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
140 * reaching 0 before we add the percpu counts. But doing it at the same 144 * reaching 0 before we add the percpu counts. But doing it at the same
141 * time is equivalent and saves us atomic operations: 145 * time is equivalent and saves us atomic operations:
142 */ 146 */
147 atomic_long_add((long)count - PERCPU_COUNT_BIAS, &ref->count);
148
149 WARN_ONCE(atomic_long_read(&ref->count) <= 0,
150 "percpu ref (%pf) <= 0 (%ld) after switching to atomic",
151 ref->release, atomic_long_read(&ref->count));
152
153 /* @ref is viewed as dead on all CPUs, send out switch confirmation */
154 percpu_ref_call_confirm_rcu(rcu);
155}
156
157static void percpu_ref_noop_confirm_switch(struct percpu_ref *ref)
158{
159}
160
161static void __percpu_ref_switch_to_atomic(struct percpu_ref *ref,
162 percpu_ref_func_t *confirm_switch)
163{
164 if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC)) {
165 /* switching from percpu to atomic */
166 ref->percpu_count_ptr |= __PERCPU_REF_ATOMIC;
167
168 /*
169 * Non-NULL ->confirm_switch is used to indicate that
170 * switching is in progress. Use noop one if unspecified.
171 */
172 WARN_ON_ONCE(ref->confirm_switch);
173 ref->confirm_switch =
174 confirm_switch ?: percpu_ref_noop_confirm_switch;
175
176 percpu_ref_get(ref); /* put after confirmation */
177 call_rcu_sched(&ref->rcu, percpu_ref_switch_to_atomic_rcu);
178 } else if (confirm_switch) {
179 /*
180 * Somebody already set ATOMIC. Switching may still be in
181 * progress. @confirm_switch must be invoked after the
182 * switching is complete and a full sched RCU grace period
183 * has passed. Wait synchronously for the previous
184 * switching and schedule @confirm_switch invocation.
185 */
186 wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
187 ref->confirm_switch = confirm_switch;
143 188
144 atomic_add((int) count - PCPU_COUNT_BIAS, &ref->count); 189 percpu_ref_get(ref); /* put after confirmation */
190 call_rcu_sched(&ref->rcu, percpu_ref_call_confirm_rcu);
191 }
192}
193
194/**
195 * percpu_ref_switch_to_atomic - switch a percpu_ref to atomic mode
196 * @ref: percpu_ref to switch to atomic mode
197 * @confirm_switch: optional confirmation callback
198 *
199 * There's no reason to use this function for the usual reference counting.
200 * Use percpu_ref_kill[_and_confirm]().
201 *
202 * Schedule switching of @ref to atomic mode. All its percpu counts will
203 * be collected to the main atomic counter. On completion, when all CPUs
204 * are guaraneed to be in atomic mode, @confirm_switch, which may not
205 * block, is invoked. This function may be invoked concurrently with all
206 * the get/put operations and can safely be mixed with kill and reinit
207 * operations. Note that @ref will stay in atomic mode across kill/reinit
208 * cycles until percpu_ref_switch_to_percpu() is called.
209 *
210 * This function normally doesn't block and can be called from any context
211 * but it may block if @confirm_kill is specified and @ref is already in
212 * the process of switching to atomic mode. In such cases, @confirm_switch
213 * will be invoked after the switching is complete.
214 *
215 * Due to the way percpu_ref is implemented, @confirm_switch will be called
216 * after at least one full sched RCU grace period has passed but this is an
217 * implementation detail and must not be depended upon.
218 */
219void percpu_ref_switch_to_atomic(struct percpu_ref *ref,
220 percpu_ref_func_t *confirm_switch)
221{
222 ref->force_atomic = true;
223 __percpu_ref_switch_to_atomic(ref, confirm_switch);
224}
145 225
146 WARN_ONCE(atomic_read(&ref->count) <= 0, "percpu ref <= 0 (%i)", 226static void __percpu_ref_switch_to_percpu(struct percpu_ref *ref)
147 atomic_read(&ref->count)); 227{
228 unsigned long __percpu *percpu_count = percpu_count_ptr(ref);
229 int cpu;
230
231 BUG_ON(!percpu_count);
148 232
149 /* @ref is viewed as dead on all CPUs, send out kill confirmation */ 233 if (!(ref->percpu_count_ptr & __PERCPU_REF_ATOMIC))
150 if (ref->confirm_kill) 234 return;
151 ref->confirm_kill(ref); 235
236 wait_event(percpu_ref_switch_waitq, !ref->confirm_switch);
237
238 atomic_long_add(PERCPU_COUNT_BIAS, &ref->count);
152 239
153 /* 240 /*
154 * Now we're in single atomic_t mode with a consistent refcount, so it's 241 * Restore per-cpu operation. smp_store_release() is paired with
155 * safe to drop our initial ref: 242 * smp_read_barrier_depends() in __ref_is_percpu() and guarantees
243 * that the zeroing is visible to all percpu accesses which can see
244 * the following __PERCPU_REF_ATOMIC clearing.
156 */ 245 */
157 percpu_ref_put(ref); 246 for_each_possible_cpu(cpu)
247 *per_cpu_ptr(percpu_count, cpu) = 0;
248
249 smp_store_release(&ref->percpu_count_ptr,
250 ref->percpu_count_ptr & ~__PERCPU_REF_ATOMIC);
251}
252
253/**
254 * percpu_ref_switch_to_percpu - switch a percpu_ref to percpu mode
255 * @ref: percpu_ref to switch to percpu mode
256 *
257 * There's no reason to use this function for the usual reference counting.
258 * To re-use an expired ref, use percpu_ref_reinit().
259 *
260 * Switch @ref to percpu mode. This function may be invoked concurrently
261 * with all the get/put operations and can safely be mixed with kill and
262 * reinit operations. This function reverses the sticky atomic state set
263 * by PERCPU_REF_INIT_ATOMIC or percpu_ref_switch_to_atomic(). If @ref is
264 * dying or dead, the actual switching takes place on the following
265 * percpu_ref_reinit().
266 *
267 * This function normally doesn't block and can be called from any context
268 * but it may block if @ref is in the process of switching to atomic mode
269 * by percpu_ref_switch_atomic().
270 */
271void percpu_ref_switch_to_percpu(struct percpu_ref *ref)
272{
273 ref->force_atomic = false;
274
275 /* a dying or dead ref can't be switched to percpu mode w/o reinit */
276 if (!(ref->percpu_count_ptr & __PERCPU_REF_DEAD))
277 __percpu_ref_switch_to_percpu(ref);
158} 278}
159 279
160/** 280/**
@@ -164,39 +284,48 @@ static void percpu_ref_kill_rcu(struct rcu_head *rcu)
164 * 284 *
165 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if 285 * Equivalent to percpu_ref_kill() but also schedules kill confirmation if
166 * @confirm_kill is not NULL. @confirm_kill, which may not block, will be 286 * @confirm_kill is not NULL. @confirm_kill, which may not block, will be
167 * called after @ref is seen as dead from all CPUs - all further 287 * called after @ref is seen as dead from all CPUs at which point all
168 * invocations of percpu_ref_tryget() will fail. See percpu_ref_tryget() 288 * further invocations of percpu_ref_tryget_live() will fail. See
169 * for more details. 289 * percpu_ref_tryget_live() for details.
290 *
291 * This function normally doesn't block and can be called from any context
292 * but it may block if @confirm_kill is specified and @ref is in the
293 * process of switching to atomic mode by percpu_ref_switch_atomic().
170 * 294 *
171 * Due to the way percpu_ref is implemented, @confirm_kill will be called 295 * Due to the way percpu_ref is implemented, @confirm_switch will be called
172 * after at least one full RCU grace period has passed but this is an 296 * after at least one full sched RCU grace period has passed but this is an
173 * implementation detail and callers must not depend on it. 297 * implementation detail and must not be depended upon.
174 */ 298 */
175void percpu_ref_kill_and_confirm(struct percpu_ref *ref, 299void percpu_ref_kill_and_confirm(struct percpu_ref *ref,
176 percpu_ref_func_t *confirm_kill) 300 percpu_ref_func_t *confirm_kill)
177{ 301{
178 WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 302 WARN_ONCE(ref->percpu_count_ptr & __PERCPU_REF_DEAD,
179 "percpu_ref_kill() called more than once!\n"); 303 "%s called more than once on %pf!", __func__, ref->release);
180 304
181 ref->pcpu_count_ptr |= PCPU_REF_DEAD; 305 ref->percpu_count_ptr |= __PERCPU_REF_DEAD;
182 ref->confirm_kill = confirm_kill; 306 __percpu_ref_switch_to_atomic(ref, confirm_kill);
183 307 percpu_ref_put(ref);
184 call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu);
185} 308}
186EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); 309EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm);
187 310
188/* 311/**
189 * XXX: Temporary kludge to work around SCSI blk-mq stall. Used only by 312 * percpu_ref_reinit - re-initialize a percpu refcount
 190 * block/blk-mq.c::blk_mq_freeze_queue(). Will be removed during v3.18 313 * @ref: percpu_ref to re-initialize
191 * devel cycle. Do not use anywhere else. 314 *
315 * Re-initialize @ref so that it's in the same state as when it finished
316 * percpu_ref_init() ignoring %PERCPU_REF_INIT_DEAD. @ref must have been
317 * initialized successfully and reached 0 but not exited.
318 *
319 * Note that percpu_ref_tryget[_live]() are safe to perform on @ref while
320 * this function is in progress.
192 */ 321 */
193void __percpu_ref_kill_expedited(struct percpu_ref *ref) 322void percpu_ref_reinit(struct percpu_ref *ref)
194{ 323{
195 WARN_ONCE(ref->pcpu_count_ptr & PCPU_REF_DEAD, 324 WARN_ON_ONCE(!percpu_ref_is_zero(ref));
196 "percpu_ref_kill() called more than once on %pf!",
197 ref->release);
198 325
199 ref->pcpu_count_ptr |= PCPU_REF_DEAD; 326 ref->percpu_count_ptr &= ~__PERCPU_REF_DEAD;
200 synchronize_sched_expedited(); 327 percpu_ref_get(ref);
201 percpu_ref_kill_rcu(&ref->rcu); 328 if (!ref->force_atomic)
329 __percpu_ref_switch_to_percpu(ref);
202} 330}
331EXPORT_SYMBOL_GPL(percpu_ref_reinit);
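
Taken together, percpu_ref_kill_and_confirm() and percpu_ref_reinit() support roughly the freeze/unfreeze pattern block/blk-mq.c::blk_mq_freeze_queue() needs. A hedged sketch: obj, freeze_wq and the helpers are invented for illustration, obj_usage_release is assumed to be the release callback passed to percpu_ref_init(), and init_waitqueue_head() is assumed to have been called at setup.

        struct obj {
                struct percpu_ref       usage;
                wait_queue_head_t       freeze_wq;
        };

        /* release callback registered with percpu_ref_init() */
        static void obj_usage_release(struct percpu_ref *ref)
        {
                struct obj *o = container_of(ref, struct obj, usage);

                wake_up_all(&o->freeze_wq);             /* last user is gone */
        }

        static void obj_freeze(struct obj *o)
        {
                percpu_ref_kill(&o->usage);             /* tryget_live() now fails */
                wait_event(o->freeze_wq, percpu_ref_is_zero(&o->usage));
        }

        static void obj_unfreeze(struct obj *o)
        {
                /*
                 * Back to the post-init state: percpu mode unless the ref was
                 * initialized or switched sticky-atomic.
                 */
                percpu_ref_reinit(&o->usage);
        }
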
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 7dd33577b905..48144cdae819 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -112,13 +112,15 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc)
112} 112}
113EXPORT_SYMBOL(__percpu_counter_sum); 113EXPORT_SYMBOL(__percpu_counter_sum);
114 114
115int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, 115int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
116 struct lock_class_key *key) 116 struct lock_class_key *key)
117{ 117{
118 unsigned long flags __maybe_unused;
119
118 raw_spin_lock_init(&fbc->lock); 120 raw_spin_lock_init(&fbc->lock);
119 lockdep_set_class(&fbc->lock, key); 121 lockdep_set_class(&fbc->lock, key);
120 fbc->count = amount; 122 fbc->count = amount;
121 fbc->counters = alloc_percpu(s32); 123 fbc->counters = alloc_percpu_gfp(s32, gfp);
122 if (!fbc->counters) 124 if (!fbc->counters)
123 return -ENOMEM; 125 return -ENOMEM;
124 126
@@ -126,9 +128,9 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount,
126 128
127#ifdef CONFIG_HOTPLUG_CPU 129#ifdef CONFIG_HOTPLUG_CPU
128 INIT_LIST_HEAD(&fbc->list); 130 INIT_LIST_HEAD(&fbc->list);
129 spin_lock(&percpu_counters_lock); 131 spin_lock_irqsave(&percpu_counters_lock, flags);
130 list_add(&fbc->list, &percpu_counters); 132 list_add(&fbc->list, &percpu_counters);
131 spin_unlock(&percpu_counters_lock); 133 spin_unlock_irqrestore(&percpu_counters_lock, flags);
132#endif 134#endif
133 return 0; 135 return 0;
134} 136}
@@ -136,15 +138,17 @@ EXPORT_SYMBOL(__percpu_counter_init);
136 138
137void percpu_counter_destroy(struct percpu_counter *fbc) 139void percpu_counter_destroy(struct percpu_counter *fbc)
138{ 140{
141 unsigned long flags __maybe_unused;
142
139 if (!fbc->counters) 143 if (!fbc->counters)
140 return; 144 return;
141 145
142 debug_percpu_counter_deactivate(fbc); 146 debug_percpu_counter_deactivate(fbc);
143 147
144#ifdef CONFIG_HOTPLUG_CPU 148#ifdef CONFIG_HOTPLUG_CPU
145 spin_lock(&percpu_counters_lock); 149 spin_lock_irqsave(&percpu_counters_lock, flags);
146 list_del(&fbc->list); 150 list_del(&fbc->list);
147 spin_unlock(&percpu_counters_lock); 151 spin_unlock_irqrestore(&percpu_counters_lock, flags);
148#endif 152#endif
149 free_percpu(fbc->counters); 153 free_percpu(fbc->counters);
150 fbc->counters = NULL; 154 fbc->counters = NULL;
@@ -173,7 +177,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
173 return NOTIFY_OK; 177 return NOTIFY_OK;
174 178
175 cpu = (unsigned long)hcpu; 179 cpu = (unsigned long)hcpu;
176 spin_lock(&percpu_counters_lock); 180 spin_lock_irq(&percpu_counters_lock);
177 list_for_each_entry(fbc, &percpu_counters, list) { 181 list_for_each_entry(fbc, &percpu_counters, list) {
178 s32 *pcount; 182 s32 *pcount;
179 unsigned long flags; 183 unsigned long flags;
@@ -184,7 +188,7 @@ static int percpu_counter_hotcpu_callback(struct notifier_block *nb,
184 *pcount = 0; 188 *pcount = 0;
185 raw_spin_unlock_irqrestore(&fbc->lock, flags); 189 raw_spin_unlock_irqrestore(&fbc->lock, flags);
186 } 190 }
187 spin_unlock(&percpu_counters_lock); 191 spin_unlock_irq(&percpu_counters_lock);
188#endif 192#endif
189 return NOTIFY_OK; 193 return NOTIFY_OK;
190} 194}
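
For percpu_counter users the only visible change here is the extra @gfp argument; a minimal sketch of the updated init/teardown pairing (the nr_things/things_* names are illustrative):

        static struct percpu_counter nr_things;

        static int __init things_init(void)
        {
                /*
                 * GFP_KERNEL may sleep as before; GFP_NOWAIT/GFP_ATOMIC callers
                 * are now also allowed and are served from the allocator's
                 * populated reserve, so they can fail more easily.
                 */
                int err = percpu_counter_init(&nr_things, 0, GFP_KERNEL);

                if (err)
                        return err;
                percpu_counter_inc(&nr_things);
                return 0;
        }

        static void things_exit(void)
        {
                percpu_counter_destroy(&nr_things);
        }
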
diff --git a/lib/proportions.c b/lib/proportions.c
index 05df84801b56..6f724298f67a 100644
--- a/lib/proportions.c
+++ b/lib/proportions.c
@@ -73,7 +73,7 @@
73#include <linux/proportions.h> 73#include <linux/proportions.h>
74#include <linux/rcupdate.h> 74#include <linux/rcupdate.h>
75 75
76int prop_descriptor_init(struct prop_descriptor *pd, int shift) 76int prop_descriptor_init(struct prop_descriptor *pd, int shift, gfp_t gfp)
77{ 77{
78 int err; 78 int err;
79 79
@@ -83,11 +83,11 @@ int prop_descriptor_init(struct prop_descriptor *pd, int shift)
83 pd->index = 0; 83 pd->index = 0;
84 pd->pg[0].shift = shift; 84 pd->pg[0].shift = shift;
85 mutex_init(&pd->mutex); 85 mutex_init(&pd->mutex);
86 err = percpu_counter_init(&pd->pg[0].events, 0); 86 err = percpu_counter_init(&pd->pg[0].events, 0, gfp);
87 if (err) 87 if (err)
88 goto out; 88 goto out;
89 89
90 err = percpu_counter_init(&pd->pg[1].events, 0); 90 err = percpu_counter_init(&pd->pg[1].events, 0, gfp);
91 if (err) 91 if (err)
92 percpu_counter_destroy(&pd->pg[0].events); 92 percpu_counter_destroy(&pd->pg[0].events);
93 93
@@ -188,12 +188,12 @@ prop_adjust_shift(int *pl_shift, unsigned long *pl_period, int new_shift)
188 188
189#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids))) 189#define PROP_BATCH (8*(1+ilog2(nr_cpu_ids)))
190 190
191int prop_local_init_percpu(struct prop_local_percpu *pl) 191int prop_local_init_percpu(struct prop_local_percpu *pl, gfp_t gfp)
192{ 192{
193 raw_spin_lock_init(&pl->lock); 193 raw_spin_lock_init(&pl->lock);
194 pl->shift = 0; 194 pl->shift = 0;
195 pl->period = 0; 195 pl->period = 0;
196 return percpu_counter_init(&pl->events, 0); 196 return percpu_counter_init(&pl->events, 0, gfp);
197} 197}
198 198
199void prop_local_destroy_percpu(struct prop_local_percpu *pl) 199void prop_local_destroy_percpu(struct prop_local_percpu *pl)
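
The proportions API changes the same way; a brief hedged sketch with made-up names and an arbitrary shift:

        static struct prop_descriptor   my_prop;
        static struct prop_local_percpu my_local;

        static int __init my_prop_setup(void)
        {
                int err;

                err = prop_descriptor_init(&my_prop, 8, GFP_KERNEL);   /* shift = 8 is arbitrary */
                if (err)
                        return err;

                return prop_local_init_percpu(&my_local, GFP_KERNEL);
        }

        static void my_prop_teardown(void)
        {
                prop_local_destroy_percpu(&my_local);
        }
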
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index b27714f1b40f..12a992b62576 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -455,7 +455,7 @@ int bdi_init(struct backing_dev_info *bdi)
455 bdi_wb_init(&bdi->wb, bdi); 455 bdi_wb_init(&bdi->wb, bdi);
456 456
457 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { 457 for (i = 0; i < NR_BDI_STAT_ITEMS; i++) {
458 err = percpu_counter_init(&bdi->bdi_stat[i], 0); 458 err = percpu_counter_init(&bdi->bdi_stat[i], 0, GFP_KERNEL);
459 if (err) 459 if (err)
460 goto err; 460 goto err;
461 } 461 }
@@ -470,7 +470,7 @@ int bdi_init(struct backing_dev_info *bdi)
470 bdi->write_bandwidth = INIT_BW; 470 bdi->write_bandwidth = INIT_BW;
471 bdi->avg_write_bandwidth = INIT_BW; 471 bdi->avg_write_bandwidth = INIT_BW;
472 472
473 err = fprop_local_init_percpu(&bdi->completions); 473 err = fprop_local_init_percpu(&bdi->completions, GFP_KERNEL);
474 474
475 if (err) { 475 if (err) {
476err: 476err:
diff --git a/mm/mmap.c b/mm/mmap.c
index 16d19b48e2ad..93d28c7e5420 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3202,7 +3202,7 @@ void __init mmap_init(void)
3202{ 3202{
3203 int ret; 3203 int ret;
3204 3204
3205 ret = percpu_counter_init(&vm_committed_as, 0); 3205 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
3206 VM_BUG_ON(ret); 3206 VM_BUG_ON(ret);
3207} 3207}
3208 3208
diff --git a/mm/nommu.c b/mm/nommu.c
index a881d9673c6b..bd1808e194a7 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -539,7 +539,7 @@ void __init mmap_init(void)
539{ 539{
540 int ret; 540 int ret;
541 541
542 ret = percpu_counter_init(&vm_committed_as, 0); 542 ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
543 VM_BUG_ON(ret); 543 VM_BUG_ON(ret);
544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC); 544 vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
545} 545}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 35ca7102d421..ff24c9d83112 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1777,7 +1777,7 @@ void __init page_writeback_init(void)
1777 writeback_set_ratelimit(); 1777 writeback_set_ratelimit();
1778 register_cpu_notifier(&ratelimit_nb); 1778 register_cpu_notifier(&ratelimit_nb);
1779 1779
1780 fprop_global_init(&writeout_completions); 1780 fprop_global_init(&writeout_completions, GFP_KERNEL);
1781} 1781}
1782 1782
1783/** 1783/**
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 89633fefc6a2..10e3d0b8a86d 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -33,17 +33,14 @@
33 33
34#include <linux/log2.h> 34#include <linux/log2.h>
35 35
36static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 36static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
37 int page_start, int page_end)
37{ 38{
38 unsigned int cpu;
39
40 for_each_possible_cpu(cpu)
41 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
42
43 return 0; 39 return 0;
44} 40}
45 41
46static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 42static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
43 int page_start, int page_end)
47{ 44{
48 /* nada */ 45 /* nada */
49} 46}
@@ -70,6 +67,11 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
70 67
71 chunk->data = pages; 68 chunk->data = pages;
72 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; 69 chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
70
71 spin_lock_irq(&pcpu_lock);
72 pcpu_chunk_populated(chunk, 0, nr_pages);
73 spin_unlock_irq(&pcpu_lock);
74
73 return chunk; 75 return chunk;
74} 76}
75 77
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 51108165f829..538998a137d2 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -20,46 +20,25 @@ static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
20} 20}
21 21
22/** 22/**
23 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap 23 * pcpu_get_pages - get temp pages array
24 * @chunk: chunk of interest 24 * @chunk: chunk of interest
25 * @bitmapp: output parameter for bitmap
26 * @may_alloc: may allocate the array
27 * 25 *
28 * Returns pointer to array of pointers to struct page and bitmap, 26 * Returns pointer to array of pointers to struct page which can be indexed
29 * both of which can be indexed with pcpu_page_idx(). The returned 27 * with pcpu_page_idx(). Note that there is only one array and accesses
30 * array is cleared to zero and *@bitmapp is copied from 28 * should be serialized by pcpu_alloc_mutex.
31 * @chunk->populated. Note that there is only one array and bitmap
32 * and access exclusion is the caller's responsibility.
33 *
34 * CONTEXT:
35 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
36 * Otherwise, don't care.
37 * 29 *
38 * RETURNS: 30 * RETURNS:
39 * Pointer to temp pages array on success, NULL on failure. 31 * Pointer to temp pages array on success.
40 */ 32 */
41static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, 33static struct page **pcpu_get_pages(struct pcpu_chunk *chunk_alloc)
42 unsigned long **bitmapp,
43 bool may_alloc)
44{ 34{
45 static struct page **pages; 35 static struct page **pages;
46 static unsigned long *bitmap;
47 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); 36 size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
48 size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
49 sizeof(unsigned long);
50
51 if (!pages || !bitmap) {
52 if (may_alloc && !pages)
53 pages = pcpu_mem_zalloc(pages_size);
54 if (may_alloc && !bitmap)
55 bitmap = pcpu_mem_zalloc(bitmap_size);
56 if (!pages || !bitmap)
57 return NULL;
58 }
59 37
60 bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); 38 lockdep_assert_held(&pcpu_alloc_mutex);
61 39
62 *bitmapp = bitmap; 40 if (!pages)
41 pages = pcpu_mem_zalloc(pages_size);
63 return pages; 42 return pages;
64} 43}
65 44
@@ -67,7 +46,6 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
67 * pcpu_free_pages - free pages which were allocated for @chunk 46 * pcpu_free_pages - free pages which were allocated for @chunk
68 * @chunk: chunk pages were allocated for 47 * @chunk: chunk pages were allocated for
69 * @pages: array of pages to be freed, indexed by pcpu_page_idx() 48 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
70 * @populated: populated bitmap
71 * @page_start: page index of the first page to be freed 49 * @page_start: page index of the first page to be freed
72 * @page_end: page index of the last page to be freed + 1 50 * @page_end: page index of the last page to be freed + 1
73 * 51 *
@@ -75,8 +53,7 @@ static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
75 * The pages were allocated for @chunk. 53 * The pages were allocated for @chunk.
76 */ 54 */
77static void pcpu_free_pages(struct pcpu_chunk *chunk, 55static void pcpu_free_pages(struct pcpu_chunk *chunk,
78 struct page **pages, unsigned long *populated, 56 struct page **pages, int page_start, int page_end)
79 int page_start, int page_end)
80{ 57{
81 unsigned int cpu; 58 unsigned int cpu;
82 int i; 59 int i;
@@ -95,7 +72,6 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
95 * pcpu_alloc_pages - allocates pages for @chunk 72 * pcpu_alloc_pages - allocates pages for @chunk
96 * @chunk: target chunk 73 * @chunk: target chunk
97 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() 74 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
98 * @populated: populated bitmap
99 * @page_start: page index of the first page to be allocated 75 * @page_start: page index of the first page to be allocated
100 * @page_end: page index of the last page to be allocated + 1 76 * @page_end: page index of the last page to be allocated + 1
101 * 77 *
@@ -104,8 +80,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk,
104 * content of @pages and will pass it verbatim to pcpu_map_pages(). 80 * content of @pages and will pass it verbatim to pcpu_map_pages().
105 */ 81 */
106static int pcpu_alloc_pages(struct pcpu_chunk *chunk, 82static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
107 struct page **pages, unsigned long *populated, 83 struct page **pages, int page_start, int page_end)
108 int page_start, int page_end)
109{ 84{
110 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; 85 const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
111 unsigned int cpu, tcpu; 86 unsigned int cpu, tcpu;
@@ -164,7 +139,6 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
164 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk 139 * pcpu_unmap_pages - unmap pages out of a pcpu_chunk
165 * @chunk: chunk of interest 140 * @chunk: chunk of interest
166 * @pages: pages array which can be used to pass information to free 141 * @pages: pages array which can be used to pass information to free
167 * @populated: populated bitmap
168 * @page_start: page index of the first page to unmap 142 * @page_start: page index of the first page to unmap
169 * @page_end: page index of the last page to unmap + 1 143 * @page_end: page index of the last page to unmap + 1
170 * 144 *
@@ -175,8 +149,7 @@ static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
175 * proper pre/post flush functions. 149 * proper pre/post flush functions.
176 */ 150 */
177static void pcpu_unmap_pages(struct pcpu_chunk *chunk, 151static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
178 struct page **pages, unsigned long *populated, 152 struct page **pages, int page_start, int page_end)
179 int page_start, int page_end)
180{ 153{
181 unsigned int cpu; 154 unsigned int cpu;
182 int i; 155 int i;
@@ -192,8 +165,6 @@ static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
192 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), 165 __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
193 page_end - page_start); 166 page_end - page_start);
194 } 167 }
195
196 bitmap_clear(populated, page_start, page_end - page_start);
197} 168}
198 169
199/** 170/**
@@ -228,7 +199,6 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
228 * pcpu_map_pages - map pages into a pcpu_chunk 199 * pcpu_map_pages - map pages into a pcpu_chunk
229 * @chunk: chunk of interest 200 * @chunk: chunk of interest
230 * @pages: pages array containing pages to be mapped 201 * @pages: pages array containing pages to be mapped
231 * @populated: populated bitmap
232 * @page_start: page index of the first page to map 202 * @page_start: page index of the first page to map
233 * @page_end: page index of the last page to map + 1 203 * @page_end: page index of the last page to map + 1
234 * 204 *
@@ -236,13 +206,11 @@ static int __pcpu_map_pages(unsigned long addr, struct page **pages,
236 * caller is responsible for calling pcpu_post_map_flush() after all 206 * caller is responsible for calling pcpu_post_map_flush() after all
237 * mappings are complete. 207 * mappings are complete.
238 * 208 *
239 * This function is responsible for setting corresponding bits in 209 * This function is responsible for setting up whatever is necessary for
240 * @chunk->populated bitmap and whatever is necessary for reverse 210 * reverse lookup (addr -> chunk).
241 * lookup (addr -> chunk).
242 */ 211 */
243static int pcpu_map_pages(struct pcpu_chunk *chunk, 212static int pcpu_map_pages(struct pcpu_chunk *chunk,
244 struct page **pages, unsigned long *populated, 213 struct page **pages, int page_start, int page_end)
245 int page_start, int page_end)
246{ 214{
247 unsigned int cpu, tcpu; 215 unsigned int cpu, tcpu;
248 int i, err; 216 int i, err;
@@ -253,18 +221,12 @@ static int pcpu_map_pages(struct pcpu_chunk *chunk,
253 page_end - page_start); 221 page_end - page_start);
254 if (err < 0) 222 if (err < 0)
255 goto err; 223 goto err;
256 }
257 224
258 /* mapping successful, link chunk and mark populated */ 225 for (i = page_start; i < page_end; i++)
259 for (i = page_start; i < page_end; i++) {
260 for_each_possible_cpu(cpu)
261 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], 226 pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
262 chunk); 227 chunk);
263 __set_bit(i, populated);
264 } 228 }
265
266 return 0; 229 return 0;
267
268err: 230err:
269 for_each_possible_cpu(tcpu) { 231 for_each_possible_cpu(tcpu) {
270 if (tcpu == cpu) 232 if (tcpu == cpu)
@@ -299,123 +261,69 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
299/** 261/**
300 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk 262 * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
301 * @chunk: chunk of interest 263 * @chunk: chunk of interest
302 * @off: offset to the area to populate 264 * @page_start: the start page
303 * @size: size of the area to populate in bytes 265 * @page_end: the end page
304 * 266 *
305 * For each cpu, populate and map pages [@page_start,@page_end) into 267 * For each cpu, populate and map pages [@page_start,@page_end) into
306 * @chunk. The area is cleared on return. 268 * @chunk.
307 * 269 *
308 * CONTEXT: 270 * CONTEXT:
309 * pcpu_alloc_mutex, does GFP_KERNEL allocation. 271 * pcpu_alloc_mutex, does GFP_KERNEL allocation.
310 */ 272 */
311static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) 273static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
274 int page_start, int page_end)
312{ 275{
313 int page_start = PFN_DOWN(off);
314 int page_end = PFN_UP(off + size);
315 int free_end = page_start, unmap_end = page_start;
316 struct page **pages; 276 struct page **pages;
317 unsigned long *populated;
318 unsigned int cpu;
319 int rs, re, rc;
320
321 /* quick path, check whether all pages are already there */
322 rs = page_start;
323 pcpu_next_pop(chunk, &rs, &re, page_end);
324 if (rs == page_start && re == page_end)
325 goto clear;
326 277
327 /* need to allocate and map pages, this chunk can't be immutable */ 278 pages = pcpu_get_pages(chunk);
328 WARN_ON(chunk->immutable);
329
330 pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
331 if (!pages) 279 if (!pages)
332 return -ENOMEM; 280 return -ENOMEM;
333 281
334 /* alloc and map */ 282 if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
335 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 283 return -ENOMEM;
336 rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
337 if (rc)
338 goto err_free;
339 free_end = re;
340 }
341 284
342 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { 285 if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
343 rc = pcpu_map_pages(chunk, pages, populated, rs, re); 286 pcpu_free_pages(chunk, pages, page_start, page_end);
344 if (rc) 287 return -ENOMEM;
345 goto err_unmap;
346 unmap_end = re;
347 } 288 }
348 pcpu_post_map_flush(chunk, page_start, page_end); 289 pcpu_post_map_flush(chunk, page_start, page_end);
349 290
350 /* commit new bitmap */
351 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
352clear:
353 for_each_possible_cpu(cpu)
354 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
355 return 0; 291 return 0;
356
357err_unmap:
358 pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
359 pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
360 pcpu_unmap_pages(chunk, pages, populated, rs, re);
361 pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
362err_free:
363 pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
364 pcpu_free_pages(chunk, pages, populated, rs, re);
365 return rc;
366} 292}
367 293
368/** 294/**
369 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk 295 * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
370 * @chunk: chunk to depopulate 296 * @chunk: chunk to depopulate
371 * @off: offset to the area to depopulate 297 * @page_start: the start page
372 * @size: size of the area to depopulate in bytes 298 * @page_end: the end page
373 * 299 *
374 * For each cpu, depopulate and unmap pages [@page_start,@page_end) 300 * For each cpu, depopulate and unmap pages [@page_start,@page_end)
375 * from @chunk. If @flush is true, vcache is flushed before unmapping 301 * from @chunk.
376 * and tlb after.
377 * 302 *
378 * CONTEXT: 303 * CONTEXT:
379 * pcpu_alloc_mutex. 304 * pcpu_alloc_mutex.
380 */ 305 */
381static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) 306static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
307 int page_start, int page_end)
382{ 308{
383 int page_start = PFN_DOWN(off);
384 int page_end = PFN_UP(off + size);
385 struct page **pages; 309 struct page **pages;
386 unsigned long *populated;
387 int rs, re;
388
389 /* quick path, check whether it's empty already */
390 rs = page_start;
391 pcpu_next_unpop(chunk, &rs, &re, page_end);
392 if (rs == page_start && re == page_end)
393 return;
394
395 /* immutable chunks can't be depopulated */
396 WARN_ON(chunk->immutable);
397 310
398 /* 311 /*
399 * If control reaches here, there must have been at least one 312 * If control reaches here, there must have been at least one
400 * successful population attempt so the temp pages array must 313 * successful population attempt so the temp pages array must
401 * be available now. 314 * be available now.
402 */ 315 */
403 pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); 316 pages = pcpu_get_pages(chunk);
404 BUG_ON(!pages); 317 BUG_ON(!pages);
405 318
406 /* unmap and free */ 319 /* unmap and free */
407 pcpu_pre_unmap_flush(chunk, page_start, page_end); 320 pcpu_pre_unmap_flush(chunk, page_start, page_end);
408 321
409 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 322 pcpu_unmap_pages(chunk, pages, page_start, page_end);
410 pcpu_unmap_pages(chunk, pages, populated, rs, re);
411 323
412 /* no need to flush tlb, vmalloc will handle it lazily */ 324 /* no need to flush tlb, vmalloc will handle it lazily */
413 325
414 pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) 326 pcpu_free_pages(chunk, pages, page_start, page_end);
415 pcpu_free_pages(chunk, pages, populated, rs, re);
416
417 /* commit new bitmap */
418 bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
419} 327}
420 328
421static struct pcpu_chunk *pcpu_create_chunk(void) 329static struct pcpu_chunk *pcpu_create_chunk(void)
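
With the populate/depopulate hooks now taking page ranges, the caller drives them per unpopulated region and does the bookkeeping itself. A condensed sketch of what pcpu_alloc() and pcpu_balance_workfn() do in the mm/percpu.c hunks below; example_populate_range is an illustrative wrapper, not a function added by this series:

        static int example_populate_range(struct pcpu_chunk *chunk,
                                          int page_start, int page_end)
        {
                int rs, re, ret;

                pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
                        ret = pcpu_populate_chunk(chunk, rs, re);
                        if (ret)
                                return ret;

                        /* keep chunk->populated and the empty-page count in sync */
                        spin_lock_irq(&pcpu_lock);
                        pcpu_chunk_populated(chunk, rs, re);
                        spin_unlock_irq(&pcpu_lock);
                }
                return 0;
        }
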
diff --git a/mm/percpu.c b/mm/percpu.c
index da997f9800bd..014bab65e0ff 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,10 @@
76 76
77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ 77#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ 78#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
79#define PCPU_ATOMIC_MAP_MARGIN_LOW 32
80#define PCPU_ATOMIC_MAP_MARGIN_HIGH 64
81#define PCPU_EMPTY_POP_PAGES_LOW 2
82#define PCPU_EMPTY_POP_PAGES_HIGH 4
79 83
80#ifdef CONFIG_SMP 84#ifdef CONFIG_SMP
81/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ 85/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
@@ -102,12 +106,16 @@ struct pcpu_chunk {
102 int free_size; /* free bytes in the chunk */ 106 int free_size; /* free bytes in the chunk */
103 int contig_hint; /* max contiguous size hint */ 107 int contig_hint; /* max contiguous size hint */
104 void *base_addr; /* base address of this chunk */ 108 void *base_addr; /* base address of this chunk */
109
105 int map_used; /* # of map entries used before the sentry */ 110 int map_used; /* # of map entries used before the sentry */
106 int map_alloc; /* # of map entries allocated */ 111 int map_alloc; /* # of map entries allocated */
107 int *map; /* allocation map */ 112 int *map; /* allocation map */
113 struct work_struct map_extend_work;/* async ->map[] extension */
114
108 void *data; /* chunk data */ 115 void *data; /* chunk data */
109 int first_free; /* no free below this */ 116 int first_free; /* no free below this */
110 bool immutable; /* no [de]population allowed */ 117 bool immutable; /* no [de]population allowed */
118 int nr_populated; /* # of populated pages */
111 unsigned long populated[]; /* populated bitmap */ 119 unsigned long populated[]; /* populated bitmap */
112}; 120};
113 121
@@ -151,38 +159,33 @@ static struct pcpu_chunk *pcpu_first_chunk;
151static struct pcpu_chunk *pcpu_reserved_chunk; 159static struct pcpu_chunk *pcpu_reserved_chunk;
152static int pcpu_reserved_chunk_limit; 160static int pcpu_reserved_chunk_limit;
153 161
162static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
163static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */
164
165static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
166
154/* 167/*
155 * Synchronization rules. 168 * The number of empty populated pages, protected by pcpu_lock. The
156 * 169 * reserved chunk doesn't contribute to the count.
157 * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former
158 * protects allocation/reclaim paths, chunks, populated bitmap and
159 * vmalloc mapping. The latter is a spinlock and protects the index
160 * data structures - chunk slots, chunks and area maps in chunks.
161 *
162 * During allocation, pcpu_alloc_mutex is kept locked all the time and
163 * pcpu_lock is grabbed and released as necessary. All actual memory
164 * allocations are done using GFP_KERNEL with pcpu_lock released. In
165 * general, percpu memory can't be allocated with irq off but
166 * irqsave/restore are still used in alloc path so that it can be used
167 * from early init path - sched_init() specifically.
168 *
169 * Free path accesses and alters only the index data structures, so it
170 * can be safely called from atomic context. When memory needs to be
171 * returned to the system, free path schedules reclaim_work which
172 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
173 * reclaimed, release both locks and frees the chunks. Note that it's
174 * necessary to grab both locks to remove a chunk from circulation as
175 * allocation path might be referencing the chunk with only
176 * pcpu_alloc_mutex locked.
177 */ 170 */
178static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ 171static int pcpu_nr_empty_pop_pages;
179static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
180 172
181static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ 173/*
174 * Balance work is used to populate or destroy chunks asynchronously. We
175 * try to keep the number of populated free pages between
176 * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one
177 * empty chunk.
178 */
179static void pcpu_balance_workfn(struct work_struct *work);
180static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);
181static bool pcpu_async_enabled __read_mostly;
182static bool pcpu_atomic_alloc_failed;
182 183
183/* reclaim work to release fully free chunks, scheduled from free path */ 184static void pcpu_schedule_balance_work(void)
184static void pcpu_reclaim(struct work_struct *work); 185{
185static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); 186 if (pcpu_async_enabled)
187 schedule_work(&pcpu_balance_work);
188}
186 189
187static bool pcpu_addr_in_first_chunk(void *addr) 190static bool pcpu_addr_in_first_chunk(void *addr)
188{ 191{
@@ -315,6 +318,38 @@ static void pcpu_mem_free(void *ptr, size_t size)
315} 318}
316 319
317/** 320/**
321 * pcpu_count_occupied_pages - count the number of pages an area occupies
322 * @chunk: chunk of interest
323 * @i: index of the area in question
324 *
325 * Count the number of pages chunk's @i'th area occupies. When the area's
326 * start and/or end address isn't aligned to page boundary, the straddled
327 * page is included in the count iff the rest of the page is free.
328 */
329static int pcpu_count_occupied_pages(struct pcpu_chunk *chunk, int i)
330{
331 int off = chunk->map[i] & ~1;
332 int end = chunk->map[i + 1] & ~1;
333
334 if (!PAGE_ALIGNED(off) && i > 0) {
335 int prev = chunk->map[i - 1];
336
337 if (!(prev & 1) && prev <= round_down(off, PAGE_SIZE))
338 off = round_down(off, PAGE_SIZE);
339 }
340
341 if (!PAGE_ALIGNED(end) && i + 1 < chunk->map_used) {
342 int next = chunk->map[i + 1];
343 int nend = chunk->map[i + 2] & ~1;
344
345 if (!(next & 1) && nend >= round_up(end, PAGE_SIZE))
346 end = round_up(end, PAGE_SIZE);
347 }
348
349 return max_t(int, PFN_DOWN(end) - PFN_UP(off), 0);
350}
351
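
A concrete example, assuming 4K pages: an area spanning [100, 5000) fully covers no page on its own (PFN_UP(100) = 1, PFN_DOWN(5000) = 1, so the count is 0). If the preceding area is free and starts at or before the page boundary below 100 (offset 0), the leading page is counted: off rounds down to 0 and the result becomes 1. If the following area is also free and extends to at least 8192, the trailing page is counted as well: end rounds up to 8192 and the result becomes 2.
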
352/**
318 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot 353 * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
319 * @chunk: chunk of interest 354 * @chunk: chunk of interest
320 * @oslot: the previous slot it was on 355 * @oslot: the previous slot it was on
@@ -342,9 +377,14 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
342/** 377/**
343 * pcpu_need_to_extend - determine whether chunk area map needs to be extended 378 * pcpu_need_to_extend - determine whether chunk area map needs to be extended
344 * @chunk: chunk of interest 379 * @chunk: chunk of interest
380 * @is_atomic: the allocation context
345 * 381 *
346 * Determine whether area map of @chunk needs to be extended to 382 * Determine whether area map of @chunk needs to be extended. If
347 * accommodate a new allocation. 383 * @is_atomic, only the amount necessary for a new allocation is
384 * considered; however, async extension is scheduled if the left amount is
385 * low. If !@is_atomic, it aims for more empty space. Combined, this
386 * ensures that the map is likely to have enough available space to
387 * accomodate atomic allocations which can't extend maps directly.
 348 * 388 * accommodate atomic allocations which can't extend maps directly.
349 * CONTEXT: 389 * CONTEXT:
350 * pcpu_lock. 390 * pcpu_lock.
@@ -353,15 +393,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
353 * New target map allocation length if extension is necessary, 0 393 * New target map allocation length if extension is necessary, 0
354 * otherwise. 394 * otherwise.
355 */ 395 */
356static int pcpu_need_to_extend(struct pcpu_chunk *chunk) 396static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic)
357{ 397{
358 int new_alloc; 398 int margin, new_alloc;
399
400 if (is_atomic) {
401 margin = 3;
402
403 if (chunk->map_alloc <
404 chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW &&
405 pcpu_async_enabled)
406 schedule_work(&chunk->map_extend_work);
407 } else {
408 margin = PCPU_ATOMIC_MAP_MARGIN_HIGH;
409 }
359 410
360 if (chunk->map_alloc >= chunk->map_used + 3) 411 if (chunk->map_alloc >= chunk->map_used + margin)
361 return 0; 412 return 0;
362 413
363 new_alloc = PCPU_DFL_MAP_ALLOC; 414 new_alloc = PCPU_DFL_MAP_ALLOC;
364 while (new_alloc < chunk->map_used + 3) 415 while (new_alloc < chunk->map_used + margin)
365 new_alloc *= 2; 416 new_alloc *= 2;
366 417
367 return new_alloc; 418 return new_alloc;
@@ -418,11 +469,76 @@ out_unlock:
418 return 0; 469 return 0;
419} 470}
420 471
472static void pcpu_map_extend_workfn(struct work_struct *work)
473{
474 struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk,
475 map_extend_work);
476 int new_alloc;
477
478 spin_lock_irq(&pcpu_lock);
479 new_alloc = pcpu_need_to_extend(chunk, false);
480 spin_unlock_irq(&pcpu_lock);
481
482 if (new_alloc)
483 pcpu_extend_area_map(chunk, new_alloc);
484}
485
486/**
487 * pcpu_fit_in_area - try to fit the requested allocation in a candidate area
488 * @chunk: chunk the candidate area belongs to
489 * @off: the offset to the start of the candidate area
490 * @this_size: the size of the candidate area
491 * @size: the size of the target allocation
492 * @align: the alignment of the target allocation
493 * @pop_only: only allocate from already populated region
494 *
495 * We're trying to allocate @size bytes aligned at @align. @chunk's area
496 * at @off sized @this_size is a candidate. This function determines
497 * whether the target allocation fits in the candidate area and returns the
498 * number of bytes to pad after @off. If the target area doesn't fit, -1
499 * is returned.
500 *
501 * If @pop_only is %true, this function only considers the already
502 * populated part of the candidate area.
503 */
504static int pcpu_fit_in_area(struct pcpu_chunk *chunk, int off, int this_size,
505 int size, int align, bool pop_only)
506{
507 int cand_off = off;
508
509 while (true) {
510 int head = ALIGN(cand_off, align) - off;
511 int page_start, page_end, rs, re;
512
513 if (this_size < head + size)
514 return -1;
515
516 if (!pop_only)
517 return head;
518
519 /*
520 * If the first unpopulated page is beyond the end of the
521 * allocation, the whole allocation is populated;
522 * otherwise, retry from the end of the unpopulated area.
523 */
524 page_start = PFN_DOWN(head + off);
525 page_end = PFN_UP(head + off + size);
526
527 rs = page_start;
528 pcpu_next_unpop(chunk, &rs, &re, PFN_UP(off + this_size));
529 if (rs >= page_end)
530 return head;
531 cand_off = re * PAGE_SIZE;
532 }
533}
534
421/** 535/**
422 * pcpu_alloc_area - allocate area from a pcpu_chunk 536 * pcpu_alloc_area - allocate area from a pcpu_chunk
423 * @chunk: chunk of interest 537 * @chunk: chunk of interest
424 * @size: wanted size in bytes 538 * @size: wanted size in bytes
425 * @align: wanted align 539 * @align: wanted align
540 * @pop_only: allocate only from the populated area
541 * @occ_pages_p: out param for the number of pages the area occupies
426 * 542 *
427 * Try to allocate @size bytes area aligned at @align from @chunk. 543 * Try to allocate @size bytes area aligned at @align from @chunk.
428 * Note that this function only allocates the offset. It doesn't 544 * Note that this function only allocates the offset. It doesn't
@@ -437,7 +553,8 @@ out_unlock:
437 * Allocated offset in @chunk on success, -1 if no matching area is 553 * Allocated offset in @chunk on success, -1 if no matching area is
438 * found. 554 * found.
439 */ 555 */
440static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align) 556static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
557 bool pop_only, int *occ_pages_p)
441{ 558{
442 int oslot = pcpu_chunk_slot(chunk); 559 int oslot = pcpu_chunk_slot(chunk);
443 int max_contig = 0; 560 int max_contig = 0;
@@ -453,11 +570,11 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
453 if (off & 1) 570 if (off & 1)
454 continue; 571 continue;
455 572
456 /* extra for alignment requirement */
457 head = ALIGN(off, align) - off;
458
459 this_size = (p[1] & ~1) - off; 573 this_size = (p[1] & ~1) - off;
460 if (this_size < head + size) { 574
575 head = pcpu_fit_in_area(chunk, off, this_size, size, align,
576 pop_only);
577 if (head < 0) {
461 if (!seen_free) { 578 if (!seen_free) {
462 chunk->first_free = i; 579 chunk->first_free = i;
463 seen_free = true; 580 seen_free = true;
@@ -526,6 +643,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
526 chunk->free_size -= size; 643 chunk->free_size -= size;
527 *p |= 1; 644 *p |= 1;
528 645
646 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
529 pcpu_chunk_relocate(chunk, oslot); 647 pcpu_chunk_relocate(chunk, oslot);
530 return off; 648 return off;
531 } 649 }
@@ -541,6 +659,7 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
541 * pcpu_free_area - free area to a pcpu_chunk 659 * pcpu_free_area - free area to a pcpu_chunk
542 * @chunk: chunk of interest 660 * @chunk: chunk of interest
543 * @freeme: offset of area to free 661 * @freeme: offset of area to free
662 * @occ_pages_p: out param for the number of pages the area occupies
544 * 663 *
545 * Free area starting from @freeme to @chunk. Note that this function 664 * Free area starting from @freeme to @chunk. Note that this function
546 * only modifies the allocation map. It doesn't depopulate or unmap 665 * only modifies the allocation map. It doesn't depopulate or unmap
@@ -549,7 +668,8 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
549 * CONTEXT: 668 * CONTEXT:
550 * pcpu_lock. 669 * pcpu_lock.
551 */ 670 */
552static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) 671static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
672 int *occ_pages_p)
553{ 673{
554 int oslot = pcpu_chunk_slot(chunk); 674 int oslot = pcpu_chunk_slot(chunk);
555 int off = 0; 675 int off = 0;
@@ -580,6 +700,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
580 *p = off &= ~1; 700 *p = off &= ~1;
581 chunk->free_size += (p[1] & ~1) - off; 701 chunk->free_size += (p[1] & ~1) - off;
582 702
703 *occ_pages_p = pcpu_count_occupied_pages(chunk, i);
704
583 /* merge with next? */ 705 /* merge with next? */
584 if (!(p[1] & 1)) 706 if (!(p[1] & 1))
585 to_free++; 707 to_free++;
@@ -620,6 +742,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
620 chunk->map_used = 1; 742 chunk->map_used = 1;
621 743
622 INIT_LIST_HEAD(&chunk->list); 744 INIT_LIST_HEAD(&chunk->list);
745 INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn);
623 chunk->free_size = pcpu_unit_size; 746 chunk->free_size = pcpu_unit_size;
624 chunk->contig_hint = pcpu_unit_size; 747 chunk->contig_hint = pcpu_unit_size;
625 748
@@ -634,6 +757,50 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
634 pcpu_mem_free(chunk, pcpu_chunk_struct_size); 757 pcpu_mem_free(chunk, pcpu_chunk_struct_size);
635} 758}
636 759
760/**
761 * pcpu_chunk_populated - post-population bookkeeping
762 * @chunk: pcpu_chunk which got populated
763 * @page_start: the start page
764 * @page_end: the end page
765 *
766 * Pages in [@page_start,@page_end) have been populated to @chunk. Update
767 * the bookkeeping information accordingly. Must be called after each
768 * successful population.
769 */
770static void pcpu_chunk_populated(struct pcpu_chunk *chunk,
771 int page_start, int page_end)
772{
773 int nr = page_end - page_start;
774
775 lockdep_assert_held(&pcpu_lock);
776
777 bitmap_set(chunk->populated, page_start, nr);
778 chunk->nr_populated += nr;
779 pcpu_nr_empty_pop_pages += nr;
780}
781
782/**
783 * pcpu_chunk_depopulated - post-depopulation bookkeeping
784 * @chunk: pcpu_chunk which got depopulated
785 * @page_start: the start page
786 * @page_end: the end page
787 *
788 * Pages in [@page_start,@page_end) have been depopulated from @chunk.
789 * Update the bookkeeping information accordingly. Must be called after
790 * each successful depopulation.
791 */
792static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk,
793 int page_start, int page_end)
794{
795 int nr = page_end - page_start;
796
797 lockdep_assert_held(&pcpu_lock);
798
799 bitmap_clear(chunk->populated, page_start, nr);
800 chunk->nr_populated -= nr;
801 pcpu_nr_empty_pop_pages -= nr;
802}
803
637/* 804/*
638 * Chunk management implementation. 805 * Chunk management implementation.
639 * 806 *
@@ -695,21 +862,23 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
695 * @size: size of area to allocate in bytes 862 * @size: size of area to allocate in bytes
696 * @align: alignment of area (max PAGE_SIZE) 863 * @align: alignment of area (max PAGE_SIZE)
697 * @reserved: allocate from the reserved chunk if available 864 * @reserved: allocate from the reserved chunk if available
865 * @gfp: allocation flags
698 * 866 *
699 * Allocate percpu area of @size bytes aligned at @align. 867 * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't
700 * 868 * contain %GFP_KERNEL, the allocation is atomic.
701 * CONTEXT:
702 * Does GFP_KERNEL allocation.
703 * 869 *
704 * RETURNS: 870 * RETURNS:
705 * Percpu pointer to the allocated area on success, NULL on failure. 871 * Percpu pointer to the allocated area on success, NULL on failure.
706 */ 872 */
707static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) 873static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
874 gfp_t gfp)
708{ 875{
709 static int warn_limit = 10; 876 static int warn_limit = 10;
710 struct pcpu_chunk *chunk; 877 struct pcpu_chunk *chunk;
711 const char *err; 878 const char *err;
712 int slot, off, new_alloc; 879 bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
880 int occ_pages = 0;
881 int slot, off, new_alloc, cpu, ret;
713 unsigned long flags; 882 unsigned long flags;
714 void __percpu *ptr; 883 void __percpu *ptr;
715 884
@@ -728,7 +897,6 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
728 return NULL; 897 return NULL;
729 } 898 }
730 899
731 mutex_lock(&pcpu_alloc_mutex);
732 spin_lock_irqsave(&pcpu_lock, flags); 900 spin_lock_irqsave(&pcpu_lock, flags);
733 901
734 /* serve reserved allocations from the reserved chunk if available */ 902 /* serve reserved allocations from the reserved chunk if available */
@@ -740,16 +908,18 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved)
740 goto fail_unlock; 908 goto fail_unlock;
741 } 909 }
742 910
743 while ((new_alloc = pcpu_need_to_extend(chunk))) { 911 while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
744 spin_unlock_irqrestore(&pcpu_lock, flags); 912 spin_unlock_irqrestore(&pcpu_lock, flags);
745 if (pcpu_extend_area_map(chunk, new_alloc) < 0) { 913 if (is_atomic ||
914 pcpu_extend_area_map(chunk, new_alloc) < 0) {
746 err = "failed to extend area map of reserved chunk"; 915 err = "failed to extend area map of reserved chunk";
747 goto fail_unlock_mutex; 916 goto fail;
748 } 917 }
749 spin_lock_irqsave(&pcpu_lock, flags); 918 spin_lock_irqsave(&pcpu_lock, flags);
750 } 919 }
751 920
752 off = pcpu_alloc_area(chunk, size, align); 921 off = pcpu_alloc_area(chunk, size, align, is_atomic,
922 &occ_pages);
753 if (off >= 0) 923 if (off >= 0)
754 goto area_found; 924 goto area_found;
755 925
@@ -764,13 +934,15 @@ restart:
764 if (size > chunk->contig_hint) 934 if (size > chunk->contig_hint)
765 continue; 935 continue;
766 936
767 new_alloc = pcpu_need_to_extend(chunk); 937 new_alloc = pcpu_need_to_extend(chunk, is_atomic);
768 if (new_alloc) { 938 if (new_alloc) {
939 if (is_atomic)
940 continue;
769 spin_unlock_irqrestore(&pcpu_lock, flags); 941 spin_unlock_irqrestore(&pcpu_lock, flags);
770 if (pcpu_extend_area_map(chunk, 942 if (pcpu_extend_area_map(chunk,
771 new_alloc) < 0) { 943 new_alloc) < 0) {
772 err = "failed to extend area map"; 944 err = "failed to extend area map";
773 goto fail_unlock_mutex; 945 goto fail;
774 } 946 }
775 spin_lock_irqsave(&pcpu_lock, flags); 947 spin_lock_irqsave(&pcpu_lock, flags);
776 /* 948 /*
@@ -780,74 +952,134 @@ restart:
780 goto restart; 952 goto restart;
781 } 953 }
782 954
783 off = pcpu_alloc_area(chunk, size, align); 955 off = pcpu_alloc_area(chunk, size, align, is_atomic,
956 &occ_pages);
784 if (off >= 0) 957 if (off >= 0)
785 goto area_found; 958 goto area_found;
786 } 959 }
787 } 960 }
788 961
789 /* hmmm... no space left, create a new chunk */
790 spin_unlock_irqrestore(&pcpu_lock, flags); 962 spin_unlock_irqrestore(&pcpu_lock, flags);
791 963
792 chunk = pcpu_create_chunk(); 964 /*
793 if (!chunk) { 965 * No space left. Create a new chunk. We don't want multiple
794 err = "failed to allocate new chunk"; 966 * tasks to create chunks simultaneously. Serialize and create iff
795 goto fail_unlock_mutex; 967 * there's still no empty chunk after grabbing the mutex.
968 */
969 if (is_atomic)
970 goto fail;
971
972 mutex_lock(&pcpu_alloc_mutex);
973
974 if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
975 chunk = pcpu_create_chunk();
976 if (!chunk) {
977 mutex_unlock(&pcpu_alloc_mutex);
978 err = "failed to allocate new chunk";
979 goto fail;
980 }
981
982 spin_lock_irqsave(&pcpu_lock, flags);
983 pcpu_chunk_relocate(chunk, -1);
984 } else {
985 spin_lock_irqsave(&pcpu_lock, flags);
796 } 986 }
797 987
798 spin_lock_irqsave(&pcpu_lock, flags); 988 mutex_unlock(&pcpu_alloc_mutex);
799 pcpu_chunk_relocate(chunk, -1);
800 goto restart; 989 goto restart;
801 990
802area_found: 991area_found:
803 spin_unlock_irqrestore(&pcpu_lock, flags); 992 spin_unlock_irqrestore(&pcpu_lock, flags);
804 993
805 /* populate, map and clear the area */ 994 /* populate if not all pages are already there */
806 if (pcpu_populate_chunk(chunk, off, size)) { 995 if (!is_atomic) {
807 spin_lock_irqsave(&pcpu_lock, flags); 996 int page_start, page_end, rs, re;
808 pcpu_free_area(chunk, off); 997
809 err = "failed to populate"; 998 mutex_lock(&pcpu_alloc_mutex);
810 goto fail_unlock; 999
1000 page_start = PFN_DOWN(off);
1001 page_end = PFN_UP(off + size);
1002
1003 pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
1004 WARN_ON(chunk->immutable);
1005
1006 ret = pcpu_populate_chunk(chunk, rs, re);
1007
1008 spin_lock_irqsave(&pcpu_lock, flags);
1009 if (ret) {
1010 mutex_unlock(&pcpu_alloc_mutex);
1011 pcpu_free_area(chunk, off, &occ_pages);
1012 err = "failed to populate";
1013 goto fail_unlock;
1014 }
1015 pcpu_chunk_populated(chunk, rs, re);
1016 spin_unlock_irqrestore(&pcpu_lock, flags);
1017 }
1018
1019 mutex_unlock(&pcpu_alloc_mutex);
811 } 1020 }
812 1021
813 mutex_unlock(&pcpu_alloc_mutex); 1022 if (chunk != pcpu_reserved_chunk)
1023 pcpu_nr_empty_pop_pages -= occ_pages;
1024
1025 if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW)
1026 pcpu_schedule_balance_work();
1027
1028 /* clear the areas and return address relative to base address */
1029 for_each_possible_cpu(cpu)
1030 memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
814 1031
815 /* return address relative to base address */
816 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); 1032 ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
817 kmemleak_alloc_percpu(ptr, size); 1033 kmemleak_alloc_percpu(ptr, size);
818 return ptr; 1034 return ptr;
819 1035
820fail_unlock: 1036fail_unlock:
821 spin_unlock_irqrestore(&pcpu_lock, flags); 1037 spin_unlock_irqrestore(&pcpu_lock, flags);
822fail_unlock_mutex: 1038fail:
823 mutex_unlock(&pcpu_alloc_mutex); 1039 if (!is_atomic && warn_limit) {
824 if (warn_limit) { 1040 pr_warning("PERCPU: allocation failed, size=%zu align=%zu atomic=%d, %s\n",
825 pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " 1041 size, align, is_atomic, err);
826 "%s\n", size, align, err);
827 dump_stack(); 1042 dump_stack();
828 if (!--warn_limit) 1043 if (!--warn_limit)
829 pr_info("PERCPU: limit reached, disable warning\n"); 1044 pr_info("PERCPU: limit reached, disable warning\n");
830 } 1045 }
1046 if (is_atomic) {
1047 /* see the flag handling in pcpu_balance_workfn() */
1048 pcpu_atomic_alloc_failed = true;
1049 pcpu_schedule_balance_work();
1050 }
831 return NULL; 1051 return NULL;
832} 1052}
833 1053
834/** 1054/**
835 * __alloc_percpu - allocate dynamic percpu area 1055 * __alloc_percpu_gfp - allocate dynamic percpu area
836 * @size: size of area to allocate in bytes 1056 * @size: size of area to allocate in bytes
837 * @align: alignment of area (max PAGE_SIZE) 1057 * @align: alignment of area (max PAGE_SIZE)
1058 * @gfp: allocation flags
838 * 1059 *
839 * Allocate zero-filled percpu area of @size bytes aligned at @align. 1060 * Allocate zero-filled percpu area of @size bytes aligned at @align. If
840 * Might sleep. Might trigger writeouts. 1061 * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can
841 * 1062 * be called from any context but is a lot more likely to fail.
842 * CONTEXT:
843 * Does GFP_KERNEL allocation.
844 * 1063 *
845 * RETURNS: 1064 * RETURNS:
846 * Percpu pointer to the allocated area on success, NULL on failure. 1065 * Percpu pointer to the allocated area on success, NULL on failure.
847 */ 1066 */
1067void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp)
1068{
1069 return pcpu_alloc(size, align, false, gfp);
1070}
1071EXPORT_SYMBOL_GPL(__alloc_percpu_gfp);
1072
1073/**
1074 * __alloc_percpu - allocate dynamic percpu area
1075 * @size: size of area to allocate in bytes
1076 * @align: alignment of area (max PAGE_SIZE)
1077 *
1078 * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL).
1079 */
848void __percpu *__alloc_percpu(size_t size, size_t align) 1080void __percpu *__alloc_percpu(size_t size, size_t align)
849{ 1081{
850 return pcpu_alloc(size, align, false); 1082 return pcpu_alloc(size, align, false, GFP_KERNEL);
851} 1083}
852EXPORT_SYMBOL_GPL(__alloc_percpu); 1084EXPORT_SYMBOL_GPL(__alloc_percpu);
853 1085
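
A hedged sketch of the new entry point from a caller's perspective; pkt_stats is illustrative, and the typed alloc_percpu_gfp() wrapper is the same one the percpu_counter change above uses via alloc_percpu_gfp(s32, gfp):

        struct pkt_stats {
                u64     rx;
                u64     tx;
        };

        static struct pkt_stats __percpu *alloc_stats(gfp_t gfp)
        {
                /*
                 * GFP_KERNEL may sleep as before; GFP_NOWAIT never blocks and
                 * is served from the pre-populated reserve, so it fails sooner
                 * under pressure.
                 */
                return __alloc_percpu_gfp(sizeof(struct pkt_stats),
                                          __alignof__(struct pkt_stats), gfp);
        }

Freeing is unchanged: free_percpu() handles allocations from either path.
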
@@ -869,44 +1101,121 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
869 */ 1101 */
870void __percpu *__alloc_reserved_percpu(size_t size, size_t align) 1102void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
871{ 1103{
872 return pcpu_alloc(size, align, true); 1104 return pcpu_alloc(size, align, true, GFP_KERNEL);
873} 1105}
874 1106
875/** 1107/**
876 * pcpu_reclaim - reclaim fully free chunks, workqueue function 1108 * pcpu_balance_workfn - manage the amount of free chunks and populated pages
877 * @work: unused 1109 * @work: unused
878 * 1110 *
 879 * Reclaim all fully free chunks except for the first one. 1111 * Reclaim all fully free chunks except for the first one and keep enough pages populated for atomic allocations.
880 *
881 * CONTEXT:
882 * workqueue context.
883 */ 1112 */
884static void pcpu_reclaim(struct work_struct *work) 1113static void pcpu_balance_workfn(struct work_struct *work)
885{ 1114{
886 LIST_HEAD(todo); 1115 LIST_HEAD(to_free);
887 struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1]; 1116 struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
888 struct pcpu_chunk *chunk, *next; 1117 struct pcpu_chunk *chunk, *next;
1118 int slot, nr_to_pop, ret;
889 1119
1120 /*
1121 * There's no reason to keep around multiple unused chunks and VM
1122 * areas can be scarce. Destroy all free chunks except for one.
1123 */
890 mutex_lock(&pcpu_alloc_mutex); 1124 mutex_lock(&pcpu_alloc_mutex);
891 spin_lock_irq(&pcpu_lock); 1125 spin_lock_irq(&pcpu_lock);
892 1126
893 list_for_each_entry_safe(chunk, next, head, list) { 1127 list_for_each_entry_safe(chunk, next, free_head, list) {
894 WARN_ON(chunk->immutable); 1128 WARN_ON(chunk->immutable);
895 1129
896 /* spare the first one */ 1130 /* spare the first one */
897 if (chunk == list_first_entry(head, struct pcpu_chunk, list)) 1131 if (chunk == list_first_entry(free_head, struct pcpu_chunk, list))
898 continue; 1132 continue;
899 1133
900 list_move(&chunk->list, &todo); 1134 list_move(&chunk->list, &to_free);
901 } 1135 }
902 1136
903 spin_unlock_irq(&pcpu_lock); 1137 spin_unlock_irq(&pcpu_lock);
904 1138
905 list_for_each_entry_safe(chunk, next, &todo, list) { 1139 list_for_each_entry_safe(chunk, next, &to_free, list) {
906 pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); 1140 int rs, re;
1141
1142 pcpu_for_each_pop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1143 pcpu_depopulate_chunk(chunk, rs, re);
1144 spin_lock_irq(&pcpu_lock);
1145 pcpu_chunk_depopulated(chunk, rs, re);
1146 spin_unlock_irq(&pcpu_lock);
1147 }
907 pcpu_destroy_chunk(chunk); 1148 pcpu_destroy_chunk(chunk);
908 } 1149 }
909 1150
1151 /*
1152 * Ensure there are certain number of free populated pages for
1153 * atomic allocs. Fill up from the most packed so that atomic
1154 * allocs don't increase fragmentation. If atomic allocation
1155 * failed previously, always populate the maximum amount. This
1156 * should prevent atomic allocs larger than PAGE_SIZE from keeping
1157 * failing indefinitely; however, large atomic allocs are not
1158 * something we support properly and can be highly unreliable and
1159 * inefficient.
1160 */
1161retry_pop:
1162 if (pcpu_atomic_alloc_failed) {
1163 nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH;
1164 /* best effort anyway, don't worry about synchronization */
1165 pcpu_atomic_alloc_failed = false;
1166 } else {
1167 nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH -
1168 pcpu_nr_empty_pop_pages,
1169 0, PCPU_EMPTY_POP_PAGES_HIGH);
1170 }
1171
1172 for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) {
1173 int nr_unpop = 0, rs, re;
1174
1175 if (!nr_to_pop)
1176 break;
1177
1178 spin_lock_irq(&pcpu_lock);
1179 list_for_each_entry(chunk, &pcpu_slot[slot], list) {
1180 nr_unpop = pcpu_unit_pages - chunk->nr_populated;
1181 if (nr_unpop)
1182 break;
1183 }
1184 spin_unlock_irq(&pcpu_lock);
1185
1186 if (!nr_unpop)
1187 continue;
1188
1189 /* @chunk can't go away while pcpu_alloc_mutex is held */
1190 pcpu_for_each_unpop_region(chunk, rs, re, 0, pcpu_unit_pages) {
1191 int nr = min(re - rs, nr_to_pop);
1192
1193 ret = pcpu_populate_chunk(chunk, rs, rs + nr);
1194 if (!ret) {
1195 nr_to_pop -= nr;
1196 spin_lock_irq(&pcpu_lock);
1197 pcpu_chunk_populated(chunk, rs, rs + nr);
1198 spin_unlock_irq(&pcpu_lock);
1199 } else {
1200 nr_to_pop = 0;
1201 }
1202
1203 if (!nr_to_pop)
1204 break;
1205 }
1206 }
1207
1208 if (nr_to_pop) {
1209 /* ran out of chunks to populate, create a new one and retry */
1210 chunk = pcpu_create_chunk();
1211 if (chunk) {
1212 spin_lock_irq(&pcpu_lock);
1213 pcpu_chunk_relocate(chunk, -1);
1214 spin_unlock_irq(&pcpu_lock);
1215 goto retry_pop;
1216 }
1217 }
1218
910 mutex_unlock(&pcpu_alloc_mutex); 1219 mutex_unlock(&pcpu_alloc_mutex);
911} 1220}
912 1221
@@ -924,7 +1233,7 @@ void free_percpu(void __percpu *ptr)
924 void *addr; 1233 void *addr;
925 struct pcpu_chunk *chunk; 1234 struct pcpu_chunk *chunk;
926 unsigned long flags; 1235 unsigned long flags;
927 int off; 1236 int off, occ_pages;
928 1237
929 if (!ptr) 1238 if (!ptr)
930 return; 1239 return;
@@ -938,7 +1247,10 @@ void free_percpu(void __percpu *ptr)
938 chunk = pcpu_chunk_addr_search(addr); 1247 chunk = pcpu_chunk_addr_search(addr);
939 off = addr - chunk->base_addr; 1248 off = addr - chunk->base_addr;
940 1249
941 pcpu_free_area(chunk, off); 1250 pcpu_free_area(chunk, off, &occ_pages);
1251
1252 if (chunk != pcpu_reserved_chunk)
1253 pcpu_nr_empty_pop_pages += occ_pages;
942 1254
943 /* if there are more than one fully free chunks, wake up grim reaper */ 1255 /* if there are more than one fully free chunks, wake up grim reaper */
944 if (chunk->free_size == pcpu_unit_size) { 1256 if (chunk->free_size == pcpu_unit_size) {
@@ -946,7 +1258,7 @@ void free_percpu(void __percpu *ptr)
 
 		list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
 			if (pos != chunk) {
-				schedule_work(&pcpu_reclaim_work);
+				pcpu_schedule_balance_work();
 				break;
 			}
 	}
@@ -1336,11 +1648,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	 */
 	schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
 	INIT_LIST_HEAD(&schunk->list);
+	INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn);
 	schunk->base_addr = base_addr;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->immutable = true;
 	bitmap_fill(schunk->populated, pcpu_unit_pages);
+	schunk->nr_populated = pcpu_unit_pages;
 
 	if (ai->reserved_size) {
 		schunk->free_size = ai->reserved_size;
@@ -1364,11 +1678,13 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	if (dyn_size) {
 		dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
 		INIT_LIST_HEAD(&dchunk->list);
+		INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn);
 		dchunk->base_addr = base_addr;
 		dchunk->map = dmap;
 		dchunk->map_alloc = ARRAY_SIZE(dmap);
 		dchunk->immutable = true;
 		bitmap_fill(dchunk->populated, pcpu_unit_pages);
+		dchunk->nr_populated = pcpu_unit_pages;
 
 		dchunk->contig_hint = dchunk->free_size = dyn_size;
 		dchunk->map[0] = 1;
@@ -1379,6 +1695,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 
 	/* link the first chunk in */
 	pcpu_first_chunk = dchunk ?: schunk;
+	pcpu_nr_empty_pop_pages +=
+		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
 	/* we're done */
@@ -1932,8 +2250,6 @@ void __init setup_per_cpu_areas(void)
 
 	if (pcpu_setup_first_chunk(ai, fc) < 0)
 		panic("Failed to initialize percpu areas.");
-
-	pcpu_free_alloc_info(ai);
 }
 
 #endif	/* CONFIG_SMP */
@@ -1967,3 +2283,15 @@ void __init percpu_init_late(void)
 		spin_unlock_irqrestore(&pcpu_lock, flags);
 	}
 }
+
+/*
+ * The percpu allocator is initialized early during boot, when neither slab
+ * nor workqueues are available.  Plug async management until everything is
+ * up and running.
+ */
+static int __init percpu_enable_async(void)
+{
+	pcpu_async_enabled = true;
+	return 0;
+}
+subsys_initcall(percpu_enable_async);
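To make the boot ordering concrete, here is a minimal sketch of the gating this initcall unblocks. The helper body below is an assumption (the real pcpu_schedule_balance_work() and pcpu_balance_work definitions appear earlier in the patch and may differ); the point is simply that nothing is deferred to a workqueue until pcpu_async_enabled has been set by the subsys_initcall above:

	#include <linux/workqueue.h>

	static bool pcpu_async_enabled;		/* set true by percpu_enable_async() */

	/* stub for illustration; the real balance callback lives in mm/percpu.c */
	static void pcpu_balance_workfn(struct work_struct *work)
	{
	}
	static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn);

	/* assumed shape of the helper called from free_percpu() above */
	static void pcpu_schedule_balance_work(void)
	{
		if (pcpu_async_enabled)
			schedule_work(&pcpu_balance_work);
	}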
diff --git a/mm/shmem.c b/mm/shmem.c
index 4fad61bb41e5..cd6fc7590e54 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2995,7 +2995,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
 #endif
 
 	spin_lock_init(&sbinfo->stat_lock);
-	if (percpu_counter_init(&sbinfo->used_blocks, 0))
+	if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
 		goto failed;
 	sbinfo->free_inodes = sbinfo->max_inodes;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 97b0fcc79547..5ab6627cf370 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1115,7 +1115,7 @@ static int __init dccp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
 		     FIELD_SIZEOF(struct sk_buff, cb));
-	rc = percpu_counter_init(&dccp_orphan_count, 0);
+	rc = percpu_counter_init(&dccp_orphan_count, 0, GFP_KERNEL);
 	if (rc)
 		goto out_fail;
 	rc = -ENOBUFS;
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 461003d258ba..86023b9be47f 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3071,8 +3071,8 @@ void __init tcp_init(void)
 
 	BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb));
 
-	percpu_counter_init(&tcp_sockets_allocated, 0);
-	percpu_counter_init(&tcp_orphan_count, 0);
+	percpu_counter_init(&tcp_sockets_allocated, 0, GFP_KERNEL);
+	percpu_counter_init(&tcp_orphan_count, 0, GFP_KERNEL);
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
index 3af522622fad..1d191357bf88 100644
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -32,7 +32,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 		res_parent = &parent_cg->memory_allocated;
 
 	res_counter_init(&cg_proto->memory_allocated, res_parent);
-	percpu_counter_init(&cg_proto->sockets_allocated, 0);
+	percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
 
 	return 0;
 }
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 9d2c6c9facb6..8f34b27d5775 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1341,7 +1341,7 @@ static __init int sctp_init(void)
 	if (!sctp_chunk_cachep)
 		goto err_chunk_cachep;
 
-	status = percpu_counter_init(&sctp_sockets_allocated, 0);
+	status = percpu_counter_init(&sctp_sockets_allocated, 0, GFP_KERNEL);
 	if (status)
 		goto err_percpu_counter_init;
 
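The shmem, dccp, tcp, tcp_memcontrol and sctp hunks above are all the same mechanical conversion: percpu_counter_init() now takes a gfp argument that is used for the counter's percpu allocation, and these process-context callers pass GFP_KERNEL. A minimal, hypothetical caller (the demo_* names are invented here) following the same pattern:

	#include <linux/gfp.h>
	#include <linux/init.h>
	#include <linux/percpu_counter.h>

	static struct percpu_counter demo_counter;	/* hypothetical example counter */

	static int __init demo_counter_setup(void)
	{
		int err;

		/* process context, so GFP_KERNEL as in the conversions above */
		err = percpu_counter_init(&demo_counter, 0, GFP_KERNEL);
		if (err)
			return err;

		percpu_counter_inc(&demo_counter);
		return 0;
	}

On teardown, a matching percpu_counter_destroy() frees the percpu storage.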