author     Dan Williams <dan.j.williams@intel.com>  2018-04-09 13:50:17 -0400
committer  Dan Williams <dan.j.williams@intel.com>  2018-04-09 13:50:17 -0400
commit     e13e75b86ef2f88e3a47d672dd4c52a293efb95b
tree       2617aebd952d1aec09d323f6b2484b93f659e753
parent     1ed41b5696ccc3ff40a1dee39fe14eff273faf82
parent     976431b02c2ef92ae3f8b6a7d699fc554025e118
Merge branch 'for-4.17/dax' into libnvdimm-for-next
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt |  11
-rw-r--r--  drivers/dax/Kconfig |    5
-rw-r--r--  drivers/dax/super.c |   15
-rw-r--r--  drivers/md/Kconfig |    2
-rw-r--r--  drivers/md/dm-linear.c |    6
-rw-r--r--  drivers/md/dm-log-writes.c |   95
-rw-r--r--  drivers/md/dm-stripe.c |    6
-rw-r--r--  drivers/md/dm.c |   10
-rw-r--r--  drivers/nvdimm/Kconfig |    2
-rw-r--r--  drivers/s390/block/Kconfig |    2
-rw-r--r--  fs/block_dev.c |    5
-rw-r--r--  fs/dax.c |  146
-rw-r--r--  fs/ext2/ext2.h |    1
-rw-r--r--  fs/ext2/inode.c |   46
-rw-r--r--  fs/ext2/namei.c |   18
-rw-r--r--  fs/ext4/inode.c |   42
-rw-r--r--  fs/libfs.c |   39
-rw-r--r--  fs/xfs/xfs_aops.c |   34
-rw-r--r--  fs/xfs/xfs_aops.h |    1
-rw-r--r--  fs/xfs/xfs_iops.c |    5
-rw-r--r--  include/linux/dax.h |   42
-rw-r--r--  include/linux/fs.h |    4
-rw-r--r--  include/linux/sched/deadline.h |    6
-rw-r--r--  include/linux/sched/isolation.h |    1
-rw-r--r--  include/linux/sched/nohz.h |    4
-rw-r--r--  include/linux/tick.h |    4
-rw-r--r--  include/linux/wait_bit.h |   70
-rw-r--r--  kernel/sched/Makefile |    5
-rw-r--r--  kernel/sched/autogroup.c |   21
-rw-r--r--  kernel/sched/autogroup.h |   12
-rw-r--r--  kernel/sched/clock.c |   36
-rw-r--r--  kernel/sched/completion.c |    5
-rw-r--r--  kernel/sched/core.c |  165
-rw-r--r--  kernel/sched/cpuacct.c |   33
-rw-r--r--  kernel/sched/cpudeadline.c |   23
-rw-r--r--  kernel/sched/cpudeadline.h |   29
-rw-r--r--  kernel/sched/cpufreq.c |    1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c |  137
-rw-r--r--  kernel/sched/cpupri.c |   15
-rw-r--r--  kernel/sched/cpupri.h |   25
-rw-r--r--  kernel/sched/cputime.c |   58
-rw-r--r--  kernel/sched/deadline.c |   78
-rw-r--r--  kernel/sched/debug.c |   99
-rw-r--r--  kernel/sched/fair.c |  299
-rw-r--r--  kernel/sched/idle.c |  142
-rw-r--r--  kernel/sched/idle_task.c |  110
-rw-r--r--  kernel/sched/isolation.c |   14
-rw-r--r--  kernel/sched/loadavg.c |   34
-rw-r--r--  kernel/sched/membarrier.c |   27
-rw-r--r--  kernel/sched/rt.c |   51
-rw-r--r--  kernel/sched/sched.h |  623
-rw-r--r--  kernel/sched/stats.c |   20
-rw-r--r--  kernel/sched/stats.h |   86
-rw-r--r--  kernel/sched/stop_task.c |   11
-rw-r--r--  kernel/sched/swait.c |    6
-rw-r--r--  kernel/sched/topology.c |   46
-rw-r--r--  kernel/sched/wait.c |   13
-rw-r--r--  kernel/sched/wait_bit.c |   71
-rw-r--r--  kernel/time/tick-sched.c |   15
-rw-r--r--  kernel/workqueue.c |    3
60 files changed, 1637 insertions, 1298 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..50b9837e985b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@
 
 	nohz
 		Disable the tick when a single task runs.
+
+		A residual 1Hz tick is offloaded to workqueues, which you
+		need to affine to housekeeping through the global
+		workqueue's affinity configured via the
+		/sys/devices/virtual/workqueue/cpumask sysfs file, or
+		by using the 'domain' flag described below.
+
+		NOTE: by default the global workqueue runs on all CPUs,
+		so to protect individual CPUs the 'cpumask' file has to
+		be configured manually after bootup.
+
 	domain
 		Isolate from the general SMP balancing and scheduling
 		algorithms. Note that performing domain isolation this way
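
For illustration only (not part of this patch): "configured manually after bootup" amounts to writing a housekeeping-CPU mask into the sysfs file named above. A minimal userspace sketch, assuming CPUs 0-1 are the housekeeping CPUs (mask value "3" is just an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* keep unbound/global workqueue work on CPUs 0-1 (example mask "3") */
            int fd = open("/sys/devices/virtual/workqueue/cpumask", O_WRONLY);

            if (fd < 0) {
                    perror("open cpumask");
                    return 1;
            }
            if (write(fd, "3\n", 2) != 2)
                    perror("write cpumask");
            close(fd);
            return 0;
    }
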
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index b79aa8f7a497..e0700bf4893a 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,3 +1,7 @@
+config DAX_DRIVER
+	select DAX
+	bool
+
 menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
 	select SRCU
@@ -16,7 +20,6 @@ config DEV_DAX
 	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
 	depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index ecdc292aa4e4..2b2332b605e4 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -124,10 +124,19 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 		return len < 0 ? len : -EIO;
 	}
 
-	if ((IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn))
-			|| pfn_t_devmap(pfn))
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
+		/*
+		 * An arch that has enabled the pmem api should also
+		 * have its drivers support pfn_t_devmap()
+		 *
+		 * This is a developer warning and should not trigger in
+		 * production. dax_flush() will crash since it depends
+		 * on being able to do (page_address(pfn_to_page())).
+		 */
+		WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+	} else if (pfn_t_devmap(pfn)) {
 		/* pass */;
-	else {
+	} else {
 		pr_debug("VFS (%s): error: dax support not enabled\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2c8ac3688815..edff083f7c4e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	select BLK_DEV_DM_BUILTIN
-	select DAX
+	depends on DAX || DAX=n
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors. Various
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index d5f8eff7c11d..89443e0ededa 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define linear_dax_direct_access NULL
+#define linear_dax_copy_from_iter NULL
+#endif
+
 static struct target_type linear_target = {
 	.name = "linear",
 	.version = {1, 4, 0},
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 3362d866793b..7fcb4216973f 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -610,51 +610,6 @@ static int log_mark(struct log_writes_c *lc, char *data)
 	return 0;
 }
 
-static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
-		   struct iov_iter *i)
-{
-	struct pending_block *block;
-
-	if (!bytes)
-		return 0;
-
-	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
-	if (!block) {
-		DMERR("Error allocating dax pending block");
-		return -ENOMEM;
-	}
-
-	block->data = kzalloc(bytes, GFP_KERNEL);
-	if (!block->data) {
-		DMERR("Error allocating dax data space");
-		kfree(block);
-		return -ENOMEM;
-	}
-
-	/* write data provided via the iterator */
-	if (!copy_from_iter(block->data, bytes, i)) {
-		DMERR("Error copying dax data");
-		kfree(block->data);
-		kfree(block);
-		return -EIO;
-	}
-
-	/* rewind the iterator so that the block driver can use it */
-	iov_iter_revert(i, bytes);
-
-	block->datalen = bytes;
-	block->sector = bio_to_dev_sectors(lc, sector);
-	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
-
-	atomic_inc(&lc->pending_blocks);
-	spin_lock_irq(&lc->blocks_lock);
-	list_add_tail(&block->list, &lc->unflushed_blocks);
-	spin_unlock_irq(&lc->blocks_lock);
-	wake_up_process(lc->log_kthread);
-
-	return 0;
-}
-
 static void log_writes_dtr(struct dm_target *ti)
 {
 	struct log_writes_c *lc = ti->private;
@@ -920,6 +875,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
 	limits->io_min = limits->physical_block_size;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
+static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
+		   struct iov_iter *i)
+{
+	struct pending_block *block;
+
+	if (!bytes)
+		return 0;
+
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating dax pending block");
+		return -ENOMEM;
+	}
+
+	block->data = kzalloc(bytes, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error allocating dax data space");
+		kfree(block);
+		return -ENOMEM;
+	}
+
+	/* write data provided via the iterator */
+	if (!copy_from_iter(block->data, bytes, i)) {
+		DMERR("Error copying dax data");
+		kfree(block->data);
+		kfree(block);
+		return -EIO;
+	}
+
+	/* rewind the iterator so that the block driver can use it */
+	iov_iter_revert(i, bytes);
+
+	block->datalen = bytes;
+	block->sector = bio_to_dev_sectors(lc, sector);
+	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
+
+	atomic_inc(&lc->pending_blocks);
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->unflushed_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+
+	return 0;
+}
+
 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 					 long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -956,6 +957,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
 dax_copy:
 	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+#else
+#define log_writes_dax_direct_access NULL
+#define log_writes_dax_copy_from_iter NULL
+#endif
 
 static struct target_type log_writes_target = {
 	.name = "log-writes",
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b5e892149c54..ac2e8ee9d586 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -311,6 +311,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -351,6 +352,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define stripe_dax_direct_access NULL
+#define stripe_dax_copy_from_iter NULL
+#endif
+
 /*
  * Stripe status:
  *
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 45328d8b2859..bac79f40f3cb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1805,7 +1805,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
-	struct dax_device *dax_dev;
+	struct dax_device *dax_dev = NULL;
 	struct mapped_device *md;
 	void *old_md;
 
@@ -1871,9 +1871,11 @@ static struct mapped_device *alloc_dev(int minor)
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
-	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-	if (!dax_dev)
-		goto bad;
+	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
+		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		if (!dax_dev)
+			goto bad;
+	}
 	md->dax_dev = dax_dev;
 
 	add_disk_no_queue_reg(md->disk);
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index f6c533c4d09b..85997184e047 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -20,7 +20,7 @@ if LIBNVDIMM
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
-	select DAX
+	select DAX_DRIVER
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 1444333210c7..9ac7574e3cfb 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -15,8 +15,8 @@ config BLK_DEV_XPRAM
 
 config DCSSBLK
 	def_tristate m
-	select DAX
 	select FS_DAX_LIMITED
+	select DAX_DRIVER
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe09ef9c21f3..846ee2d31781 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1946,11 +1946,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static int blkdev_writepages(struct address_space *mapping,
 			     struct writeback_control *wbc)
 {
-	if (dax_mapping(mapping)) {
-		struct block_device *bdev = I_BDEV(mapping->host);
-
-		return dax_writeback_mapping_range(mapping, bdev, wbc);
-	}
 	return generic_writepages(mapping, wbc);
 }
 
diff --git a/fs/dax.c b/fs/dax.c
index 0276df90e86c..a77394fe586e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table);
 #define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 #define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
-static unsigned long dax_radix_sector(void *entry)
+static unsigned long dax_radix_pfn(void *entry)
 {
 	return (unsigned long)entry >> RADIX_DAX_SHIFT;
 }
 
-static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
 {
 	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-			((unsigned long)sector << RADIX_DAX_SHIFT) |
-			RADIX_DAX_ENTRY_LOCK);
+			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
 }
 
 static unsigned int dax_radix_order(void *entry)
@@ -299,6 +298,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry))
+		return 0;
+	else if (dax_is_empty_entry(entry))
+		return 0;
+	else if (dax_is_pmd_entry(entry))
+		return PMD_SIZE;
+	else
+		return PAGE_SIZE;
+}
+
+static unsigned long dax_radix_end_pfn(void *entry)
+{
+	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+}
+
+/*
+ * Iterate through all mapped pfns represented by an entry, i.e. skip
+ * 'empty' and 'zero' entries.
+ */
+#define for_each_mapped_pfn(entry, pfn) \
+	for (pfn = dax_radix_pfn(entry); \
+			pfn < dax_radix_end_pfn(entry); pfn++)
+
+static void dax_associate_entry(void *entry, struct address_space *mapping)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(page->mapping);
+		page->mapping = mapping;
+	}
+}
+
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+		bool trunc)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		page->mapping = NULL;
+	}
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -405,6 +461,7 @@ restart:
 	}
 
 	if (pmd_downgrade) {
+		dax_disassociate_entry(entry, mapping, false);
 		radix_tree_delete(&mapping->page_tree, index);
 		mapping->nrexceptional--;
 		dax_wake_mapping_entry_waiter(mapping, index, entry,
@@ -454,6 +511,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
 		goto out;
+	dax_disassociate_entry(entry, mapping, trunc);
 	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
 	ret = 1;
@@ -526,12 +584,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector,
+				      void *entry, pfn_t pfn_t,
 				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *new_entry;
+	unsigned long pfn = pfn_t_to_pfn(pfn_t);
 	pgoff_t index = vmf->pgoff;
+	void *new_entry;
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -546,7 +605,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = dax_radix_locked_entry(sector, flags);
+	new_entry = dax_radix_locked_entry(pfn, flags);
+	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+		dax_disassociate_entry(entry, mapping, false);
+		dax_associate_entry(new_entry, mapping);
+	}
 
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
@@ -657,17 +720,14 @@ unlock_pte:
 	i_mmap_unlock_read(mapping);
 }
 
-static int dax_writeback_one(struct block_device *bdev,
-		struct dax_device *dax_dev, struct address_space *mapping,
-		pgoff_t index, void *entry)
+static int dax_writeback_one(struct dax_device *dax_dev,
+		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *entry2, **slot, *kaddr;
-	long ret = 0, id;
-	sector_t sector;
-	pgoff_t pgoff;
+	void *entry2, **slot;
+	unsigned long pfn;
+	long ret = 0;
 	size_t size;
-	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -683,10 +743,10 @@ static int dax_writeback_one(struct block_device *bdev,
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
-	 * compare sectors as we must not bail out due to difference in lockbit
+	 * compare pfns as we must not bail out due to difference in lockbit
 	 * or entry type.
 	 */
-	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
 		goto put_unlocked;
 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 				dax_is_zero_entry(entry))) {
@@ -712,33 +772,15 @@ static int dax_writeback_one(struct block_device *bdev,
 	/*
 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
 	 * in the middle of a PMD, the 'index' we are given will be aligned to
-	 * the start index of the PMD, as will the sector we pull from
-	 * 'entry'. This allows us to flush for PMD_SIZE and not have to
-	 * worry about partial PMD writebacks.
+	 * the start index of the PMD, as will the pfn we pull from 'entry'.
+	 * This allows us to flush for PMD_SIZE and not have to worry about
+	 * partial PMD writebacks.
 	 */
-	sector = dax_radix_sector(entry);
+	pfn = dax_radix_pfn(entry);
 	size = PAGE_SIZE << dax_radix_order(entry);
 
-	id = dax_read_lock();
-	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
-	if (ret)
-		goto dax_unlock;
-
-	/*
-	 * dax_direct_access() may sleep, so cannot hold tree_lock over
-	 * its invocation.
-	 */
-	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
-	if (ret < 0)
-		goto dax_unlock;
-
-	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
-		ret = -EIO;
-		goto dax_unlock;
-	}
-
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
-	dax_flush(dax_dev, kaddr, size);
+	dax_mapping_entry_mkclean(mapping, index, pfn);
+	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -749,8 +791,6 @@ static int dax_writeback_one(struct block_device *bdev,
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- dax_unlock:
-	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index);
 	return ret;
 
@@ -808,8 +848,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 				break;
 			}
 
-			ret = dax_writeback_one(bdev, dax_dev, mapping,
-					indices[i], pvec.pages[i]);
+			ret = dax_writeback_one(dax_dev, mapping, indices[i],
+					pvec.pages[i]);
 			if (ret < 0) {
 				mapping_set_error(mapping, ret);
 				goto out;
@@ -877,6 +917,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	int ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
 	void *entry2;
+	pfn_t pfn;
 
 	zero_page = ZERO_PAGE(0);
 	if (unlikely(!zero_page)) {
@@ -884,14 +925,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 		goto out;
 	}
 
-	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
-	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+	vm_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1200,8 +1242,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-				dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 				0, write && !sync);
 		if (IS_ERR(entry)) {
 			error = PTR_ERR(entry);
@@ -1280,13 +1321,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
+	pfn_t pfn;
 
 	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
@@ -1409,8 +1452,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-				dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 				RADIX_DAX_PMD, write && !sync);
 		if (IS_ERR(entry))
 			goto finish_iomap;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 032295e1d386..cc40802ddfa8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 
 /* inode.c */
+extern void ext2_set_file_ops(struct inode *inode);
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
 extern const struct iomap_ops ext2_iomap_ops;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9b2ac55ac34f..1e01fabef130 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return -EIO;
-
 	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
@@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-#ifdef CONFIG_FS_DAX
-	if (dax_mapping(mapping)) {
-		return dax_writeback_mapping_range(mapping,
-				mapping->host->i_sb->s_bdev,
-				wbc);
-	}
-#endif
-
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
+static int
+ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return dax_writeback_mapping_range(mapping,
+			mapping->host->i_sb->s_bdev, wbc);
+}
+
 const struct address_space_operations ext2_aops = {
 	.readpage = ext2_readpage,
 	.readpages = ext2_readpages,
@@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = {
 	.error_remove_page = generic_error_remove_page,
 };
 
+static const struct address_space_operations ext2_dax_aops = {
+	.writepages = ext2_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
+
 /*
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.
@@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DAX;
 }
 
+void ext2_set_file_ops(struct inode *inode)
+{
+	inode->i_op = &ext2_file_inode_operations;
+	inode->i_fop = &ext2_file_operations;
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext2_dax_aops;
+	else if (test_opt(inode->i_sb, NOBH))
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+	else
+		inode->i_mapping->a_ops = &ext2_aops;
+}
+
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 {
 	struct ext2_inode_info *ei;
@@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		ei->i_data[n] = raw_inode->i_block[n];
 
 	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ext2_file_inode_operations;
-		if (test_opt(inode->i_sb, NOBH)) {
-			inode->i_mapping->a_ops = &ext2_nobh_aops;
-			inode->i_fop = &ext2_file_operations;
-		} else {
-			inode->i_mapping->a_ops = &ext2_aops;
-			inode->i_fop = &ext2_file_operations;
-		}
+		ext2_set_file_ops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext2_dir_inode_operations;
 		inode->i_fop = &ext2_dir_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e078075dc66f..55f7caadb093 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	return ext2_add_nondir(dentry, inode);
 }
@@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c94780075b04..249a97b19181 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2725,12 +2725,6 @@ static int ext4_writepages(struct address_space *mapping,
 	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping)) {
-		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						  wbc);
-		goto out_writepages;
-	}
-
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
@@ -2955,6 +2949,27 @@ out_writepages:
 	return ret;
 }
 
+static int ext4_dax_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc)
+{
+	int ret;
+	long nr_to_write = wbc->nr_to_write;
+	struct inode *inode = mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	trace_ext4_writepages(inode, wbc);
+
+	ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	return ret;
+}
+
 static int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;
@@ -3857,10 +3872,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (ext4_has_inline_data(inode))
 		return 0;
 
-	/* DAX uses iomap path now */
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return 0;
-
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (iov_iter_rw(iter) == READ)
 		ret = ext4_direct_IO_read(iocb, iter);
@@ -3946,6 +3957,13 @@ static const struct address_space_operations ext4_da_aops = {
 	.error_remove_page = generic_error_remove_page,
 };
 
+static const struct address_space_operations ext4_dax_aops = {
+	.writepages = ext4_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	switch (ext4_inode_journal_mode(inode)) {
@@ -3958,7 +3976,9 @@ void ext4_set_aops(struct inode *inode)
 	default:
 		BUG();
 	}
-	if (test_opt(inode->i_sb, DELALLOC))
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext4_dax_aops;
+	else if (test_opt(inode->i_sb, DELALLOC))
 		inode->i_mapping->a_ops = &ext4_da_aops;
 	else
 		inode->i_mapping->a_ops = &ext4_aops;
diff --git a/fs/libfs.c b/fs/libfs.c
index 7ff3cb904acd..0fb590d79f30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
+int noop_set_page_dirty(struct page *page)
+{
+	/*
+	 * Unlike __set_page_dirty_no_writeback that handles dirty page
+	 * tracking in the page object, dax does all dirty tracking in
+	 * the inode address_space in response to mkwrite faults. In the
+	 * dax case we only need to worry about potentially dirty CPU
+	 * caches, not dirty page cache pages to write back.
+	 *
+	 * This callback is defined to prevent fallback to
+	 * __set_page_dirty_buffers() in set_page_dirty().
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(noop_set_page_dirty);
+
+void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length)
+{
+	/*
+	 * There is no page cache to invalidate in the dax case, however
+	 * we need this callback defined to prevent falling back to
+	 * block_invalidatepage() in do_invalidatepage().
+	 */
+}
+EXPORT_SYMBOL_GPL(noop_invalidatepage);
+
+ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	/*
+	 * iomap based filesystems support direct I/O without need for
+	 * this callback. However, it still needs to be set in
+	 * inode->a_ops so that open/fcntl know that direct I/O is
+	 * generally supported.
+	 */
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(noop_direct_IO);
+
 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */
 void kfree_link(void *p)
 {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9c6a830da0ee..e7a56c4786ff 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1194,16 +1194,22 @@ xfs_vm_writepages(
 	int ret;
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping,
-				xfs_find_bdev_for_inode(mapping->host), wbc);
-
 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 	if (wpc.ioend)
 		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 	return ret;
 }
 
+STATIC int
+xfs_dax_writepages(
+	struct address_space *mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return dax_writeback_mapping_range(mapping,
+			xfs_find_bdev_for_inode(mapping->host), wbc);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. The page should already be clean. We always
@@ -1367,17 +1373,6 @@ out_unlock:
 	return error;
 }
 
-STATIC ssize_t
-xfs_vm_direct_IO(
-	struct kiocb *iocb,
-	struct iov_iter *iter)
-{
-	/*
-	 * We just need the method present so that open/fcntl allow direct I/O.
-	 */
-	return -EINVAL;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space *mapping,
@@ -1500,8 +1495,15 @@ const struct address_space_operations xfs_address_space_operations = {
 	.releasepage = xfs_vm_releasepage,
 	.invalidatepage = xfs_vm_invalidatepage,
 	.bmap = xfs_vm_bmap,
-	.direct_IO = xfs_vm_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
+
+const struct address_space_operations xfs_dax_aops = {
+	.writepages = xfs_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 88c85ea63da0..69346d460dfa 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -54,6 +54,7 @@ struct xfs_ioend {
 };
 
 extern const struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_dax_aops;
 
 int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 56475fcd76f2..951e84df5576 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1272,7 +1272,10 @@ xfs_setup_iops(
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
 		inode->i_fop = &xfs_file_operations;
-		inode->i_mapping->a_ops = &xfs_address_space_operations;
+		if (IS_DAX(inode))
+			inode->i_mapping->a_ops = &xfs_dax_aops;
+		else
+			inode->i_mapping->a_ops = &xfs_address_space_operations;
 		break;
 	case S_IFDIR:
 		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0185ecdae135..f9eb22ad341e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -26,18 +26,42 @@ extern struct attribute_group dax_attribute_group;
 
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
+struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops);
 void put_dax(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+void dax_write_cache(struct dax_device *dax_dev, bool wc);
+bool dax_write_cache_enabled(struct dax_device *dax_dev);
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
-
+static inline struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops)
+{
+	/*
+	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
+	 * NULL is an error or expected.
+	 */
+	return NULL;
+}
 static inline void put_dax(struct dax_device *dax_dev)
 {
 }
+static inline void kill_dax(struct dax_device *dax_dev)
+{
+}
+static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
+{
+}
+static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
+{
+	return false;
+}
 #endif
 
+struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
 int __bdev_dax_supported(struct super_block *sb, int blocksize);
@@ -57,6 +81,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 }
 
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
@@ -76,22 +102,23 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 {
 	return NULL;
 }
+
+static inline int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int dax_read_lock(void);
 void dax_read_unlock(int id);
-struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
 bool dax_alive(struct dax_device *dax_dev);
-void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
-void dax_write_cache(struct dax_device *dax_dev, bool wc);
-bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
@@ -121,7 +148,4 @@ static inline bool dax_mapping(struct address_space *mapping)
 	return mapping->host && IS_DAX(mapping->host);
 }
 
-struct writeback_control;
-int dax_writeback_mapping_range(struct address_space *mapping,
-		struct block_device *bdev, struct writeback_control *wbc);
 #endif
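
For illustration only (not part of this patch): because alloc_dax(), kill_dax(), put_dax() and the write-cache helpers are now declared for both CONFIG_DAX=y and CONFIG_DAX=n, a block driver can keep the calls unconditional in the source and only gate behaviour on IS_ENABLED(), as the dm.c hunk above does. A minimal sketch of that lifecycle; my_dax_ops, my_host, my_data and the function names are placeholders, and <linux/dax.h> is assumed:

    /* Sketch only: placeholder dax_operations table (.direct_access, .copy_from_iter, ...). */
    static const struct dax_operations my_dax_ops;
    static struct dax_device *my_dax;

    static int my_register_dax(void *my_data, const char *my_host)
    {
            if (!IS_ENABLED(CONFIG_DAX_DRIVER))
                    return 0;       /* the stub alloc_dax() returns NULL; treat as "no DAX" */

            my_dax = alloc_dax(my_data, my_host, &my_dax_ops);
            if (!my_dax)
                    return -ENOMEM;
            return 0;
    }

    static void my_unregister_dax(void)
    {
            if (my_dax) {
                    kill_dax(my_dax);       /* invalidate the device for dax_alive() users */
                    put_dax(my_dax);        /* drop the reference taken by alloc_dax() */
            }
    }
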
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6baf767619e..a3bb2aedbc2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3130,6 +3130,10 @@ extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *,
 			 struct inode *, struct dentry *, unsigned int);
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
+extern int noop_set_page_dirty(struct page *page);
+extern void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length);
+extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SCHED_DEADLINE_H
-#define _LINUX_SCHED_DEADLINE_H
-
-#include <linux/sched.h>
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
-
-#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
 	HK_FLAG_SCHED = (1 << 3),
 	HK_FLAG_TICK = (1 << 4),
 	HK_FLAG_DOMAIN = (1 << 5),
+	HK_FLAG_WQ = (1 << 6),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 3d3a97d9399d..094217273ff9 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu);
 static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
-#ifdef CONFIG_NO_HZ_FULL
-extern u64 scheduler_tick_max_deferment(void);
-#endif
-
 #endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7cc35921218e..7f8c9a127f5a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -113,7 +113,8 @@ enum tick_dep_bits {
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
-extern int tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
+static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 61b39eaf7cad..3fcdb75d69cf 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode
 	return out_of_line_wait_on_atomic_t(val, action, mode);
 }
 
+extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
+extern void wake_up_var(void *var);
+extern wait_queue_head_t *__var_waitqueue(void *p);
+
+#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \
+({ \
+	__label__ __out; \
+	struct wait_queue_head *__wq_head = __var_waitqueue(var); \
+	struct wait_bit_queue_entry __wbq_entry; \
+	long __ret = ret; /* explicit shadow */ \
+ \
+	init_wait_var_entry(&__wbq_entry, var, \
+			exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
+	for (;;) { \
+		long __int = prepare_to_wait_event(__wq_head, \
+				&__wbq_entry.wq_entry, \
+				state); \
+		if (condition) \
+			break; \
+ \
+		if (___wait_is_interruptible(state) && __int) { \
+			__ret = __int; \
+			goto __out; \
+		} \
+ \
+		cmd; \
+	} \
+	finish_wait(__wq_head, &__wbq_entry.wq_entry); \
+__out:	__ret; \
+})
+
+#define __wait_var_event(var, condition) \
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+			schedule())
+
+#define wait_var_event(var, condition) \
+do { \
+	might_sleep(); \
+	if (condition) \
+		break; \
+	__wait_var_event(var, condition); \
+} while (0)
+
+#define __wait_var_event_killable(var, condition) \
+	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \
+			schedule())
+
+#define wait_var_event_killable(var, condition) \
+({ \
+	int __ret = 0; \
+	might_sleep(); \
+	if (!(condition)) \
+		__ret = __wait_var_event_killable(var, condition); \
+	__ret; \
+})
+
+#define __wait_var_event_timeout(var, condition, timeout) \
+	___wait_var_event(var, ___wait_cond_timeout(condition), \
+			TASK_UNINTERRUPTIBLE, 0, timeout, \
+			__ret = schedule_timeout(__ret))
+
+#define wait_var_event_timeout(var, condition, timeout) \
+({ \
+	long __ret = timeout; \
+	might_sleep(); \
+	if (!___wait_cond_timeout(condition)) \
+		__ret = __wait_var_event_timeout(var, condition, timeout); \
+	__ret; \
+})
+
 #endif /* _LINUX_WAIT_BIT_H */
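
For illustration only (not part of this patch): the new wait_var_event()/wake_up_var() helpers let a caller sleep on the address of an arbitrary variable instead of declaring a dedicated waitqueue or using wait_on_atomic_t(). A minimal usage sketch; struct my_ctx and its field are placeholders, and <linux/wait_bit.h> plus <linux/atomic.h> are assumed:

    struct my_ctx {
            atomic_t busy;          /* outstanding users of this context */
    };

    static void my_ctx_put(struct my_ctx *ctx)
    {
            if (atomic_dec_and_test(&ctx->busy))
                    wake_up_var(&ctx->busy);        /* waiters are keyed on the variable's address */
    }

    static void my_ctx_drain(struct my_ctx *ctx)
    {
            /* sleep until the last my_ctx_put() drops 'busy' to zero */
            wait_var_event(&ctx->busy, atomic_read(&ctx->busy) == 0);
    }
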
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/proc_fs.h> 2/*
3#include <linux/seq_file.h> 3 * Auto-group scheduling implementation:
4#include <linux/utsname.h> 4 */
5#include <linux/security.h>
6#include <linux/export.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 7unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
168 autogroup_kref_put(prev); 165 autogroup_kref_put(prev);
169} 166}
170 167
171/* Allocates GFP_KERNEL, cannot be called under any spinlock */ 168/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
172void sched_autogroup_create_attach(struct task_struct *p) 169void sched_autogroup_create_attach(struct task_struct *p)
173{ 170{
174 struct autogroup *ag = autogroup_create(); 171 struct autogroup *ag = autogroup_create();
175 172
176 autogroup_move_group(p, ag); 173 autogroup_move_group(p, ag);
177 /* drop extra reference added by autogroup_create() */ 174
175 /* Drop extra reference added by autogroup_create(): */
178 autogroup_kref_put(ag); 176 autogroup_kref_put(ag);
179} 177}
180EXPORT_SYMBOL(sched_autogroup_create_attach); 178EXPORT_SYMBOL(sched_autogroup_create_attach);
181 179
182/* Cannot be called under siglock. Currently has no users */ 180/* Cannot be called under siglock. Currently has no users: */
183void sched_autogroup_detach(struct task_struct *p) 181void sched_autogroup_detach(struct task_struct *p)
184{ 182{
185 autogroup_move_group(p, &autogroup_default); 183 autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
202 200
203 return 1; 201 return 1;
204} 202}
205
206__setup("noautogroup", setup_autogroup); 203__setup("noautogroup", setup_autogroup);
207 204
208#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
224 if (nice < 0 && !can_nice(current, nice)) 221 if (nice < 0 && !can_nice(current, nice))
225 return -EPERM; 222 return -EPERM;
226 223
227 /* this is a heavy operation taking global locks.. */ 224 /* This is a heavy operation, taking global locks.. */
228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) 225 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
229 return -EAGAIN; 226 return -EAGAIN;
230 227
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 264
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 265 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 266}
270#endif /* CONFIG_SCHED_DEBUG */ 267#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifdef CONFIG_SCHED_AUTOGROUP 2#ifdef CONFIG_SCHED_AUTOGROUP
3 3
4#include <linux/kref.h>
5#include <linux/rwsem.h>
6#include <linux/sched/autogroup.h>
7
8struct autogroup { 4struct autogroup {
9 /* 5 /*
10 * reference doesn't mean how many thread attach to this 6 * Reference doesn't mean how many threads attach to this
11 * autogroup now. It just stands for the number of task 7 * autogroup now. It just stands for the number of tasks
12 * could use this autogroup. 8 * which could use this autogroup.
13 */ 9 */
14 struct kref kref; 10 struct kref kref;
15 struct task_group *tg; 11 struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
56 return tg; 52 return tg;
57} 53}
58 54
59#ifdef CONFIG_SCHED_DEBUG
60static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 55static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
61{ 56{
62 return 0; 57 return 0;
63} 58}
64#endif
65 59
66#endif /* CONFIG_SCHED_AUTOGROUP */ 60#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * sched_clock for unstable cpu clocks 2 * sched_clock() for unstable CPU clocks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
5 * 5 *
@@ -11,7 +11,7 @@
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * 13 *
14 * What: 14 * What this file implements:
15 * 15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution 16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i) 17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current CPU.
30 * 30 *
31 * sched_clock_cpu(i) 31 * sched_clock_cpu(i)
32 * 32 *
33 * How: 33 * How it is implemented:
34 * 34 *
35 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the 36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
52 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
53 * 53 *
54 */ 54 */
55#include <linux/spinlock.h> 55#include "sched.h"
56#include <linux/hardirq.h>
57#include <linux/export.h>
58#include <linux/percpu.h>
59#include <linux/ktime.h>
60#include <linux/sched.h>
61#include <linux/nmi.h>
62#include <linux/sched/clock.h>
63#include <linux/static_key.h>
64#include <linux/workqueue.h>
65#include <linux/compiler.h>
66#include <linux/tick.h>
67#include <linux/init.h>
68 56
69/* 57/*
70 * Scheduler clock - returns current time in nanosec units. 58 * Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
302 * cmpxchg64 below only protects one readout. 290 * cmpxchg64 below only protects one readout.
303 * 291 *
304 * We must reread via sched_clock_local() in the retry case on 292 * We must reread via sched_clock_local() in the retry case on
305 * 32bit as an NMI could use sched_clock_local() via the 293 * 32-bit kernels as an NMI could use sched_clock_local() via the
306 * tracer and hit between the readout of 294 * tracer and hit between the readout of
307 * the low32bit and the high 32bit portion. 295 * the low 32-bit and the high 32-bit portion.
308 */ 296 */
309 this_clock = sched_clock_local(my_scd); 297 this_clock = sched_clock_local(my_scd);
310 /* 298 /*
311 * We must enforce atomic readout on 32bit, otherwise the 299 * We must enforce atomic readout on 32-bit, otherwise the
312 * update on the remote cpu can hit inbetween the readout of 300 * update on the remote CPU can hit inbetween the readout of
313 * the low32bit and the high 32bit portion. 301 * the low 32-bit and the high 32-bit portion.
314 */ 302 */
315 remote_clock = cmpxchg64(&scd->clock, 0, 0); 303 remote_clock = cmpxchg64(&scd->clock, 0, 0);
316#else 304#else
317 /* 305 /*
318 * On 64bit the read of [my]scd->clock is atomic versus the 306 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
319 * update, so we can avoid the above 32bit dance. 307 * update, so we can avoid the above 32-bit dance.
320 */ 308 */
321 sched_clock_local(my_scd); 309 sched_clock_local(my_scd);
322again: 310again:
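The 32-bit comments above lean on one idiom worth spelling out: cmpxchg64(&scd->clock, 0, 0) acts as an atomic 64-bit read, because the store only happens when the value is already 0 (and storing 0 over 0 changes nothing) while the return value is always the current contents. A stand-alone sketch of that idiom, not taken from this commit:

#include <linux/atomic.h>
#include <linux/types.h>

/*
 * Atomically sample a u64 on a 32-bit kernel, where a plain load could
 * tear between the low and high halves.
 */
static u64 atomic_sample64(u64 *val)
{
        /* Writes 0 only if *val is already 0, so *val is never modified. */
        return cmpxchg64(val, 0, 0);
}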
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..5d2d56b0817a 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
11 * typically be used for exclusion which gives rise to priority inversion. 11 * typically be used for exclusion which gives rise to priority inversion.
12 * Waiting for completion is a typically sync point, but not an exclusion point. 12 * Waiting for completion is a typically sync point, but not an exclusion point.
13 */ 13 */
14 14#include "sched.h"
15#include <linux/sched/signal.h>
16#include <linux/sched/debug.h>
17#include <linux/completion.h>
18 15
19/** 16/**
20 * complete: - signals a single thread waiting on this completion 17 * complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..74e750ffe64f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 */ 7 */
8#include <linux/sched.h> 8#include "sched.h"
9#include <linux/sched/clock.h>
10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h>
13#include <linux/wait_bit.h>
14#include <linux/cpuset.h>
15#include <linux/delayacct.h>
16#include <linux/init_task.h>
17#include <linux/context_tracking.h>
18#include <linux/rcupdate_wait.h>
19#include <linux/compat.h>
20
21#include <linux/blkdev.h>
22#include <linux/kprobes.h>
23#include <linux/mmu_context.h>
24#include <linux/module.h>
25#include <linux/nmi.h>
26#include <linux/prefetch.h>
27#include <linux/profile.h>
28#include <linux/security.h>
29#include <linux/syscalls.h>
30#include <linux/sched/isolation.h>
31 9
32#include <asm/switch_to.h> 10#include <asm/switch_to.h>
33#include <asm/tlb.h> 11#include <asm/tlb.h>
34#ifdef CONFIG_PARAVIRT
35#include <asm/paravirt.h>
36#endif
37 12
38#include "sched.h"
39#include "../workqueue_internal.h" 13#include "../workqueue_internal.h"
40#include "../smpboot.h" 14#include "../smpboot.h"
41 15
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
135 * [L] ->on_rq 109 * [L] ->on_rq
136 * RELEASE (rq->lock) 110 * RELEASE (rq->lock)
137 * 111 *
138 * If we observe the old cpu in task_rq_lock, the acquire of 112 * If we observe the old CPU in task_rq_lock, the acquire of
139 * the old rq->lock will fully serialize against the stores. 113 * the old rq->lock will fully serialize against the stores.
140 * 114 *
141 * If we observe the new CPU in task_rq_lock, the acquire will 115 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
333} 307}
334#endif /* CONFIG_SMP */ 308#endif /* CONFIG_SMP */
335 309
336static void init_rq_hrtick(struct rq *rq) 310static void hrtick_rq_init(struct rq *rq)
337{ 311{
338#ifdef CONFIG_SMP 312#ifdef CONFIG_SMP
339 rq->hrtick_csd_pending = 0; 313 rq->hrtick_csd_pending = 0;
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
351{ 325{
352} 326}
353 327
354static inline void init_rq_hrtick(struct rq *rq) 328static inline void hrtick_rq_init(struct rq *rq)
355{ 329{
356} 330}
357#endif /* CONFIG_SCHED_HRTICK */ 331#endif /* CONFIG_SCHED_HRTICK */
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
1457 * 1431 *
1458 * - cpu_active must be a subset of cpu_online 1432 * - cpu_active must be a subset of cpu_online
1459 * 1433 *
1460 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, 1434 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
1461 * see __set_cpus_allowed_ptr(). At this point the newly online 1435 * see __set_cpus_allowed_ptr(). At this point the newly online
1462 * CPU isn't yet part of the sched domains, and balancing will not 1436 * CPU isn't yet part of the sched domains, and balancing will not
1463 * see it. 1437 * see it.
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
2629 raw_spin_unlock_irq(&rq->lock); 2603 raw_spin_unlock_irq(&rq->lock);
2630} 2604}
2631 2605
2606/*
2607 * NOP if the arch has not defined these:
2608 */
2609
2610#ifndef prepare_arch_switch
2611# define prepare_arch_switch(next) do { } while (0)
2612#endif
2613
2614#ifndef finish_arch_post_lock_switch
2615# define finish_arch_post_lock_switch() do { } while (0)
2616#endif
2617
2632/** 2618/**
2633 * prepare_task_switch - prepare to switch tasks 2619 * prepare_task_switch - prepare to switch tasks
2634 * @rq: the runqueue preparing to switch 2620 * @rq: the runqueue preparing to switch
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3037 3023
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3024#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039 /* 3025 /*
3040 * 64-bit doesn't need locks to atomically read a 64bit value. 3026 * 64-bit doesn't need locks to atomically read a 64-bit value.
3041 * So we have a optimization chance when the task's delta_exec is 0. 3027 * So we have a optimization chance when the task's delta_exec is 0.
3042 * Reading ->on_cpu is racy, but this is ok. 3028 * Reading ->on_cpu is racy, but this is ok.
3043 * 3029 *
@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
3096 rq->idle_balance = idle_cpu(cpu); 3082 rq->idle_balance = idle_cpu(cpu);
3097 trigger_load_balance(rq); 3083 trigger_load_balance(rq);
3098#endif 3084#endif
3099 rq_last_tick_reset(rq);
3100} 3085}
3101 3086
3102#ifdef CONFIG_NO_HZ_FULL 3087#ifdef CONFIG_NO_HZ_FULL
3103/** 3088
3104 * scheduler_tick_max_deferment 3089struct tick_work {
3105 * 3090 int cpu;
3106 * Keep at least one tick per second when a single 3091 struct delayed_work work;
3107 * active task is running because the scheduler doesn't 3092};
3108 * yet completely support full dynticks environment. 3093
3109 * 3094static struct tick_work __percpu *tick_work_cpu;
3110 * This makes sure that uptime, CFS vruntime, load 3095
3111 * balancing, etc... continue to move forward, even 3096static void sched_tick_remote(struct work_struct *work)
3112 * with a very low granularity.
3113 *
3114 * Return: Maximum deferment in nanoseconds.
3115 */
3116u64 scheduler_tick_max_deferment(void)
3117{ 3097{
3118 struct rq *rq = this_rq(); 3098 struct delayed_work *dwork = to_delayed_work(work);
3119 unsigned long next, now = READ_ONCE(jiffies); 3099 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3100 int cpu = twork->cpu;
3101 struct rq *rq = cpu_rq(cpu);
3102 struct rq_flags rf;
3120 3103
3121 next = rq->last_sched_tick + HZ; 3104 /*
3105 * Handle the tick only if it appears the remote CPU is running in full
3106 * dynticks mode. The check is racy by nature, but missing a tick or
3107 * having one too much is no big deal because the scheduler tick updates
3108 * statistics and checks timeslices in a time-independent way, regardless
3109 * of when exactly it is running.
3110 */
3111 if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
3112 struct task_struct *curr;
3113 u64 delta;
3122 3114
3123 if (time_before_eq(next, now)) 3115 rq_lock_irq(rq, &rf);
3124 return 0; 3116 update_rq_clock(rq);
3117 curr = rq->curr;
3118 delta = rq_clock_task(rq) - curr->se.exec_start;
3125 3119
3126 return jiffies_to_nsecs(next - now); 3120 /*
3121 * Make sure the next tick runs within a reasonable
3122 * amount of time.
3123 */
3124 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3125 curr->sched_class->task_tick(rq, curr, 0);
3126 rq_unlock_irq(rq, &rf);
3127 }
3128
3129 /*
3130 * Run the remote tick once per second (1Hz). This arbitrary
3131 * frequency is large enough to avoid overload but short enough
3132 * to keep scheduler internal stats reasonably up to date.
3133 */
3134 queue_delayed_work(system_unbound_wq, dwork, HZ);
3127} 3135}
3136
3137static void sched_tick_start(int cpu)
3138{
3139 struct tick_work *twork;
3140
3141 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3142 return;
3143
3144 WARN_ON_ONCE(!tick_work_cpu);
3145
3146 twork = per_cpu_ptr(tick_work_cpu, cpu);
3147 twork->cpu = cpu;
3148 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3149 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3150}
3151
3152#ifdef CONFIG_HOTPLUG_CPU
3153static void sched_tick_stop(int cpu)
3154{
3155 struct tick_work *twork;
3156
3157 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3158 return;
3159
3160 WARN_ON_ONCE(!tick_work_cpu);
3161
3162 twork = per_cpu_ptr(tick_work_cpu, cpu);
3163 cancel_delayed_work_sync(&twork->work);
3164}
3165#endif /* CONFIG_HOTPLUG_CPU */
3166
3167int __init sched_tick_offload_init(void)
3168{
3169 tick_work_cpu = alloc_percpu(struct tick_work);
3170 BUG_ON(!tick_work_cpu);
3171
3172 return 0;
3173}
3174
3175#else /* !CONFIG_NO_HZ_FULL */
3176static inline void sched_tick_start(int cpu) { }
3177static inline void sched_tick_stop(int cpu) { }
3128#endif 3178#endif
3129 3179
3130#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3180#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
5786{ 5836{
5787 set_cpu_rq_start_time(cpu); 5837 set_cpu_rq_start_time(cpu);
5788 sched_rq_cpu_starting(cpu); 5838 sched_rq_cpu_starting(cpu);
5839 sched_tick_start(cpu);
5789 return 0; 5840 return 0;
5790} 5841}
5791 5842
@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
5797 5848
5798 /* Handle pending wakeups and then migrate everything off */ 5849 /* Handle pending wakeups and then migrate everything off */
5799 sched_ttwu_pending(); 5850 sched_ttwu_pending();
5851 sched_tick_stop(cpu);
5800 5852
5801 rq_lock_irqsave(rq, &rf); 5853 rq_lock_irqsave(rq, &rf);
5802 if (rq->rd) { 5854 if (rq->rd) {
@@ -6024,11 +6076,8 @@ void __init sched_init(void)
6024 rq->last_load_update_tick = jiffies; 6076 rq->last_load_update_tick = jiffies;
6025 rq->nohz_flags = 0; 6077 rq->nohz_flags = 0;
6026#endif 6078#endif
6027#ifdef CONFIG_NO_HZ_FULL
6028 rq->last_sched_tick = 0;
6029#endif
6030#endif /* CONFIG_SMP */ 6079#endif /* CONFIG_SMP */
6031 init_rq_hrtick(rq); 6080 hrtick_rq_init(rq);
6032 atomic_set(&rq->nr_iowait, 0); 6081 atomic_set(&rq->nr_iowait, 0);
6033 } 6082 }
6034 6083
@@ -7027,3 +7076,5 @@ const u32 sched_prio_to_wmult[40] = {
7027 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7076 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
7028 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7077 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
7029}; 7078};
7079
7080#undef CREATE_TRACE_POINTS
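The remote tick added above is a self-rearming delayed work: sched_tick_start() queues it on system_unbound_wq, sched_tick_remote() does its bookkeeping under rq->lock and re-queues itself HZ jiffies later, and sched_tick_stop() cancels it synchronously on hotplug. A stripped-down sketch of that re-arming pattern, with invented names and none of the scheduler specifics:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct delayed_work heartbeat;

static void heartbeat_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);

        /* ... periodic bookkeeping goes here ... */

        /* Re-arm: run again roughly once per second. */
        queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void heartbeat_start(void)
{
        INIT_DELAYED_WORK(&heartbeat, heartbeat_fn);
        queue_delayed_work(system_unbound_wq, &heartbeat, HZ);
}

static void heartbeat_stop(void)
{
        /* Safe even though the work re-queues itself. */
        cancel_delayed_work_sync(&heartbeat);
}

In this diff the start/stop halves are wired into the hotplug callbacks sched_cpu_starting() and sched_cpu_dying(), as the later hunks show.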
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/cgroup.h>
3#include <linux/slab.h>
4#include <linux/percpu.h>
5#include <linux/spinlock.h>
6#include <linux/cpumask.h>
7#include <linux/seq_file.h>
8#include <linux/rcupdate.h>
9#include <linux/kernel_stat.h>
10#include <linux/err.h>
11
12#include "sched.h"
13
14/* 2/*
15 * CPU accounting code for task groups. 3 * CPU accounting code for task groups.
16 * 4 *
17 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 5 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
18 * (balbir@in.ibm.com). 6 * (balbir@in.ibm.com).
19 */ 7 */
8#include "sched.h"
20 9
21/* Time spent by the tasks of the cpu accounting group executing in ... */ 10/* Time spent by the tasks of the CPU accounting group executing in ... */
22enum cpuacct_stat_index { 11enum cpuacct_stat_index {
23 CPUACCT_STAT_USER, /* ... user mode */ 12 CPUACCT_STAT_USER, /* ... user mode */
24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 13 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
35 u64 usages[CPUACCT_STAT_NSTATS]; 24 u64 usages[CPUACCT_STAT_NSTATS];
36}; 25};
37 26
38/* track cpu usage of a group of tasks and its child groups */ 27/* track CPU usage of a group of tasks and its child groups */
39struct cpuacct { 28struct cpuacct {
40 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
41 /* cpuusage holds pointer to a u64-type object on every cpu */ 30 /* cpuusage holds pointer to a u64-type object on every CPU */
42 struct cpuacct_usage __percpu *cpuusage; 31 struct cpuacct_usage __percpu *cpuusage;
43 struct kernel_cpustat __percpu *cpustat; 32 struct kernel_cpustat __percpu *cpustat;
44}; 33};
45 34
46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) 35static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
48 return css ? container_of(css, struct cpuacct, css) : NULL; 37 return css ? container_of(css, struct cpuacct, css) : NULL;
49} 38}
50 39
51/* return cpu accounting group to which this task belongs */ 40/* Return CPU accounting group to which this task belongs */
52static inline struct cpuacct *task_ca(struct task_struct *tsk) 41static inline struct cpuacct *task_ca(struct task_struct *tsk)
53{ 42{
54 return css_ca(task_css(tsk, cpuacct_cgrp_id)); 43 return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
65 .cpuusage = &root_cpuacct_cpuusage, 54 .cpuusage = &root_cpuacct_cpuusage,
66}; 55};
67 56
68/* create a new cpu accounting group */ 57/* Create a new CPU accounting group */
69static struct cgroup_subsys_state * 58static struct cgroup_subsys_state *
70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) 59cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
71{ 60{
@@ -96,7 +85,7 @@ out:
96 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
97} 86}
98 87
99/* destroy an existing cpu accounting group */ 88/* Destroy an existing CPU accounting group */
100static void cpuacct_css_free(struct cgroup_subsys_state *css) 89static void cpuacct_css_free(struct cgroup_subsys_state *css)
101{ 90{
102 struct cpuacct *ca = css_ca(css); 91 struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
162#endif 151#endif
163} 152}
164 153
165/* return total cpu usage (in nanoseconds) of a group */ 154/* Return total CPU usage (in nanoseconds) of a group */
166static u64 __cpuusage_read(struct cgroup_subsys_state *css, 155static u64 __cpuusage_read(struct cgroup_subsys_state *css,
167 enum cpuacct_stat_index index) 156 enum cpuacct_stat_index index)
168{ 157{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13#include "sched.h"
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include <linux/slab.h>
17#include "cpudeadline.h"
18 14
19static inline int parent(int i) 15static inline int parent(int i)
20{ 16{
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
42 return; 38 return;
43 39
44 /* adapted from lib/prio_heap.c */ 40 /* adapted from lib/prio_heap.c */
45 while(1) { 41 while (1) {
46 u64 largest_dl; 42 u64 largest_dl;
43
47 l = left_child(idx); 44 l = left_child(idx);
48 r = right_child(idx); 45 r = right_child(idx);
49 largest = idx; 46 largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
131 return 1; 128 return 1;
132 } else { 129 } else {
133 int best_cpu = cpudl_maximum(cp); 130 int best_cpu = cpudl_maximum(cp);
131
134 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
135 133
136 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
145} 143}
146 144
147/* 145/*
148 * cpudl_clear - remove a cpu from the cpudl max-heap 146 * cpudl_clear - remove a CPU from the cpudl max-heap
149 * @cp: the cpudl max-heap context 147 * @cp: the cpudl max-heap context
150 * @cpu: the target cpu 148 * @cpu: the target CPU
151 * 149 *
152 * Notes: assumes cpu_rq(cpu)->lock is locked 150 * Notes: assumes cpu_rq(cpu)->lock is locked
153 * 151 *
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
186/* 184/*
187 * cpudl_set - update the cpudl max-heap 185 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context 186 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu 187 * @cpu: the target CPU
190 * @dl: the new earliest deadline for this cpu 188 * @dl: the new earliest deadline for this CPU
191 * 189 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked 190 * Notes: assumes cpu_rq(cpu)->lock is locked
193 * 191 *
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
205 old_idx = cp->elements[cpu].idx; 203 old_idx = cp->elements[cpu].idx;
206 if (old_idx == IDX_INVALID) { 204 if (old_idx == IDX_INVALID) {
207 int new_idx = cp->size++; 205 int new_idx = cp->size++;
206
208 cp->elements[new_idx].dl = dl; 207 cp->elements[new_idx].dl = dl;
209 cp->elements[new_idx].cpu = cpu; 208 cp->elements[new_idx].cpu = cpu;
210 cp->elements[cpu].idx = new_idx; 209 cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
221/* 220/*
222 * cpudl_set_freecpu - Set the cpudl.free_cpus 221 * cpudl_set_freecpu - Set the cpudl.free_cpus
223 * @cp: the cpudl max-heap context 222 * @cp: the cpudl max-heap context
224 * @cpu: rd attached cpu 223 * @cpu: rd attached CPU
225 */ 224 */
226void cpudl_set_freecpu(struct cpudl *cp, int cpu) 225void cpudl_set_freecpu(struct cpudl *cp, int cpu)
227{ 226{
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
231/* 230/*
232 * cpudl_clear_freecpu - Clear the cpudl.free_cpus 231 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
233 * @cp: the cpudl max-heap context 232 * @cp: the cpudl max-heap context
234 * @cpu: rd attached cpu 233 * @cpu: rd attached CPU
235 */ 234 */
236void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 235void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
237{ 236{
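cpudl tracks, per CPU, the deadline of the task it is currently running, in an array-backed max-heap (as the cpudl_clear()/cpudl_set() comments say), so the root is the CPU with the latest deadline and cp->elements[cpu].idx records where each CPU sits. The parent()/left_child()/right_child() helpers at the top of the file are not shown in these hunks; the sketch below assumes the conventional 0-based heap index arithmetic.

/* Conventional 0-based binary-heap indexing (assumed, not quoted from the file). */
static inline int heap_parent(int i)            { return (i - 1) >> 1; }
static inline int heap_left_child(int i)        { return (i << 1) + 1; }
static inline int heap_right_child(int i)       { return (i << 1) + 2; }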
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUDL_H
3#define _LINUX_CPUDL_H
4 2
5#include <linux/sched.h> 3#define IDX_INVALID -1
6#include <linux/sched/deadline.h>
7
8#define IDX_INVALID -1
9 4
10struct cpudl_item { 5struct cpudl_item {
11 u64 dl; 6 u64 dl;
12 int cpu; 7 int cpu;
13 int idx; 8 int idx;
14}; 9};
15 10
16struct cpudl { 11struct cpudl {
17 raw_spinlock_t lock; 12 raw_spinlock_t lock;
18 int size; 13 int size;
19 cpumask_var_t free_cpus; 14 cpumask_var_t free_cpus;
20 struct cpudl_item *elements; 15 struct cpudl_item *elements;
21}; 16};
22 17
23
24#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
25int cpudl_find(struct cpudl *cp, struct task_struct *p, 19int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
26 struct cpumask *later_mask);
27void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 20void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
28void cpudl_clear(struct cpudl *cp, int cpu); 21void cpudl_clear(struct cpudl *cp, int cpu);
29int cpudl_init(struct cpudl *cp); 22int cpudl_init(struct cpudl *cp);
30void cpudl_set_freecpu(struct cpudl *cp, int cpu); 23void cpudl_set_freecpu(struct cpudl *cp, int cpu);
31void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 24void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
32void cpudl_cleanup(struct cpudl *cp); 25void cpudl_cleanup(struct cpudl *cp);
33#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
34
35#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11
12#include "sched.h" 11#include "sched.h"
13 12
14DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 13DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..feb5f89020f2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,57 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h>
15#include <linux/kthread.h>
16#include <uapi/linux/sched/types.h>
17#include <linux/slab.h>
18#include <trace/events/power.h>
19
20#include "sched.h" 14#include "sched.h"
21 15
16#include <trace/events/power.h>
17
22struct sugov_tunables { 18struct sugov_tunables {
23 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
24 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
25}; 21};
26 22
27struct sugov_policy { 23struct sugov_policy {
28 struct cpufreq_policy *policy; 24 struct cpufreq_policy *policy;
29 25
30 struct sugov_tunables *tunables; 26 struct sugov_tunables *tunables;
31 struct list_head tunables_hook; 27 struct list_head tunables_hook;
32 28
33 raw_spinlock_t update_lock; /* For shared policies */ 29 raw_spinlock_t update_lock; /* For shared policies */
34 u64 last_freq_update_time; 30 u64 last_freq_update_time;
35 s64 freq_update_delay_ns; 31 s64 freq_update_delay_ns;
36 unsigned int next_freq; 32 unsigned int next_freq;
37 unsigned int cached_raw_freq; 33 unsigned int cached_raw_freq;
38 34
39 /* The next fields are only needed if fast switch cannot be used. */ 35 /* The next fields are only needed if fast switch cannot be used: */
40 struct irq_work irq_work; 36 struct irq_work irq_work;
41 struct kthread_work work; 37 struct kthread_work work;
42 struct mutex work_lock; 38 struct mutex work_lock;
43 struct kthread_worker worker; 39 struct kthread_worker worker;
44 struct task_struct *thread; 40 struct task_struct *thread;
45 bool work_in_progress; 41 bool work_in_progress;
46 42
47 bool need_freq_update; 43 bool need_freq_update;
48}; 44};
49 45
50struct sugov_cpu { 46struct sugov_cpu {
51 struct update_util_data update_util; 47 struct update_util_data update_util;
52 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
53 unsigned int cpu; 49 unsigned int cpu;
54 50
55 bool iowait_boost_pending; 51 bool iowait_boost_pending;
56 unsigned int iowait_boost; 52 unsigned int iowait_boost;
57 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
58 u64 last_update; 54 u64 last_update;
59 55
60 /* The fields below are only needed when sharing a policy. */ 56 /* The fields below are only needed when sharing a policy: */
61 unsigned long util_cfs; 57 unsigned long util_cfs;
62 unsigned long util_dl; 58 unsigned long util_dl;
63 unsigned long max; 59 unsigned long max;
64 unsigned int flags; 60 unsigned int flags;
65 61
66 /* The field below is for single-CPU policies only. */ 62 /* The field below is for single-CPU policies only: */
67#ifdef CONFIG_NO_HZ_COMMON 63#ifdef CONFIG_NO_HZ_COMMON
68 unsigned long saved_idle_calls; 64 unsigned long saved_idle_calls;
69#endif 65#endif
70}; 66};
71 67
@@ -79,9 +75,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
79 75
80 /* 76 /*
81 * Since cpufreq_update_util() is called with rq->lock held for 77 * Since cpufreq_update_util() is called with rq->lock held for
82 * the @target_cpu, our per-cpu data is fully serialized. 78 * the @target_cpu, our per-CPU data is fully serialized.
83 * 79 *
84 * However, drivers cannot in general deal with cross-cpu 80 * However, drivers cannot in general deal with cross-CPU
85 * requests, so while get_next_freq() will work, our 81 * requests, so while get_next_freq() will work, our
86 * sugov_update_commit() call may not for the fast switching platforms. 82 * sugov_update_commit() call may not for the fast switching platforms.
87 * 83 *
@@ -111,6 +107,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
111 } 107 }
112 108
113 delta_ns = time - sg_policy->last_freq_update_time; 109 delta_ns = time - sg_policy->last_freq_update_time;
110
114 return delta_ns >= sg_policy->freq_update_delay_ns; 111 return delta_ns >= sg_policy->freq_update_delay_ns;
115} 112}
116 113
@@ -345,8 +342,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
345 return get_next_freq(sg_policy, util, max); 342 return get_next_freq(sg_policy, util, max);
346} 343}
347 344
348static void sugov_update_shared(struct update_util_data *hook, u64 time, 345static void
349 unsigned int flags) 346sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
350{ 347{
351 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 348 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
352 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 349 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -423,8 +420,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
423 return sprintf(buf, "%u\n", tunables->rate_limit_us); 420 return sprintf(buf, "%u\n", tunables->rate_limit_us);
424} 421}
425 422
426static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 423static ssize_t
427 size_t count) 424rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
428{ 425{
429 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 426 struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
430 struct sugov_policy *sg_policy; 427 struct sugov_policy *sg_policy;
@@ -479,11 +476,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
479{ 476{
480 struct task_struct *thread; 477 struct task_struct *thread;
481 struct sched_attr attr = { 478 struct sched_attr attr = {
482 .size = sizeof(struct sched_attr), 479 .size = sizeof(struct sched_attr),
483 .sched_policy = SCHED_DEADLINE, 480 .sched_policy = SCHED_DEADLINE,
484 .sched_flags = SCHED_FLAG_SUGOV, 481 .sched_flags = SCHED_FLAG_SUGOV,
485 .sched_nice = 0, 482 .sched_nice = 0,
486 .sched_priority = 0, 483 .sched_priority = 0,
487 /* 484 /*
488 * Fake (unused) bandwidth; workaround to "fix" 485 * Fake (unused) bandwidth; workaround to "fix"
489 * priority inheritance. 486 * priority inheritance.
@@ -663,21 +660,21 @@ static int sugov_start(struct cpufreq_policy *policy)
663 struct sugov_policy *sg_policy = policy->governor_data; 660 struct sugov_policy *sg_policy = policy->governor_data;
664 unsigned int cpu; 661 unsigned int cpu;
665 662
666 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; 663 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
667 sg_policy->last_freq_update_time = 0; 664 sg_policy->last_freq_update_time = 0;
668 sg_policy->next_freq = UINT_MAX; 665 sg_policy->next_freq = UINT_MAX;
669 sg_policy->work_in_progress = false; 666 sg_policy->work_in_progress = false;
670 sg_policy->need_freq_update = false; 667 sg_policy->need_freq_update = false;
671 sg_policy->cached_raw_freq = 0; 668 sg_policy->cached_raw_freq = 0;
672 669
673 for_each_cpu(cpu, policy->cpus) { 670 for_each_cpu(cpu, policy->cpus) {
674 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); 671 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
675 672
676 memset(sg_cpu, 0, sizeof(*sg_cpu)); 673 memset(sg_cpu, 0, sizeof(*sg_cpu));
677 sg_cpu->cpu = cpu; 674 sg_cpu->cpu = cpu;
678 sg_cpu->sg_policy = sg_policy; 675 sg_cpu->sg_policy = sg_policy;
679 sg_cpu->flags = 0; 676 sg_cpu->flags = 0;
680 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; 677 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
681 } 678 }
682 679
683 for_each_cpu(cpu, policy->cpus) { 680 for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +718,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
721} 718}
722 719
723static struct cpufreq_governor schedutil_gov = { 720static struct cpufreq_governor schedutil_gov = {
724 .name = "schedutil", 721 .name = "schedutil",
725 .owner = THIS_MODULE, 722 .owner = THIS_MODULE,
726 .dynamic_switching = true, 723 .dynamic_switching = true,
727 .init = sugov_init, 724 .init = sugov_init,
728 .exit = sugov_exit, 725 .exit = sugov_exit,
729 .start = sugov_start, 726 .start = sugov_start,
730 .stop = sugov_stop, 727 .stop = sugov_stop,
731 .limits = sugov_limits, 728 .limits = sugov_limits,
732}; 729};
733 730
734#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 731#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
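Two details of the governor flow appear in the hunks above: sugov_start() derives freq_update_delay_ns once from the rate_limit_us tunable, and sugov_should_update_freq() compares the time since the last update against it before allowing a frequency re-evaluation. Reduced to its essence in the sketch below; the names are invented and the real function has additional checks not shown here.

/* With rate_limit_us == 500, re-evaluation happens at most every 500,000 ns. */
static bool rate_limit_elapsed(u64 time_ns, u64 last_update_ns, unsigned int rate_limit_us)
{
        s64 delay_ns = (s64)rate_limit_us * 1000;       /* NSEC_PER_USEC */

        return (s64)(time_ns - last_update_ns) >= delay_ns;
}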
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29#include "sched.h"
30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
33#include <linux/slab.h>
34#include "cpupri.h"
35 30
36/* Convert between a 140 based task->prio, and our 102 based cpupri */ 31/* Convert between a 140 based task->prio, and our 102 based cpupri */
37static int convert_prio(int prio) 32static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
128} 123}
129 124
130/** 125/**
131 * cpupri_set - update the cpu priority setting 126 * cpupri_set - update the CPU priority setting
132 * @cp: The cpupri context 127 * @cp: The cpupri context
133 * @cpu: The target cpu 128 * @cpu: The target CPU
134 * @newpri: The priority (INVALID-RT99) to assign to this CPU 129 * @newpri: The priority (INVALID-RT99) to assign to this CPU
135 * 130 *
136 * Note: Assumes cpu_rq(cpu)->lock is locked 131 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 return; 146 return;
152 147
153 /* 148 /*
154 * If the cpu was currently mapped to a different value, we 149 * If the CPU was currently mapped to a different value, we
155 * need to map it to the new value then remove the old value. 150 * need to map it to the new value then remove the old value.
156 * Note, we must add the new value first, otherwise we risk the 151 * Note, we must add the new value first, otherwise we risk the
157 * cpu being missed by the priority loop in cpupri_find. 152 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUPRI_H
3#define _LINUX_CPUPRI_H
4
5#include <linux/sched.h>
6 2
7#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 3#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
8 4
9#define CPUPRI_INVALID -1 5#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 6#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1 7#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */ 8/* values 2-101 are RT priorities 0-99 */
13 9
14struct cpupri_vec { 10struct cpupri_vec {
15 atomic_t count; 11 atomic_t count;
16 cpumask_var_t mask; 12 cpumask_var_t mask;
17}; 13};
18 14
19struct cpupri { 15struct cpupri {
20 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 16 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
21 int *cpu_to_pri; 17 int *cpu_to_pri;
22}; 18};
23 19
24#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
25int cpupri_find(struct cpupri *cp, 21int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
26 struct task_struct *p, struct cpumask *lowest_mask);
27void cpupri_set(struct cpupri *cp, int cpu, int pri); 22void cpupri_set(struct cpupri *cp, int cpu, int pri);
28int cpupri_init(struct cpupri *cp); 23int cpupri_init(struct cpupri *cp);
29void cpupri_cleanup(struct cpupri *cp); 24void cpupri_cleanup(struct cpupri *cp);
30#endif 25#endif
31
32#endif /* _LINUX_CPUPRI_H */
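The constants above pin down the 102-entry priority space the cpupri.c comment refers to: -1 is invalid, 0 is idle, 1 is normal, and 2-101 carry the RT priorities 0-99. The conversion from the 140-based task->prio is not part of these hunks, so the following is only an approximate sketch of the mapping those constants imply (MAX_PRIO and MAX_RT_PRIO come from <linux/sched/prio.h>):

/* Approximate sketch: map a 140-based task->prio onto the 102-based cpupri scale. */
static int convert_prio_sketch(int prio)
{
        if (prio == CPUPRI_INVALID)
                return CPUPRI_INVALID;
        if (prio == MAX_PRIO)           /* the idle task */
                return CPUPRI_IDLE;
        if (prio >= MAX_RT_PRIO)        /* normal (fair-class) tasks */
                return CPUPRI_NORMAL;

        /* RT: prio 99 (lowest RT) -> 2 ... prio 0 (highest RT) -> 101 */
        return MAX_RT_PRIO - prio + 1;
}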
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
1#include <linux/export.h> 1/*
2#include <linux/sched.h> 2 * Simple CPU accounting cgroup controller
3#include <linux/tsacct_kern.h> 3 */
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
7#include <linux/sched/cputime.h>
8#include "sched.h" 4#include "sched.h"
9 5
10#ifdef CONFIG_IRQ_TIME_ACCOUNTING 6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 109}
114 110
115/* 111/*
116 * Account user cpu time to a process. 112 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 113 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 114 * @cputime: the CPU time spent in user space since the last update
119 */ 115 */
120void account_user_time(struct task_struct *p, u64 cputime) 116void account_user_time(struct task_struct *p, u64 cputime)
121{ 117{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 131}
136 132
137/* 133/*
138 * Account guest cpu time to a process. 134 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 135 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 136 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 137 */
142void account_guest_time(struct task_struct *p, u64 cputime) 138void account_guest_time(struct task_struct *p, u64 cputime)
143{ 139{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 155}
160 156
161/* 157/*
162 * Account system cpu time to a process and desired cpustat field 158 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 159 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 160 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 161 * @index: pointer to cpustat field that has to be updated
166 */ 162 */
167void account_system_index_time(struct task_struct *p, 163void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
179} 175}
180 176
181/* 177/*
182 * Account system cpu time to a process. 178 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 179 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 180 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 181 * @cputime: the CPU time spent in kernel space since the last update
186 */ 182 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 183void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 184{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 201
206/* 202/*
207 * Account for involuntary wait time. 203 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 204 * @cputime: the CPU time spent in involuntary wait
209 */ 205 */
210void account_steal_time(u64 cputime) 206void account_steal_time(u64 cputime)
211{ 207{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
216 212
217/* 213/*
218 * Account for idle time. 214 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 215 * @cputime: the CPU time spent in idle wait
220 */ 216 */
221void account_idle_time(u64 cputime) 217void account_idle_time(u64 cputime)
222{ 218{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 334#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 335/*
340 * Account a tick to a process and cpustat 336 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 337 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 338 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 339 * @rq: the pointer to rq
344 * 340 *
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 396 irqtime_account_process_tick(current, 0, rq, ticks);
401} 397}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 398#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 399static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 400static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 401 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 403
408/* 404/*
409 * Use precise platform statistics if available: 405 * Use precise platform statistics if available:
410 */ 406 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 407#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 409void vtime_common_task_switch(struct task_struct *prev)
415{ 410{
416 if (is_idle_task(prev)) 411 if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 416 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 417 arch_vtime_task_switch(prev);
423} 418}
424#endif 419# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 420#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 421
428 422
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 463 *ut = cputime.utime;
470 *st = cputime.stime; 464 *st = cputime.stime;
471} 465}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 466
467#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468
473/* 469/*
474 * Account a single tick of cpu time. 470 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 471 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 472 * @user_tick: indicates if the tick is a user or a system tick
477 */ 473 */
478void account_process_tick(struct task_struct *p, int user_tick) 474void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..8b7c2b35bec9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19 19
20#include <linux/slab.h>
21#include <uapi/linux/sched/types.h>
22
23struct dl_bandwidth def_dl_bandwidth; 20struct dl_bandwidth def_dl_bandwidth;
24 21
25static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 22static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
514static void push_dl_tasks(struct rq *); 511static void push_dl_tasks(struct rq *);
515static void pull_dl_task(struct rq *); 512static void pull_dl_task(struct rq *);
516 513
517static inline void queue_push_tasks(struct rq *rq) 514static inline void deadline_queue_push_tasks(struct rq *rq)
518{ 515{
519 if (!has_pushable_dl_tasks(rq)) 516 if (!has_pushable_dl_tasks(rq))
520 return; 517 return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
522 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); 519 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
523} 520}
524 521
525static inline void queue_pull_task(struct rq *rq) 522static inline void deadline_queue_pull_task(struct rq *rq)
526{ 523{
527 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); 524 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
528} 525}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 536
540 /* 537 /*
541 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 539 * online CPU:
543 */ 540 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
546 /* 543 /*
547 * Fail to find any suitable cpu. 544 * Failed to find any suitable CPU.
548 * The task will never come back! 545 * The task will never come back!
549 */ 546 */
550 BUG_ON(dl_bandwidth_enabled()); 547 BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
597{ 594{
598} 595}
599 596
600static inline void queue_push_tasks(struct rq *rq) 597static inline void deadline_queue_push_tasks(struct rq *rq)
601{ 598{
602} 599}
603 600
604static inline void queue_pull_task(struct rq *rq) 601static inline void deadline_queue_pull_task(struct rq *rq)
605{ 602{
606} 603}
607#endif /* CONFIG_SMP */ 604#endif /* CONFIG_SMP */
608 605
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 606static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 607static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 608static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 609
614/* 610/*
615 * We are being explicitly informed that a new instance is starting, 611 * We are being explicitly informed that a new instance is starting,
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1763 if (hrtick_enabled(rq)) 1759 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p); 1760 start_hrtick_dl(rq, p);
1765 1761
1766 queue_push_tasks(rq); 1762 deadline_queue_push_tasks(rq);
1767 1763
1768 return p; 1764 return p;
1769} 1765}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776 enqueue_pushable_dl_task(rq, p); 1772 enqueue_pushable_dl_task(rq, p);
1777} 1773}
1778 1774
1775/*
1776 * scheduler tick hitting a task of our scheduling class.
1777 *
1778 * NOTE: This function can be called remotely by the tick offload that
1779 * goes along full dynticks. Therefore no local assumption can be made
1780 * and everything must be accessed through the @rq and @curr passed in
1781 * parameters.
1782 */
1779static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) 1783static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1780{ 1784{
1781 update_curr_dl(rq); 1785 update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
1865 1869
1866 /* 1870 /*
1867 * We have to consider system topology and task affinity 1871 * We have to consider system topology and task affinity
1868 * first, then we can look for a suitable cpu. 1872 * first, then we can look for a suitable CPU.
1869 */ 1873 */
1870 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1874 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1871 return -1; 1875 return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
1879 * Now we check how well this matches with task's 1883 * Now we check how well this matches with task's
1880 * affinity and system topology. 1884 * affinity and system topology.
1881 * 1885 *
1882 * The last cpu where the task run is our first 1886 * The last CPU where the task run is our first
1883 * guess, since it is most likely cache-hot there. 1887 * guess, since it is most likely cache-hot there.
1884 */ 1888 */
1885 if (cpumask_test_cpu(cpu, later_mask)) 1889 if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
1909 best_cpu = cpumask_first_and(later_mask, 1913 best_cpu = cpumask_first_and(later_mask,
1910 sched_domain_span(sd)); 1914 sched_domain_span(sd));
1911 /* 1915 /*
1912 * Last chance: if a cpu being in both later_mask 1916 * Last chance: if a CPU being in both later_mask
1913 * and current sd span is valid, that becomes our 1917 * and current sd span is valid, that becomes our
1914 * choice. Of course, the latest possible cpu is 1918 * choice. Of course, the latest possible CPU is
1915 * already under consideration through later_mask. 1919 * already under consideration through later_mask.
1916 */ 1920 */
1917 if (best_cpu < nr_cpu_ids) { 1921 if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
2067 if (task == next_task) { 2071 if (task == next_task) {
2068 /* 2072 /*
2069 * The task is still there. We don't try 2073 * The task is still there. We don't try
2070 * again, some other cpu will pull it when ready. 2074 * again, some other CPU will pull it when ready.
2071 */ 2075 */
2072 goto out; 2076 goto out;
2073 } 2077 }
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2300 /* 2304 /*
2301 * Since this might be the only -deadline task on the rq, 2305 * Since this might be the only -deadline task on the rq,
2302 * this is the right place to try to pull some other one 2306 * this is the right place to try to pull some other one
2303 * from an overloaded cpu, if any. 2307 * from an overloaded CPU, if any.
2304 */ 2308 */
2305 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2309 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2306 return; 2310 return;
2307 2311
2308 queue_pull_task(rq); 2312 deadline_queue_pull_task(rq);
2309} 2313}
2310 2314
2311/* 2315/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2327 if (rq->curr != p) { 2331 if (rq->curr != p) {
2328#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2329 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 2333 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2330 queue_push_tasks(rq); 2334 deadline_queue_push_tasks(rq);
2331#endif 2335#endif
2332 if (dl_task(rq->curr)) 2336 if (dl_task(rq->curr))
2333 check_preempt_curr_dl(rq, p, 0); 2337 check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2352 * or lowering its prio, so... 2356 * or lowering its prio, so...
2353 */ 2357 */
2354 if (!rq->dl.overloaded) 2358 if (!rq->dl.overloaded)
2355 queue_pull_task(rq); 2359 deadline_queue_pull_task(rq);
2356 2360
2357 /* 2361 /*
2358 * If we now have an earlier deadline task than p, 2362 * If we now have an earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
2626{ 2630{
2627 struct sched_dl_entity *dl_se = &p->dl; 2631 struct sched_dl_entity *dl_se = &p->dl;
2628 2632
2629 dl_se->dl_runtime = 0; 2633 dl_se->dl_runtime = 0;
2630 dl_se->dl_deadline = 0; 2634 dl_se->dl_deadline = 0;
2631 dl_se->dl_period = 0; 2635 dl_se->dl_period = 0;
2632 dl_se->flags = 0; 2636 dl_se->flags = 0;
2633 dl_se->dl_bw = 0; 2637 dl_se->dl_bw = 0;
2634 dl_se->dl_density = 0; 2638 dl_se->dl_density = 0;
2635 2639
2636 dl_se->dl_throttled = 0; 2640 dl_se->dl_throttled = 0;
2637 dl_se->dl_yielded = 0; 2641 dl_se->dl_yielded = 0;
2638 dl_se->dl_non_contending = 0; 2642 dl_se->dl_non_contending = 0;
2639 dl_se->dl_overrun = 0; 2643 dl_se->dl_overrun = 0;
2640} 2644}
2641 2645
2642bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2646bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2655#ifdef CONFIG_SMP 2659#ifdef CONFIG_SMP
2656int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2660int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2657{ 2661{
2658 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2662 unsigned int dest_cpu;
2659 cs_cpus_allowed);
2660 struct dl_bw *dl_b; 2663 struct dl_bw *dl_b;
2661 bool overflow; 2664 bool overflow;
2662 int cpus, ret; 2665 int cpus, ret;
2663 unsigned long flags; 2666 unsigned long flags;
2664 2667
2668 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2669
2665 rcu_read_lock_sched(); 2670 rcu_read_lock_sched();
2666 dl_b = dl_bw_of(dest_cpu); 2671 dl_b = dl_bw_of(dest_cpu);
2667 raw_spin_lock_irqsave(&dl_b->lock, flags); 2672 raw_spin_lock_irqsave(&dl_b->lock, flags);
2668 cpus = dl_bw_cpus(dest_cpu); 2673 cpus = dl_bw_cpus(dest_cpu);
2669 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2674 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2670 if (overflow) 2675 if (overflow) {
2671 ret = -EBUSY; 2676 ret = -EBUSY;
2672 else { 2677 } else {
2673 /* 2678 /*
2674 * We reserve space for this task in the destination 2679 * We reserve space for this task in the destination
2675 * root_domain, as we can't fail after this point. 2680 * root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2681 } 2686 }
2682 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2687 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2683 rcu_read_unlock_sched(); 2688 rcu_read_unlock_sched();
2689
2684 return ret; 2690 return ret;
2685} 2691}
2686 2692
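
The admission check in dl_task_can_attach() above boils down to comparing the task's deadline bandwidth against what the destination root domain still has free across its CPUs. The following is a minimal userspace sketch of that arithmetic; the struct and helper are illustrative stand-ins for the kernel's dl_bw bookkeeping and __dl_overflow() test, and the numbers are made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified view of a root domain's deadline-bandwidth accounting. */
struct dl_bw_sketch {
        int64_t bw;        /* per-CPU bandwidth limit, -1 == no limit   */
        int64_t total_bw;  /* bandwidth already admitted on this domain */
};

/* Would admitting 'new_bw' exceed the capacity of 'cpus' CPUs? */
static bool dl_overflow_sketch(const struct dl_bw_sketch *b, int cpus, int64_t new_bw)
{
        return b->bw != -1 && b->bw * cpus < b->total_bw + new_bw;
}

int main(void)
{
        struct dl_bw_sketch b = { .bw = 950000, .total_bw = 1800000 };

        /* Two CPUs give 1900000 units of capacity; a 150000 request overflows. */
        printf("overflow: %d\n", dl_overflow_sketch(&b, 2, 150000));
        return 0;
}
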
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2701 ret = 0; 2707 ret = 0;
2702 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2708 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2703 rcu_read_unlock_sched(); 2709 rcu_read_unlock_sched();
2710
2704 return ret; 2711 return ret;
2705} 2712}
2706 2713
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
2718 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2725 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2726 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2720 rcu_read_unlock_sched(); 2727 rcu_read_unlock_sched();
2728
2721 return overflow; 2729 return overflow;
2722} 2730}
2723#endif 2731#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 72c401b3b15c..99e825b76633 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * kernel/sched/debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree and other debugging details
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
@@ -9,16 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched/mm.h>
15#include <linux/sched/task.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/utsname.h>
19#include <linux/mempolicy.h>
20#include <linux/debugfs.h>
21
22#include "sched.h" 12#include "sched.h"
23 13
24static DEFINE_SPINLOCK(sched_debug_lock); 14static DEFINE_SPINLOCK(sched_debug_lock);
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 264 if (table == NULL)
275 return NULL; 265 return NULL;
276 266
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 267 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 268 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 269 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 270 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 271 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 272 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 273 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 274 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 275 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 276 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 277 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 278 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 279 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 280 /* &table[13] is terminator */
306 281
307 return table; 282 return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 307 return table;
333} 308}
334 309
335static cpumask_var_t sd_sysctl_cpus; 310static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 311static struct ctl_table_header *sd_sysctl_header;
337 312
338void register_sched_domain_sysctl(void) 313void register_sched_domain_sysctl(void)
339{ 314{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 388{
414 struct sched_entity *se = tg->se[cpu]; 389 struct sched_entity *se = tg->se[cpu];
415 390
416#define P(F) \ 391#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 392#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 393#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 394#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 395
425 if (!se) 396 if (!se)
426 return; 397 return;
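
The PN()/PN_SCHEDSTAT() macros condensed above print a nanosecond counter as "whole.fraction". A hedged userspace sketch of that split follows, assuming the usual divide-by-1e6 factoring that matches the "%lld.%06ld" format; it is not the kernel's SPLIT_NS()/nsec_high()/nsec_low() code.

#include <stdio.h>

/* Split a nanosecond counter the way the "%lld.%06ld" format expects. */
static void print_ns(const char *name, unsigned long long nsec)
{
        long long high = (long long)(nsec / 1000000ULL);  /* whole part   */
        long rem = (long)(nsec % 1000000ULL);             /* 6-digit tail */

        printf(" .%-30s: %lld.%06ld\n", name, high, rem);
}

int main(void)
{
        print_ns("se->sum_exec_runtime", 123456789ULL);   /* prints 123.456789 */
        return 0;
}
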
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 399 PN(se->exec_start);
429 PN(se->vruntime); 400 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 401 PN(se->sum_exec_runtime);
402
431 if (schedstat_enabled()) { 403 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 404 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 405 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 412 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 413 P_SCHEDSTAT(se->statistics.wait_count);
442 } 414 }
415
443 P(se->load.weight); 416 P(se->load.weight);
444 P(se->runnable_weight); 417 P(se->runnable_weight);
445#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 437 return group_path;
465 438
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 439 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
440
467 return group_path; 441 return group_path;
468} 442}
469#endif 443#endif
@@ -804,9 +778,9 @@ void sysrq_sched_debug_show(void)
804/* 778/*
805 * This iterator needs some explanation. 779 * This iterator needs some explanation.
806 * It returns 1 for the header position. 780 * It returns 1 for the header position.
807 * This means 2 is cpu 0. 781 * This means 2 is CPU 0.
808 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 782 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
809 * to use cpumask_* to iterate over the cpus. 783 * to use cpumask_* to iterate over the CPUs.
810 */ 784 */
811static void *sched_debug_start(struct seq_file *file, loff_t *offset) 785static void *sched_debug_start(struct seq_file *file, loff_t *offset)
812{ 786{
@@ -826,6 +800,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
826 800
827 if (n < nr_cpu_ids) 801 if (n < nr_cpu_ids)
828 return (void *)(unsigned long)(n + 2); 802 return (void *)(unsigned long)(n + 2);
803
829 return NULL; 804 return NULL;
830} 805}
831 806
@@ -840,10 +815,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
840} 815}
841 816
842static const struct seq_operations sched_debug_sops = { 817static const struct seq_operations sched_debug_sops = {
843 .start = sched_debug_start, 818 .start = sched_debug_start,
844 .next = sched_debug_next, 819 .next = sched_debug_next,
845 .stop = sched_debug_stop, 820 .stop = sched_debug_stop,
846 .show = sched_debug_show, 821 .show = sched_debug_show,
847}; 822};
848 823
849static int sched_debug_release(struct inode *inode, struct file *file) 824static int sched_debug_release(struct inode *inode, struct file *file)
@@ -881,14 +856,10 @@ static int __init init_sched_debug_procfs(void)
881 856
882__initcall(init_sched_debug_procfs); 857__initcall(init_sched_debug_procfs);
883 858
884#define __P(F) \ 859#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
885 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 860#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
886#define P(F) \ 861#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
887 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 862#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
888#define __PN(F) \
889 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
890#define PN(F) \
891 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
892 863
893 864
894#ifdef CONFIG_NUMA_BALANCING 865#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..f5591071ae98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23 23#include "sched.h"
24#include <linux/sched/mm.h>
25#include <linux/sched/topology.h>
26
27#include <linux/latencytop.h>
28#include <linux/cpumask.h>
29#include <linux/cpuidle.h>
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
33#include <linux/mempolicy.h>
34#include <linux/migrate.h>
35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
37 24
38#include <trace/events/sched.h> 25#include <trace/events/sched.h>
39 26
40#include "sched.h"
41
42/* 27/*
43 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
44 * 29 *
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 88
104#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
105/* 90/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 91 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 92 */
108int __weak arch_asym_cpu_priority(int cpu) 93int __weak arch_asym_cpu_priority(int cpu)
109{ 94{
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1166}
1182 1167
1183/* 1168/*
1184 * The averaged statistics, shared & private, memory & cpu, 1169 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1170 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1171 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1172 * first set by task_numa_placement.
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1572 * be incurred if the tasks were swapped.
1588 */ 1573 */
1589 if (cur) { 1574 if (cur) {
1590 /* Skip this swap candidate if it cannot move to the source cpu */ 1575 /* Skip this swap candidate if it cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1576 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1577 goto unlock;
1593 1578
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1616 goto balance;
1632 } 1617 }
1633 1618
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1619 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1620 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1621 dst_rq->nr_running == 1)
1637 goto assign; 1622 goto assign;
@@ -1676,7 +1661,7 @@ balance:
1676 */ 1661 */
1677 if (!cur) { 1662 if (!cur) {
1678 /* 1663 /*
1679 * select_idle_siblings() uses a per-cpu cpumask that 1664 * select_idle_siblings() uses a per-CPU cpumask that
1680 * can be used from IRQ context. 1665 * can be used from IRQ context.
1681 */ 1666 */
1682 local_irq_disable(); 1667 local_irq_disable();
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
1869static void numa_migrate_preferred(struct task_struct *p) 1854static void numa_migrate_preferred(struct task_struct *p)
1870{ 1855{
1871 unsigned long interval = HZ; 1856 unsigned long interval = HZ;
1857 unsigned long numa_migrate_retry;
1872 1858
1873 /* This task has no NUMA fault statistics yet */ 1859 /* This task has no NUMA fault statistics yet */
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1860 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
1876 1862
1877 /* Periodically retry migrating the task to the preferred node */ 1863 /* Periodically retry migrating the task to the preferred node */
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1864 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval; 1865 numa_migrate_retry = jiffies + interval;
1866
1867 /*
1868 * Check that the new retry threshold is after the current one. If
1869 * the retry is in the future, it implies that wake_affine has
1870 * temporarily asked NUMA balancing to backoff from placement.
1871 */
1872 if (numa_migrate_retry > p->numa_migrate_retry)
1873 return;
1874
1875 /* Safe to try placing the task on the preferred node */
1876 p->numa_migrate_retry = numa_migrate_retry;
1880 1877
1881 /* Success if task is already running on preferred CPU */ 1878 /* Success if task is already running on preferred CPU */
1882 if (task_node(p) == p->numa_preferred_nid) 1879 if (task_node(p) == p->numa_preferred_nid)
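
The retry interval computed at the top of this hunk is the smaller of one second and a sixteenth of the task's NUMA scan period, converted to jiffies. A toy illustration of that arithmetic, assuming HZ=250 and a simple round-up conversion (both are assumptions for the example, not taken from this patch):

#include <stdio.h>

#define HZ_SKETCH 250   /* assumed tick rate for this illustration */

static unsigned long msecs_to_jiffies_sketch(unsigned long ms)
{
        return (ms * HZ_SKETCH + 999) / 1000;   /* round up to a whole tick */
}

int main(void)
{
        unsigned long interval = HZ_SKETCH;     /* 1 second cap */
        unsigned long scan_period_ms = 4000;    /* stand-in for p->numa_scan_period */
        unsigned long candidate = msecs_to_jiffies_sketch(scan_period_ms) / 16;

        if (candidate < interval)
                interval = candidate;           /* min(interval, ...) */

        printf("retry in %lu jiffies\n", interval);  /* 62 with these numbers */
        return 0;
}
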
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
2823} 2820}
2824 2821
2825#ifdef CONFIG_FAIR_GROUP_SCHED 2822#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2827/* 2824/*
2828 * All this does is approximate the hierarchical proportion which includes that 2825 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate. 2826 * global sum we all love to hate.
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974 2971
2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2972 return clamp_t(long, runnable, MIN_SHARES, shares);
2976} 2973}
2977# endif /* CONFIG_SMP */ 2974#endif /* CONFIG_SMP */
2978 2975
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2976static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980 2977
@@ -3350,7 +3347,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3350} 3347}
3351 3348
3352/* 3349/*
3353 * Called within set_task_rq() right before setting a task's cpu. The 3350 * Called within set_task_rq() right before setting a task's CPU. The
3354 * caller only guarantees p->pi_lock is held; no other assumptions, 3351 * caller only guarantees p->pi_lock is held; no other assumptions,
3355 * including the state of rq->lock, should be made. 3352 * including the state of rq->lock, should be made.
3356 */ 3353 */
@@ -3529,7 +3526,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3529 3526
3530 /* 3527 /*
3531 * runnable_sum can't be lower than running_sum 3528 * runnable_sum can't be lower than running_sum
3532 * As the running sum is scaled with cpu capacity whereas the runnable sum 3529 * As the running sum is scaled with CPU capacity whereas the runnable sum
3533 * is not, we rescale running_sum first 3530 * is not, we rescale running_sum first
3534 */ 3531 */
3535 running_sum = se->avg.util_sum / 3532 running_sum = se->avg.util_sum /
@@ -4676,7 +4673,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4676 if (!se) 4673 if (!se)
4677 add_nr_running(rq, task_delta); 4674 add_nr_running(rq, task_delta);
4678 4675
4679 /* determine whether we need to wake up potentially idle cpu */ 4676 /* Determine whether we need to wake up potentially idle CPU: */
4680 if (rq->curr == rq->idle && rq->cfs.nr_running) 4677 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq); 4678 resched_curr(rq);
4682} 4679}
@@ -5041,7 +5038,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5041} 5038}
5042 5039
5043/* 5040/*
5044 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5041 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5045 * 5042 *
5046 * The race is harmless, since modifying bandwidth settings of unhooked group 5043 * The race is harmless, since modifying bandwidth settings of unhooked group
5047 * bits doesn't do much. 5044 * bits doesn't do much.
@@ -5086,7 +5083,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 */ 5083 */
5087 cfs_rq->runtime_remaining = 1; 5084 cfs_rq->runtime_remaining = 1;
5088 /* 5085 /*
5089 * Offline rq is schedulable till cpu is completely disabled 5086 * Offline rq is schedulable till CPU is completely disabled
5090 * in take_cpu_down(), so we prevent new cfs throttling here. 5087 * in take_cpu_down(), so we prevent new cfs throttling here.
5091 */ 5088 */
5092 cfs_rq->runtime_enabled = 0; 5089 cfs_rq->runtime_enabled = 0;
@@ -5323,8 +5320,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5323 * 5320 *
5324 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5321 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5325 * 5322 *
5326 * If a cpu misses updates for n ticks (as it was idle) and update gets 5323 * If a CPU misses updates for n ticks (as it was idle) and update gets
5327 * called on the n+1-th tick when cpu may be busy, then we have: 5324 * called on the n+1-th tick when CPU may be busy, then we have:
5328 * 5325 *
5329 * load_n = (1 - 1/2^i)^n * load_0 5326 * load_n = (1 - 1/2^i)^n * load_0
5330 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5327 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
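
The comment above gives the decay rule load' = (1 - 1/2^i) * load + (1/2^i) * cur_load and its closed form for n missed idle ticks. A toy floating-point model of that decay follows; the kernel uses precomputed fixed-point tables instead, so this only illustrates the math.

#include <stdio.h>

/*
 * Apply load' = (1 - 1/2^idx) * load once per missed tick while the CPU
 * was idle (cur_load == 0), i.e. load_n = (1 - 1/2^idx)^n * load_0.
 */
static double decay_missed_ticks(double load, int idx, unsigned int n)
{
        double keep = 1.0 - 1.0 / (double)(1u << idx);

        while (n--)
                load *= keep;
        return load;
}

int main(void)
{
        /* Index 2 keeps 3/4 of the load per tick; after 8 idle ticks ~10% is left. */
        printf("decayed load: %.1f\n", decay_missed_ticks(1024.0, 2, 8));
        return 0;
}
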
@@ -5468,7 +5465,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5468#ifdef CONFIG_NO_HZ_COMMON 5465#ifdef CONFIG_NO_HZ_COMMON
5469/* 5466/*
5470 * There is no sane way to deal with nohz on smp when using jiffies because the 5467 * There is no sane way to deal with nohz on smp when using jiffies because the
5471 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5468 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5472 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5469 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5473 * 5470 *
5474 * Therefore we need to avoid the delta approach from the regular tick when 5471 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5576,7 @@ void cpu_load_update_active(struct rq *this_rq)
5579} 5576}
5580 5577
5581/* 5578/*
5582 * Return a low guess at the load of a migration-source cpu weighted 5579 * Return a low guess at the load of a migration-source CPU weighted
5583 * according to the scheduling class and "nice" value. 5580 * according to the scheduling class and "nice" value.
5584 * 5581 *
5585 * We want to under-estimate the load of migration sources, to 5582 * We want to under-estimate the load of migration sources, to
@@ -5597,7 +5594,7 @@ static unsigned long source_load(int cpu, int type)
5597} 5594}
5598 5595
5599/* 5596/*
5600 * Return a high guess at the load of a migration-target cpu weighted 5597 * Return a high guess at the load of a migration-target CPU weighted
5601 * according to the scheduling class and "nice" value. 5598 * according to the scheduling class and "nice" value.
5602 */ 5599 */
5603static unsigned long target_load(int cpu, int type) 5600static unsigned long target_load(int cpu, int type)
@@ -5724,7 +5721,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5724 unsigned long task_load; 5721 unsigned long task_load;
5725 5722
5726 this_eff_load = target_load(this_cpu, sd->wake_idx); 5723 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728 5724
5729 if (sync) { 5725 if (sync) {
5730 unsigned long current_load = task_h_load(current); 5726 unsigned long current_load = task_h_load(current);
@@ -5742,18 +5738,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5742 this_eff_load *= 100; 5738 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu); 5739 this_eff_load *= capacity_of(prev_cpu);
5744 5740
5741 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5745 prev_eff_load -= task_load; 5742 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS)) 5743 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5744 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu); 5745 prev_eff_load *= capacity_of(this_cpu);
5749 5746
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; 5747 /*
5748 * If sync, adjust the weight of prev_eff_load such that if
5749 * prev_eff == this_eff that select_idle_sibling() will consider
5750 * stacking the wakee on top of the waker if no other CPU is
5751 * idle.
5752 */
5753 if (sync)
5754 prev_eff_load += 1;
5755
5756 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5757}
5758
5759#ifdef CONFIG_NUMA_BALANCING
5760static void
5761update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5762{
5763 unsigned long interval;
5764
5765 if (!static_branch_likely(&sched_numa_balancing))
5766 return;
5767
5768 /* If balancing has no preference then continue gathering data */
5769 if (p->numa_preferred_nid == -1)
5770 return;
5771
5772 /*
5773 * If the wakeup is not affecting locality then it is neutral from
5774 * the perspective of NUMA balancing, so continue gathering data.
5775 */
5776 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5777 return;
5778
5779 /*
5780 * Temporarily prevent NUMA balancing from trying to place waker/wakee after
5781 * the wakee has been moved by wake_affine. This will potentially allow
5782 * related tasks to converge and update their data placement. The
5783 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5784 * hot data to the waker's node.
5785 */
5786 interval = max(sysctl_numa_balancing_scan_delay,
5787 p->numa_scan_period << 2);
5788 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5789
5790 interval = max(sysctl_numa_balancing_scan_delay,
5791 current->numa_scan_period << 2);
5792 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5793}
5794#else
5795static void
5796update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5797{
5751} 5798}
5799#endif
5752 5800
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5801static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync) 5802 int this_cpu, int prev_cpu, int sync)
5755{ 5803{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits; 5804 int target = nr_cpumask_bits;
5758 5805
5759 if (sched_feat(WA_IDLE)) 5806 if (sched_feat(WA_IDLE))
@@ -5766,6 +5813,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5766 if (target == nr_cpumask_bits) 5813 if (target == nr_cpumask_bits)
5767 return prev_cpu; 5814 return prev_cpu;
5768 5815
5816 update_wa_numa_placement(p, prev_cpu, target);
5769 schedstat_inc(sd->ttwu_move_affine); 5817 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5818 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target; 5819 return target;
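
The reworked wake_affine_weight() above cross-scales each side's load by the other CPU's capacity and, for sync wakeups, biases prev_eff_load up by one so that a tie now prefers the waking CPU. A compact sketch of that decision; the loads and capacities are made up, and the imbalance_pct and task_h_load() adjustments are deliberately omitted.

#include <stdbool.h>
#include <stdio.h>

/* true means "pull the wakee to this_cpu", false means "leave it on prev_cpu". */
static bool prefer_this_cpu(unsigned long this_load, unsigned long this_cap,
                            unsigned long prev_load, unsigned long prev_cap,
                            bool sync)
{
        unsigned long this_eff = this_load * prev_cap;  /* cross-scaled loads */
        unsigned long prev_eff = prev_load * this_cap;

        if (sync)
                prev_eff += 1;          /* sync breaks ties toward the waker */

        return this_eff < prev_eff;
}

int main(void)
{
        /* Equal loads and capacities: only a sync wakeup wins the tie. */
        printf("async: %d\n", prefer_this_cpu(512, 1024, 512, 1024, false));
        printf("sync : %d\n", prefer_this_cpu(512, 1024, 512, 1024, true));
        return 0;
}
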
@@ -5826,7 +5874,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5826 max_spare_cap = 0; 5874 max_spare_cap = 0;
5827 5875
5828 for_each_cpu(i, sched_group_span(group)) { 5876 for_each_cpu(i, sched_group_span(group)) {
5829 /* Bias balancing toward cpus of our domain */ 5877 /* Bias balancing toward CPUs of our domain */
5830 if (local_group) 5878 if (local_group)
5831 load = source_load(i, load_idx); 5879 load = source_load(i, load_idx);
5832 else 5880 else
@@ -5856,7 +5904,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5856 if (min_runnable_load > (runnable_load + imbalance)) { 5904 if (min_runnable_load > (runnable_load + imbalance)) {
5857 /* 5905 /*
5858 * The runnable load is significantly smaller 5906 * The runnable load is significantly smaller
5859 * so we can pick this new cpu 5907 * so we can pick this new CPU:
5860 */ 5908 */
5861 min_runnable_load = runnable_load; 5909 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load; 5910 min_avg_load = avg_load;
@@ -5865,7 +5913,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5865 (100*min_avg_load > imbalance_scale*avg_load)) { 5913 (100*min_avg_load > imbalance_scale*avg_load)) {
5866 /* 5914 /*
5867 * The runnable loads are close so take the 5915 * The runnable loads are close so take the
5868 * blocked load into account through avg_load. 5916 * blocked load into account through avg_load:
5869 */ 5917 */
5870 min_avg_load = avg_load; 5918 min_avg_load = avg_load;
5871 idlest = group; 5919 idlest = group;
@@ -5903,6 +5951,18 @@ skip_spare:
5903 if (!idlest) 5951 if (!idlest)
5904 return NULL; 5952 return NULL;
5905 5953
5954 /*
5955 * When comparing groups across NUMA domains, it's possible for the
5956 * local domain to be very lightly loaded relative to the remote
5957 * domains but "imbalance" skews the comparison making remote CPUs
5958 * look much more favourable. When considering cross-domain, add
5959 * imbalance to the runnable load on the remote node and consider
5960 * staying local.
5961 */
5962 if ((sd->flags & SD_NUMA) &&
5963 min_runnable_load + imbalance >= this_runnable_load)
5964 return NULL;
5965
5906 if (min_runnable_load > (this_runnable_load + imbalance)) 5966 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL; 5967 return NULL;
5908 5968
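
The hunk above makes find_idlest_group() stay local when the comparison crosses a NUMA boundary and the remote group is not clearly lighter even after being charged the imbalance margin. A small sketch of both checks with illustrative load values:

#include <stdbool.h>
#include <stdio.h>

static bool stay_local(bool numa_domain, unsigned long local_load,
                       unsigned long remote_load, unsigned long imbalance)
{
        /* Cross-node: the remote group must beat local even after paying the margin. */
        if (numa_domain && remote_load + imbalance >= local_load)
                return true;

        /* Existing check: remote is not clearly less loaded than local. */
        return remote_load > local_load + imbalance;
}

int main(void)
{
        /* A lightly loaded local node keeps the task despite a quieter remote node. */
        printf("stay local: %d\n", stay_local(true, 100, 60, 50));
        return 0;
}
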
@@ -5914,7 +5974,7 @@ skip_spare:
5914} 5974}
5915 5975
5916/* 5976/*
5917 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 5977 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5918 */ 5978 */
5919static int 5979static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5980find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5992,12 +6052,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
5992 6052
5993 new_cpu = find_idlest_group_cpu(group, p, cpu); 6053 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) { 6054 if (new_cpu == cpu) {
5995 /* Now try balancing at a lower domain level of cpu */ 6055 /* Now try balancing at a lower domain level of 'cpu': */
5996 sd = sd->child; 6056 sd = sd->child;
5997 continue; 6057 continue;
5998 } 6058 }
5999 6059
6000 /* Now try balancing at a lower domain level of new_cpu */ 6060 /* Now try balancing at a lower domain level of 'new_cpu': */
6001 cpu = new_cpu; 6061 cpu = new_cpu;
6002 weight = sd->span_weight; 6062 weight = sd->span_weight;
6003 sd = NULL; 6063 sd = NULL;
@@ -6007,7 +6067,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6007 if (tmp->flags & sd_flag) 6067 if (tmp->flags & sd_flag)
6008 sd = tmp; 6068 sd = tmp;
6009 } 6069 }
6010 /* while loop will break here if sd == NULL */
6011 } 6070 }
6012 6071
6013 return new_cpu; 6072 return new_cpu;
@@ -6203,12 +6262,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6203 return target; 6262 return target;
6204 6263
6205 /* 6264 /*
6206 * If the previous cpu is cache affine and idle, don't be stupid. 6265 * If the previous CPU is cache affine and idle, don't be stupid:
6207 */ 6266 */
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6267 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev; 6268 return prev;
6210 6269
6211 /* Check a recently used CPU as a potential idle candidate */ 6270 /* Check a recently used CPU as a potential idle candidate: */
6212 recent_used_cpu = p->recent_used_cpu; 6271 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev && 6272 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target && 6273 recent_used_cpu != target &&
@@ -6217,7 +6276,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6276 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218 /* 6277 /*
6219 * Replace recent_used_cpu with prev as it is a potential 6278 * Replace recent_used_cpu with prev as it is a potential
6220 * candidate for the next wake. 6279 * candidate for the next wake:
6221 */ 6280 */
6222 p->recent_used_cpu = prev; 6281 p->recent_used_cpu = prev;
6223 return recent_used_cpu; 6282 return recent_used_cpu;
@@ -6282,7 +6341,7 @@ static inline unsigned long task_util(struct task_struct *p)
6282} 6341}
6283 6342
6284/* 6343/*
6285 * cpu_util_wake: Compute cpu utilization with any contributions from 6344 * cpu_util_wake: Compute CPU utilization with any contributions from
6286 * the waking task p removed. 6345 * the waking task p removed.
6287 */ 6346 */
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6347static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
@@ -6328,10 +6387,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6328 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6387 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6329 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6388 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6330 * 6389 *
6331 * Balances load by selecting the idlest cpu in the idlest group, or under 6390 * Balances load by selecting the idlest CPU in the idlest group, or under
6332 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6391 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6333 * 6392 *
6334 * Returns the target cpu number. 6393 * Returns the target CPU number.
6335 * 6394 *
6336 * preempt must be disabled. 6395 * preempt must be disabled.
6337 */ 6396 */
@@ -6342,7 +6401,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6342 int cpu = smp_processor_id(); 6401 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu; 6402 int new_cpu = prev_cpu;
6344 int want_affine = 0; 6403 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC; 6404 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346 6405
6347 if (sd_flag & SD_BALANCE_WAKE) { 6406 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p); 6407 record_wakee(p);
@@ -6356,7 +6415,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6356 break; 6415 break;
6357 6416
6358 /* 6417 /*
6359 * If both cpu and prev_cpu are part of this domain, 6418 * If both 'cpu' and 'prev_cpu' are part of this domain,
6360 * cpu is a valid SD_WAKE_AFFINE target. 6419 * cpu is a valid SD_WAKE_AFFINE target.
6361 */ 6420 */
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6421 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6376,7 +6435,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6376 if (cpu == prev_cpu) 6435 if (cpu == prev_cpu)
6377 goto pick_cpu; 6436 goto pick_cpu;
6378 6437
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); 6438 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
6380 } 6439 }
6381 6440
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6441 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6407,9 +6466,9 @@ pick_cpu:
6407static void detach_entity_cfs_rq(struct sched_entity *se); 6466static void detach_entity_cfs_rq(struct sched_entity *se);
6408 6467
6409/* 6468/*
6410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6469 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6411 * cfs_rq_of(p) references at time of call are still valid and identify the 6470 * cfs_rq_of(p) references at time of call are still valid and identify the
6412 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6471 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6413 */ 6472 */
6414static void migrate_task_rq_fair(struct task_struct *p) 6473static void migrate_task_rq_fair(struct task_struct *p)
6415{ 6474{
@@ -6843,17 +6902,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6843 * BASICS 6902 * BASICS
6844 * 6903 *
6845 * The purpose of load-balancing is to achieve the same basic fairness the 6904 * The purpose of load-balancing is to achieve the same basic fairness the
6846 * per-cpu scheduler provides, namely provide a proportional amount of compute 6905 * per-CPU scheduler provides, namely provide a proportional amount of compute
6847 * time to each task. This is expressed in the following equation: 6906 * time to each task. This is expressed in the following equation:
6848 * 6907 *
6849 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 6908 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6850 * 6909 *
6851 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 6910 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6852 * W_i,0 is defined as: 6911 * W_i,0 is defined as:
6853 * 6912 *
6854 * W_i,0 = \Sum_j w_i,j (2) 6913 * W_i,0 = \Sum_j w_i,j (2)
6855 * 6914 *
6856 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 6915 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6857 * is derived from the nice value as per sched_prio_to_weight[]. 6916 * is derived from the nice value as per sched_prio_to_weight[].
6858 * 6917 *
6859 * The weight average is an exponential decay average of the instantaneous 6918 * The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +6920,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6861 * 6920 *
6862 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 6921 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6863 * 6922 *
6864 * C_i is the compute capacity of cpu i, typically it is the 6923 * C_i is the compute capacity of CPU i, typically it is the
6865 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 6924 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6866 * can also include other factors [XXX]. 6925 * can also include other factors [XXX].
6867 * 6926 *
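
As a worked instance of equations (1) and (2) quoted above: with two CPUs of equal capacity, two nice-0 tasks on CPU 0 and one on CPU 1 give unequal W_i/P_i ratios, which is exactly the imbalance the balancer corrects. The weights below are illustrative; 1024 is the weight a nice-0 task gets from sched_prio_to_weight[].

#include <stdio.h>

int main(void)
{
        unsigned long cap[2]    = { 1024, 1024 };          /* P_0, P_1 */
        unsigned long weight[2] = { 2 * 1024, 1 * 1024 };  /* W_0,0 and W_1,0, per (2) */

        /* (1) wants these ratios equal; moving ~512 units of weight fixes it. */
        printf("W_0/P_0 = %.2f, W_1/P_1 = %.2f\n",
               (double)weight[0] / cap[0], (double)weight[1] / cap[1]);
        return 0;
}
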
@@ -6882,11 +6941,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6882 * SCHED DOMAINS 6941 * SCHED DOMAINS
6883 * 6942 *
6884 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 6943 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6885 * for all i,j solution, we create a tree of cpus that follows the hardware 6944 * for all i,j solution, we create a tree of CPUs that follows the hardware
6886 * topology where each level pairs two lower groups (or better). This results 6945 * topology where each level pairs two lower groups (or better). This results
6887 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 6946 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6888 * tree to only the first of the previous level and we decrease the frequency 6947 * tree to only the first of the previous level and we decrease the frequency
6889 * of load-balance at each level inv. proportional to the number of cpus in 6948 * of load-balance at each level inv. proportional to the number of CPUs in
6890 * the groups. 6949 * the groups.
6891 * 6950 *
6892 * This yields: 6951 * This yields:
@@ -6895,7 +6954,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6895 * \Sum { --- * --- * 2^i } = O(n) (5) 6954 * \Sum { --- * --- * 2^i } = O(n) (5)
6896 * i = 0 2^i 2^i 6955 * i = 0 2^i 2^i
6897 * `- size of each group 6956 * `- size of each group
6898 * | | `- number of cpus doing load-balance 6957 * | | `- number of CPUs doing load-balance
6899 * | `- freq 6958 * | `- freq
6900 * `- sum over all levels 6959 * `- sum over all levels
6901 * 6960 *
@@ -6903,7 +6962,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6903 * this makes (5) the runtime complexity of the balancer. 6962 * this makes (5) the runtime complexity of the balancer.
6904 * 6963 *
6905 * An important property here is that each CPU is still (indirectly) connected 6964 * An important property here is that each CPU is still (indirectly) connected
6906 * to every other cpu in at most O(log n) steps: 6965 * to every other CPU in at most O(log n) steps:
6907 * 6966 *
6908 * The adjacency matrix of the resulting graph is given by: 6967 * The adjacency matrix of the resulting graph is given by:
6909 * 6968 *
@@ -6915,7 +6974,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6915 * 6974 *
6916 * A^(log_2 n)_i,j != 0 for all i,j (7) 6975 * A^(log_2 n)_i,j != 0 for all i,j (7)
6917 * 6976 *
6918 * Showing there's indeed a path between every cpu in at most O(log n) steps. 6977 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6919 * The task movement gives a factor of O(m), giving a convergence complexity 6978 * The task movement gives a factor of O(m), giving a convergence complexity
6920 * of: 6979 * of:
6921 * 6980 *
@@ -6925,7 +6984,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6925 * WORK CONSERVING 6984 * WORK CONSERVING
6926 * 6985 *
6927 * In order to avoid CPUs going idle while there's still work to do, new idle 6986 * In order to avoid CPUs going idle while there's still work to do, new idle
6928 * balancing is more aggressive and has the newly idle cpu iterate up the domain 6987 * balancing is more aggressive and has the newly idle CPU iterate up the domain
6929 * tree itself instead of relying on other CPUs to bring it work. 6988 * tree itself instead of relying on other CPUs to bring it work.
6930 * 6989 *
6931 * This adds some complexity to both (5) and (8) but it reduces the total idle 6990 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7005,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6946 * 7005 *
6947 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7006 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6948 * 7007 *
6949 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7008 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
6950 * 7009 *
6951 * The big problem is S_k, it's a global sum needed to compute a local (W_i) 7010 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
6952 * property. 7011 * property.
@@ -7110,7 +7169,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7110 env->flags |= LBF_SOME_PINNED; 7169 env->flags |= LBF_SOME_PINNED;
7111 7170
7112 /* 7171 /*
7113 * Remember if this task can be migrated to any other cpu in 7172 * Remember if this task can be migrated to any other CPU in
7114 * our sched_group. We may want to revisit it if we couldn't 7173 * our sched_group. We may want to revisit it if we couldn't
7115 * meet load balance goals by pulling other tasks on src_cpu. 7174 * meet load balance goals by pulling other tasks on src_cpu.
7116 * 7175 *
@@ -7120,7 +7179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7179 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0; 7180 return 0;
7122 7181
7123 /* Prevent re-selecting dst_cpu via env's cpus */ 7182 /* Prevent re-selecting dst_cpu via env's CPUs: */
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7183 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7184 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED; 7185 env->flags |= LBF_DST_PINNED;
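
The loop above looks for a CPU that is simultaneously in the destination group, still eligible in env->cpus, and allowed by the pinned task's affinity mask. A stand-alone sketch of that intersection walk using plain 64-bit masks instead of struct cpumask:

#include <stdint.h>
#include <stdio.h>

/* Return the first CPU (bit) set in all three masks, or -1 if none. */
static int first_allowed_dst(uint64_t dst_group, uint64_t env_cpus, uint64_t task_allowed)
{
        uint64_t candidates = dst_group & env_cpus & task_allowed;

        for (int cpu = 0; cpu < 64; cpu++)
                if (candidates & (1ULL << cpu))
                        return cpu;
        return -1;
}

int main(void)
{
        /* Task pinned to CPUs {2,3}; destination group {2,3}; CPU 2 already excluded. */
        printf("alternate dst_cpu: %d\n", first_allowed_dst(0xC, 0x8, 0xC));
        return 0;
}
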
@@ -7694,8 +7753,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7694 * Group imbalance indicates (and tries to solve) the problem where balancing 7753 * Group imbalance indicates (and tries to solve) the problem where balancing
7695 * groups is inadequate due to ->cpus_allowed constraints. 7754 * groups is inadequate due to ->cpus_allowed constraints.
7696 * 7755 *
7697 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 7756 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7698 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 7757 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7699 * Something like: 7758 * Something like:
7700 * 7759 *
7701 * { 0 1 2 3 } { 4 5 6 7 } 7760 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +7762,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7703 * 7762 *
7704 * If we were to balance group-wise we'd place two tasks in the first group and 7763 * If we were to balance group-wise we'd place two tasks in the first group and
7705 * two tasks in the second group. Clearly this is undesired as it will overload 7764 * two tasks in the second group. Clearly this is undesired as it will overload
7706 * cpu 3 and leave one of the cpus in the second group unused. 7765 * cpu 3 and leave one of the CPUs in the second group unused.
7707 * 7766 *
7708 * The current solution to this issue is detecting the skew in the first group 7767 * The current solution to this issue is detecting the skew in the first group
7709 * by noticing the lower domain failed to reach balance and had difficulty 7768 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7816,7 +7875,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 7875 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i); 7876 struct rq *rq = cpu_rq(i);
7818 7877
7819 /* Bias balancing toward cpus of our domain */ 7878 /* Bias balancing toward CPUs of our domain: */
7820 if (local_group) 7879 if (local_group)
7821 load = target_load(i, load_idx); 7880 load = target_load(i, load_idx);
7822 else 7881 else
@@ -7902,7 +7961,7 @@ asym_packing:
7902 if (!(env->sd->flags & SD_ASYM_PACKING)) 7961 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true; 7962 return true;
7904 7963
7905 /* No ASYM_PACKING if target cpu is already busy */ 7964 /* No ASYM_PACKING if target CPU is already busy */
7906 if (env->idle == CPU_NOT_IDLE) 7965 if (env->idle == CPU_NOT_IDLE)
7907 return true; 7966 return true;
7908 /* 7967 /*
@@ -7915,7 +7974,7 @@ asym_packing:
7915 if (!sds->busiest) 7974 if (!sds->busiest)
7916 return true; 7975 return true;
7917 7976
7918 /* Prefer to move from lowest priority cpu's work */ 7977 /* Prefer to move from lowest priority CPU's work */
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 7978 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu)) 7979 sg->asym_prefer_cpu))
7921 return true; 7980 return true;
@@ -8168,7 +8227,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8168 if (busiest->group_type == group_imbalanced) { 8227 if (busiest->group_type == group_imbalanced) {
8169 /* 8228 /*
8170 * In the group_imb case we cannot rely on group-wide averages 8229 * In the group_imb case we cannot rely on group-wide averages
8171 * to ensure cpu-load equilibrium, look at wider averages. XXX 8230 * to ensure CPU-load equilibrium, look at wider averages. XXX
8172 */ 8231 */
8173 busiest->load_per_task = 8232 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load); 8233 min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8246,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8187 } 8246 }
8188 8247
8189 /* 8248 /*
8190 * If there aren't any idle cpus, avoid creating some. 8249 * If there aren't any idle CPUs, avoid creating some.
8191 */ 8250 */
8192 if (busiest->group_type == group_overloaded && 8251 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) { 8252 local->group_type == group_overloaded) {
@@ -8201,9 +8260,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8201 } 8260 }
8202 8261
8203 /* 8262 /*
8204 * We're trying to get all the cpus to the average_load, so we don't 8263 * We're trying to get all the CPUs to the average_load, so we don't
8205 * want to push ourselves above the average load, nor do we wish to 8264 * want to push ourselves above the average load, nor do we wish to
8206 * reduce the max loaded cpu below the average load. At the same time, 8265 * reduce the max loaded CPU below the average load. At the same time,
8207 * we also don't want to reduce the group load below the group 8266 * we also don't want to reduce the group load below the group
8208 * capacity. Thus we look for the minimum possible imbalance. 8267 * capacity. Thus we look for the minimum possible imbalance.
8209 */ 8268 */
@@ -8297,9 +8356,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8297 8356
8298 if (env->idle == CPU_IDLE) { 8357 if (env->idle == CPU_IDLE) {
8299 /* 8358 /*
8300 * This cpu is idle. If the busiest group is not overloaded 8359 * This CPU is idle. If the busiest group is not overloaded
8301 * and there is no imbalance between this and busiest group 8360 * and there is no imbalance between this and busiest group
8302 * wrt idle cpus, it is balanced. The imbalance becomes 8361 * wrt idle CPUs, it is balanced. The imbalance becomes
8303 * significant if the diff is greater than 1 otherwise we 8362 * significant if the diff is greater than 1 otherwise we
8304 * might end up to just move the imbalance on another group 8363 * might end up to just move the imbalance on another group
8305 */ 8364 */
@@ -8327,7 +8386,7 @@ out_balanced:
8327} 8386}
8328 8387
8329/* 8388/*
8330 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8389 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8331 */ 8390 */
8332static struct rq *find_busiest_queue(struct lb_env *env, 8391static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group) 8392 struct sched_group *group)
@@ -8371,7 +8430,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8371 8430
8372 /* 8431 /*
8373 * When comparing with imbalance, use weighted_cpuload() 8432 * When comparing with imbalance, use weighted_cpuload()
8374 * which is not scaled with the cpu capacity. 8433 * which is not scaled with the CPU capacity.
8375 */ 8434 */
8376 8435
8377 if (rq->nr_running == 1 && wl > env->imbalance && 8436 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8438,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8379 continue; 8438 continue;
8380 8439
8381 /* 8440 /*
8382 * For the load comparisons with the other cpus, consider 8441 * For the load comparisons with the other CPUs, consider
8383 * the weighted_cpuload() scaled with the cpu capacity, so 8442 * the weighted_cpuload() scaled with the CPU capacity, so
8384 * that the load can be moved away from the cpu that is 8443 * that the load can be moved away from the CPU that is
8385 * potentially running at a lower capacity. 8444 * potentially running at a lower capacity.
8386 * 8445 *
8387 * Thus we're looking for max(wl_i / capacity_i), crosswise 8446 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8511,13 @@ static int should_we_balance(struct lb_env *env)
8452 return 0; 8511 return 0;
8453 8512
8454 /* 8513 /*
8455 * In the newly idle case, we will allow all the cpu's 8514 * In the newly idle case, we will allow all the CPUs
8456 * to do the newly idle load balance. 8515 * to do the newly idle load balance.
8457 */ 8516 */
8458 if (env->idle == CPU_NEWLY_IDLE) 8517 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1; 8518 return 1;
8460 8519
8461 /* Try to find first idle cpu */ 8520 /* Try to find first idle CPU */
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8521 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu)) 8522 if (!idle_cpu(cpu))
8464 continue; 8523 continue;
@@ -8471,7 +8530,7 @@ static int should_we_balance(struct lb_env *env)
8471 balance_cpu = group_balance_cpu(sg); 8530 balance_cpu = group_balance_cpu(sg);
8472 8531
8473 /* 8532 /*
8474 * First idle cpu or the first cpu(busiest) in this sched group 8533 * First idle CPU or the first CPU(busiest) in this sched group
8475 * is eligible for doing load balancing at this and above domains. 8534 * is eligible for doing load balancing at this and above domains.
8476 */ 8535 */
8477 return balance_cpu == env->dst_cpu; 8536 return balance_cpu == env->dst_cpu;
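
should_we_balance() above elects one CPU per group to run the balance pass: the first idle CPU in the balance mask if there is one, otherwise the group's designated balance CPU. A compact sketch of that election; the masks are illustrative and the fallback here simply takes the group's first CPU.

#include <stdint.h>
#include <stdio.h>

/* Pick the balancing CPU: first idle CPU in the group, else the group's first CPU. */
static int elect_balance_cpu(uint64_t group_mask, uint64_t idle_mask)
{
        uint64_t idle_in_group = group_mask & idle_mask;
        uint64_t pick = idle_in_group ? idle_in_group : group_mask;

        for (int cpu = 0; cpu < 64; cpu++)
                if (pick & (1ULL << cpu))
                        return cpu;
        return -1;
}

int main(void)
{
        /* Group {4..7}, CPU 6 is idle: only CPU 6 proceeds with this balance pass. */
        printf("balance_cpu: %d\n", elect_balance_cpu(0xF0, 0x40));
        return 0;
}
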
@@ -8580,7 +8639,7 @@ more_balance:
8580 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8639 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8581 * us and move them to an alternate dst_cpu in our sched_group 8640 * us and move them to an alternate dst_cpu in our sched_group
8582 * where they can run. The upper limit on how many times we 8641 * where they can run. The upper limit on how many times we
8583 * iterate on same src_cpu is dependent on number of cpus in our 8642 * iterate on same src_cpu is dependent on number of CPUs in our
8584 * sched_group. 8643 * sched_group.
8585 * 8644 *
8586 * This changes load balance semantics a bit on who can move 8645 * This changes load balance semantics a bit on who can move
@@ -8597,7 +8656,7 @@ more_balance:
8597 */ 8656 */
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8657 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599 8658
8600 /* Prevent re-selecting dst_cpu via env's cpus */ 8659 /* Prevent re-selecting dst_cpu via env's CPUs */
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8660 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602 8661
8603 env.dst_rq = cpu_rq(env.new_dst_cpu); 8662 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +8718,10 @@ more_balance:
8659 8718
8660 raw_spin_lock_irqsave(&busiest->lock, flags); 8719 raw_spin_lock_irqsave(&busiest->lock, flags);
8661 8720
8662 /* don't kick the active_load_balance_cpu_stop, 8721 /*
8663 * if the curr task on busiest cpu can't be 8722 * Don't kick the active_load_balance_cpu_stop,
8664 * moved to this_cpu 8723 * if the curr task on busiest CPU can't be
8724 * moved to this_cpu:
8665 */ 8725 */
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8726 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock, 8727 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8887,7 +8947,7 @@ out:
8887} 8947}
8888 8948
8889/* 8949/*
8890 * active_load_balance_cpu_stop is run by cpu stopper. It pushes 8950 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8891 * running tasks off the busiest CPU onto idle CPUs. It requires at 8951 * running tasks off the busiest CPU onto idle CPUs. It requires at
8892 * least 1 task to be running on each physical CPU where possible, and 8952 * least 1 task to be running on each physical CPU where possible, and
8893 * avoids physical / logical imbalances. 8953 * avoids physical / logical imbalances.
@@ -8911,7 +8971,7 @@ static int active_load_balance_cpu_stop(void *data)
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 8971 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock; 8972 goto out_unlock;
8913 8973
8914 /* make sure the requested cpu hasn't gone down in the meantime */ 8974 /* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() || 8975 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance)) 8976 !busiest_rq->active_balance))
8917 goto out_unlock; 8977 goto out_unlock;
@@ -8923,7 +8983,7 @@ static int active_load_balance_cpu_stop(void *data)
8923 /* 8983 /*
8924 * This condition is "impossible", if it occurs 8984 * This condition is "impossible", if it occurs
8925 * we need to fix it. Originally reported by 8985 * we need to fix it. Originally reported by
8926 * Bjorn Helgaas on a 128-cpu setup. 8986 * Bjorn Helgaas on a 128-CPU setup.
8927 */ 8987 */
8928 BUG_ON(busiest_rq == target_rq); 8988 BUG_ON(busiest_rq == target_rq);
8929 8989
@@ -9025,7 +9085,7 @@ static void nohz_balancer_kick(void)
9025 return; 9085 return;
9026 /* 9086 /*
9027 * Use smp_send_reschedule() instead of resched_cpu(). 9087 * Use smp_send_reschedule() instead of resched_cpu().
9028 * This way we generate a sched IPI on the target cpu which 9088 * This way we generate a sched IPI on the target CPU which
9029 * is idle. And the softirq performing nohz idle load balance 9089 * is idle. And the softirq performing nohz idle load balance
9030 * will be run before returning from the IPI. 9090 * will be run before returning from the IPI.
9031 */ 9091 */
@@ -9082,14 +9142,12 @@ unlock:
9082} 9142}
9083 9143
9084/* 9144/*
9085 * This routine will record that the cpu is going idle with tick stopped. 9145 * This routine will record that the CPU is going idle with tick stopped.
9086 * This info will be used in performing idle load balancing in the future. 9146 * This info will be used in performing idle load balancing in the future.
9087 */ 9147 */
9088void nohz_balance_enter_idle(int cpu) 9148void nohz_balance_enter_idle(int cpu)
9089{ 9149{
9090 /* 9150 /* If this CPU is going down, then nothing needs to be done: */
9091 * If this cpu is going down, then nothing needs to be done.
9092 */
9093 if (!cpu_active(cpu)) 9151 if (!cpu_active(cpu))
9094 return; 9152 return;
9095 9153
@@ -9100,9 +9158,7 @@ void nohz_balance_enter_idle(int cpu)
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 9158 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return; 9159 return;
9102 9160
9103 /* 9161 /* If we're a completely isolated CPU, we don't play: */
9104 * If we're a completely isolated CPU, we don't play.
9105 */
9106 if (on_null_domain(cpu_rq(cpu))) 9162 if (on_null_domain(cpu_rq(cpu)))
9107 return; 9163 return;
9108 9164
@@ -9211,7 +9267,7 @@ out:
9211 9267
9212 /* 9268 /*
9213 * next_balance will be updated only when there is a need. 9269 * next_balance will be updated only when there is a need.
9214 * When the cpu is attached to null domain for ex, it will not be 9270 * When the CPU is attached to null domain for ex, it will not be
9215 * updated. 9271 * updated.
9216 */ 9272 */
9217 if (likely(update_next_balance)) { 9273 if (likely(update_next_balance)) {
@@ -9235,7 +9291,7 @@ out:
9235#ifdef CONFIG_NO_HZ_COMMON 9291#ifdef CONFIG_NO_HZ_COMMON
9236/* 9292/*
9237 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9293 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9238 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9294 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
9239 */ 9295 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 9296static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{ 9297{
@@ -9255,8 +9311,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9255 continue; 9311 continue;
9256 9312
9257 /* 9313 /*
9258 * If this cpu gets work to do, stop the load balancing 9314 * If this CPU gets work to do, stop the load balancing
9259 * work being done for other cpus. Next load 9315 * work being done for other CPUs. Next load
9260 * balancing owner will pick it up. 9316 * balancing owner will pick it up.
9261 */ 9317 */
9262 if (need_resched()) 9318 if (need_resched())
@@ -9298,13 +9354,13 @@ end:
9298 9354
9299/* 9355/*
9300 * Current heuristic for kicking the idle load balancer in the presence 9356 * Current heuristic for kicking the idle load balancer in the presence
9301 * of an idle cpu in the system. 9357 * of an idle CPU in the system.
9302 * - This rq has more than one task. 9358 * - This rq has more than one task.
9303 * - This rq has at least one CFS task and the capacity of the CPU is 9359 * - This rq has at least one CFS task and the capacity of the CPU is
9304 * significantly reduced because of RT tasks or IRQs. 9360 * significantly reduced because of RT tasks or IRQs.
9305 * - At parent of LLC scheduler domain level, this cpu's scheduler group has 9361 * - At parent of LLC scheduler domain level, this CPU's scheduler group has
9306 * multiple busy cpu. 9362 * multiple busy CPUs.
9307 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9363 * - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
9308 * domain span are idle. 9364 * domain span are idle.
9309 */ 9365 */
9310static inline bool nohz_kick_needed(struct rq *rq) 9366static inline bool nohz_kick_needed(struct rq *rq)
@@ -9394,10 +9450,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9394 CPU_IDLE : CPU_NOT_IDLE; 9450 CPU_IDLE : CPU_NOT_IDLE;
9395 9451
9396 /* 9452 /*
9397 * If this cpu has a pending nohz_balance_kick, then do the 9453 * If this CPU has a pending nohz_balance_kick, then do the
9398 * balancing on behalf of the other idle cpus whose ticks are 9454 * balancing on behalf of the other idle CPUs whose ticks are
9399 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9455 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9400 * give the idle cpus a chance to load balance. Else we may 9456 * give the idle CPUs a chance to load balance. Else we may
9401 * load balance only within the local sched_domain hierarchy 9457 * load balance only within the local sched_domain hierarchy
9402 * and abort nohz_idle_balance altogether if we pull some load. 9458 * and abort nohz_idle_balance altogether if we pull some load.
9403 */ 9459 */
@@ -9440,7 +9496,12 @@ static void rq_offline_fair(struct rq *rq)
9440#endif /* CONFIG_SMP */ 9496#endif /* CONFIG_SMP */
9441 9497
9442/* 9498/*
9443 * scheduler tick hitting a task of our scheduling class: 9499 * scheduler tick hitting a task of our scheduling class.
9500 *
9501 * NOTE: This function can be called remotely by the tick offload that
9502 * goes along full dynticks. Therefore no local assumption can be made
9503 * and everything must be accessed through the @rq and @curr passed in
9504 * parameters.
9444 */ 9505 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9506static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{ 9507{
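
The NOTE added above (and repeated for the idle and RT classes further down) is the contract that matters once the residual tick can be executed remotely: task_tick() may run on a CPU other than rq->cpu, so the handler has to take everything from the @rq and @curr arguments rather than from smp_processor_id() or per-CPU state. A standalone toy model of that calling convention (plain userspace C, not kernel code; the struct names here are invented for illustration only):

	#include <assert.h>
	#include <stdio.h>

	struct rq   { int cpu; unsigned long clock; };
	struct task { const char *comm; };

	/* Stand-in for "the CPU actually executing the handler". */
	static int executing_cpu;

	/* Correct shape: everything is derived from the arguments. */
	static void task_tick(struct rq *rq, struct task *curr)
	{
		rq->clock++;
		printf("tick for CPU%d (%s), executed on CPU%d\n",
		       rq->cpu, curr->comm, executing_cpu);
	}

	int main(void)
	{
		struct rq rq3 = { .cpu = 3 };		/* a tickless CPU */
		struct task loop = { .comm = "loop" };

		executing_cpu = 0;			/* a housekeeping CPU does the work */
		task_tick(&rq3, &loop);			/* rq->cpu != executing CPU, and that is fine */
		assert(rq3.clock == 1);
		return 0;
	}

The sketch only illustrates why the handlers below are written against @rq/@curr; the real remote invocation path is in the tick offload machinery, not shown here.
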
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class.
4 *
5 * (NOTE: these are not related to SCHED_IDLE batch scheduled
6 * tasks which are handled in sched/fair.c )
3 */ 7 */
4#include <linux/sched.h> 8#include "sched.h"
5#include <linux/sched/idle.h>
6#include <linux/cpu.h>
7#include <linux/cpuidle.h>
8#include <linux/cpuhotplug.h>
9#include <linux/tick.h>
10#include <linux/mm.h>
11#include <linux/stackprotector.h>
12#include <linux/suspend.h>
13#include <linux/livepatch.h>
14
15#include <asm/tlb.h>
16 9
17#include <trace/events/power.h> 10#include <trace/events/power.h>
18 11
19#include "sched.h"
20
21/* Linker adds these: start and end of __cpuidle functions */ 12/* Linker adds these: start and end of __cpuidle functions */
22extern char __cpuidle_text_start[], __cpuidle_text_end[]; 13extern char __cpuidle_text_start[], __cpuidle_text_end[];
23 14
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
46static int __init cpu_idle_poll_setup(char *__unused) 37static int __init cpu_idle_poll_setup(char *__unused)
47{ 38{
48 cpu_idle_force_poll = 1; 39 cpu_idle_force_poll = 1;
40
49 return 1; 41 return 1;
50} 42}
51__setup("nohlt", cpu_idle_poll_setup); 43__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
53static int __init cpu_idle_nopoll_setup(char *__unused) 45static int __init cpu_idle_nopoll_setup(char *__unused)
54{ 46{
55 cpu_idle_force_poll = 0; 47 cpu_idle_force_poll = 0;
48
56 return 1; 49 return 1;
57} 50}
58__setup("hlt", cpu_idle_nopoll_setup); 51__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
64 trace_cpu_idle_rcuidle(0, smp_processor_id()); 57 trace_cpu_idle_rcuidle(0, smp_processor_id());
65 local_irq_enable(); 58 local_irq_enable();
66 stop_critical_timings(); 59 stop_critical_timings();
60
67 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
68 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
69 cpu_relax(); 63 cpu_relax();
70 start_critical_timings(); 64 start_critical_timings();
71 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
72 rcu_idle_exit(); 66 rcu_idle_exit();
67
73 return 1; 68 return 1;
74} 69}
75 70
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 327{
333 /* 328 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 329 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 330 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 331 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 332 */
338#ifdef CONFIG_X86 333#ifdef CONFIG_X86
339 /* 334 /*
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
350 while (1) 345 while (1)
351 do_idle(); 346 do_idle();
352} 347}
348
349/*
350 * idle-task scheduling class.
351 */
352
353#ifdef CONFIG_SMP
354static int
355select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
356{
 357 return task_cpu(p); /* IDLE tasks are never migrated */
358}
359#endif
360
361/*
362 * Idle tasks are unconditionally rescheduled:
363 */
364static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
365{
366 resched_curr(rq);
367}
368
369static struct task_struct *
370pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
371{
372 put_prev_task(rq, prev);
373 update_idle_core(rq);
374 schedstat_inc(rq->sched_goidle);
375
376 return rq->idle;
377}
378
379/*
380 * It is not legal to sleep in the idle task - print a warning
381 * message if some code attempts to do it:
382 */
383static void
384dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
385{
386 raw_spin_unlock_irq(&rq->lock);
387 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
388 dump_stack();
389 raw_spin_lock_irq(&rq->lock);
390}
391
392static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
393{
394}
395
396/*
397 * scheduler tick hitting a task of our scheduling class.
398 *
399 * NOTE: This function can be called remotely by the tick offload that
400 * goes along full dynticks. Therefore no local assumption can be made
401 * and everything must be accessed through the @rq and @curr passed in
402 * parameters.
403 */
404static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
405{
406}
407
408static void set_curr_task_idle(struct rq *rq)
409{
410}
411
412static void switched_to_idle(struct rq *rq, struct task_struct *p)
413{
414 BUG();
415}
416
417static void
418prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
419{
420 BUG();
421}
422
423static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
424{
425 return 0;
426}
427
428static void update_curr_idle(struct rq *rq)
429{
430}
431
432/*
433 * Simple, special scheduling class for the per-CPU idle tasks:
434 */
435const struct sched_class idle_sched_class = {
436 /* .next is NULL */
437 /* no enqueue/yield_task for idle tasks */
438
439 /* dequeue is not valid, we print a debug message there: */
440 .dequeue_task = dequeue_task_idle,
441
442 .check_preempt_curr = check_preempt_curr_idle,
443
444 .pick_next_task = pick_next_task_idle,
445 .put_prev_task = put_prev_task_idle,
446
447#ifdef CONFIG_SMP
448 .select_task_rq = select_task_rq_idle,
449 .set_cpus_allowed = set_cpus_allowed_common,
450#endif
451
452 .set_curr_task = set_curr_task_idle,
453 .task_tick = task_tick_idle,
454
455 .get_rr_interval = get_rr_interval_idle,
456
457 .prio_changed = prio_changed_idle,
458 .switched_to = switched_to_idle,
459 .update_curr = update_curr_idle,
460};
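
The rewritten header comment draws the line between this class, which only ever runs the per-CPU idle (swapper) threads, and the SCHED_IDLE policy that ordinary tasks can request and that fair.c services. A small userspace illustration of the latter, assuming glibc exposing SCHED_IDLE under _GNU_SOURCE (the priority must be 0 for this policy):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		if (sched_setscheduler(0, SCHED_IDLE, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		printf("now running under policy %d (SCHED_IDLE)\n",
		       sched_getscheduler(0));
		return 0;
	}

Nothing outside the kernel can ever be placed into idle_sched_class itself; it exists purely for the swapper threads picked when a runqueue is empty.
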
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/*
5 * idle-task scheduling class.
6 *
7 * (NOTE: these are not related to SCHED_IDLE tasks which are
8 * handled in sched/fair.c)
9 */
10
11#ifdef CONFIG_SMP
12static int
13select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{
15 return task_cpu(p); /* IDLE tasks as never migrated */
16}
17#endif /* CONFIG_SMP */
18
19/*
20 * Idle tasks are unconditionally rescheduled:
21 */
22static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
23{
24 resched_curr(rq);
25}
26
27static struct task_struct *
28pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
29{
30 put_prev_task(rq, prev);
31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle);
33 return rq->idle;
34}
35
36/*
37 * It is not legal to sleep in the idle task - print a warning
38 * message if some code attempts to do it:
39 */
40static void
41dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
42{
43 raw_spin_unlock_irq(&rq->lock);
44 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
45 dump_stack();
46 raw_spin_lock_irq(&rq->lock);
47}
48
49static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
50{
51 rq_last_tick_reset(rq);
52}
53
54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_idle(struct rq *rq)
59{
60}
61
62static void switched_to_idle(struct rq *rq, struct task_struct *p)
63{
64 BUG();
65}
66
67static void
68prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
69{
70 BUG();
71}
72
73static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = {
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
100#endif
101
102 .set_curr_task = set_curr_task_idle,
103 .task_tick = task_tick_idle,
104
105 .get_rr_interval = get_rr_interval_idle,
106
107 .prio_changed = prio_changed_idle,
108 .switched_to = switched_to_idle,
109 .update_curr = update_curr_idle,
110};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 * 4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
6 * 7 *
7 */ 8 */
8 9#include "sched.h"
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15 10
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 11DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden); 12EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
60 55
61 static_branch_enable(&housekeeping_overriden); 56 static_branch_enable(&housekeeping_overriden);
62 57
58 if (housekeeping_flags & HK_FLAG_TICK)
59 sched_tick_offload_init();
60
63 /* We need at least one CPU to handle housekeeping work */ 61 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 62 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65} 63}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
119{ 117{
120 unsigned int flags; 118 unsigned int flags;
121 119
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 120 flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123 121
124 return housekeeping_setup(str, flags); 122 return housekeeping_setup(str, flags);
125} 123}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
6 * figure. Its a silly number but people think its important. We go through 6 * figure. Its a silly number but people think its important. We go through
7 * great pains to make it work on big machines and tickless kernels. 7 * great pains to make it work on big machines and tickless kernels.
8 */ 8 */
9
10#include <linux/export.h>
11#include <linux/sched/loadavg.h>
12
13#include "sched.h" 9#include "sched.h"
14 10
15/* 11/*
@@ -32,29 +28,29 @@
32 * Due to a number of reasons the above turns in the mess below: 28 * Due to a number of reasons the above turns in the mess below:
33 * 29 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 30 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 31 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 32 * to calculating nr_active.
37 * 33 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 34 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 35 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 36 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 37 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 38 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 39 * to obtain the same result. See calc_load_fold_active().
44 * 40 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 41 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 42 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 43 * CPU to have completed this task.
48 * 44 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 45 * This places an upper-bound on the IRQ-off latency of the machine. Then
50 * again, being late doesn't loose the delta, just wrecks the sample. 46 * again, being late doesn't loose the delta, just wrecks the sample.
51 * 47 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 48 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 49 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 50 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 51 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 52 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 53 * all CPUs yields the correct result.
58 * 54 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 55 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 56 */
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 111 * Handle NO_HZ for the global load-average.
116 * 112 *
117 * Since the above described distributed algorithm to compute the global 113 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 114 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 115 * NO_HZ.
120 * 116 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 117 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 118 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 119 * when we read the global state.
124 * 120 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 121 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 142 * busy state.
147 * 143 *
148 * This is solved by pushing the window forward, and thus skipping the 144 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 145 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 146 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 147 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 148 * intervals.
153 * 149 *
154 * When making the ILB scale, we should try to pull this in as well. 150 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 295}
300 296
301/* 297/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 298 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 300 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 301 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
363 return; 359 return;
364 360
365 /* 361 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 362 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 363 */
368 delta = calc_load_nohz_fold(); 364 delta = calc_load_nohz_fold();
369 if (delta) 365 if (delta)
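
The comments above describe two stages: folding per-CPU nr_active deltas into one global count, and feeding that count into fixed-point exponential averages. The averaging stage can be sketched in isolation; the constants below are assumed to match include/linux/sched/loadavg.h (FSHIFT of 11 and the 1884/2014/2037 decay factors for the 1/5/15-minute averages sampled every 5 seconds):

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1 << FSHIFT)
	#define EXP_1	1884	/* assumed: 1/exp(5s/1min) in fixed point */
	#define EXP_5	2014
	#define EXP_15	2037

	static unsigned long calc_load(unsigned long load, unsigned long exp,
				       unsigned long active)
	{
		unsigned long newload = load * exp + active * (FIXED_1 - exp);

		if (active >= load)
			newload += FIXED_1 - 1;	/* round up while load is rising */
		return newload / FIXED_1;
	}

	int main(void)
	{
		unsigned long avg1 = 0, n;

		/* Pretend 3 tasks stayed runnable for two minutes (24 samples of 5s). */
		for (n = 0; n < 24; n++)
			avg1 = calc_load(avg1, EXP_1, 3 * FIXED_1);

		printf("1-min load after 2 minutes at nr_active=3: %lu.%02lu\n",
		       avg1 >> FSHIFT, ((avg1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}

This is only the arithmetic; the distributed folding and the NO_HZ window handling discussed above are what the rest of the file is about.
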
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 */ 15 */
16 16#include "sched.h"
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21#include <linux/atomic.h>
22
23#include "sched.h" /* for cpu_rq(). */
24 17
25/* 18/*
26 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 19 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
27 * except MEMBARRIER_CMD_QUERY. 20 * except MEMBARRIER_CMD_QUERY.
28 */ 21 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 22#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 23#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 24 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 25 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 26#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 27#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 28#endif
36 29
37#define MEMBARRIER_CMD_BITMASK \ 30#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 31 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 32 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 33 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 34 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 35 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 36
44static void ipi_mb(void *info) 37static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
85 */ 78 */
86 if (cpu == raw_smp_processor_id()) 79 if (cpu == raw_smp_processor_id())
87 continue; 80 continue;
81
88 rcu_read_lock(); 82 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 83 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 84 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 182 * rq->curr modification in scheduler.
189 */ 183 */
190 smp_mb(); /* exit from system call is not a mb */ 184 smp_mb(); /* exit from system call is not a mb */
185
191 return 0; 186 return 0;
192} 187}
193 188
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
219 } 214 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 215 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 216 &mm->membarrier_state);
217
222 return 0; 218 return 0;
223} 219}
224 220
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 249 synchronize_sched();
254 } 250 }
255 atomic_or(state, &mm->membarrier_state); 251 atomic_or(state, &mm->membarrier_state);
252
256 return 0; 253 return 0;
257} 254}
258 255
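
Beyond the include consolidation, the command bitmask shown above is the set a caller can pass to membarrier(2). A hedged usage sketch of the private-expedited pair, which must be registered before it can be issued; there is no glibc wrapper, so the raw syscall is used, and the command constants are assumed to come from reasonably recent kernel headers:

	#define _GNU_SOURCE
	#include <linux/membarrier.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
			perror("register");
			return 1;
		}
		/* Acts as a full barrier on every CPU currently running this process. */
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
			perror("membarrier");
			return 1;
		}
		puts("expedited membarrier completed");
		return 0;
	}
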
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..4f4fd3b157f1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies) 4 * policies)
5 */ 5 */
6
7#include "sched.h" 6#include "sched.h"
8 7
9#include <linux/slab.h>
10#include <linux/irq_work.h>
11
12int sched_rr_timeslice = RR_TIMESLICE; 8int sched_rr_timeslice = RR_TIMESLICE;
13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
14 10
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359static void push_rt_tasks(struct rq *); 355static void push_rt_tasks(struct rq *);
360static void pull_rt_task(struct rq *); 356static void pull_rt_task(struct rq *);
361 357
362static inline void queue_push_tasks(struct rq *rq) 358static inline void rt_queue_push_tasks(struct rq *rq)
363{ 359{
364 if (!has_pushable_tasks(rq)) 360 if (!has_pushable_tasks(rq))
365 return; 361 return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
367 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 363 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
368} 364}
369 365
370static inline void queue_pull_task(struct rq *rq) 366static inline void rt_queue_pull_task(struct rq *rq)
371{ 367{
372 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 368 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
373} 369}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
425{ 421{
426} 422}
427 423
428static inline void queue_push_tasks(struct rq *rq) 424static inline void rt_queue_push_tasks(struct rq *rq)
429{ 425{
430} 426}
431#endif /* CONFIG_SMP */ 427#endif /* CONFIG_SMP */
@@ -1453,9 +1449,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1449 return;
1454 1450
1455 /* 1451 /*
1456 * There appears to be other cpus that can accept 1452 * There appear to be other CPUs that can accept
1457 * current and none to run 'p', so lets reschedule 1453 * the current task but none can run 'p', so lets reschedule
1458 * to try and push current away: 1454 * to try and push the current task away:
1459 */ 1455 */
1460 requeue_task_rt(rq, p, 1); 1456 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1457 resched_curr(rq);
@@ -1569,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1569 /* The running task is never eligible for pushing */ 1565 /* The running task is never eligible for pushing */
1570 dequeue_pushable_task(rq, p); 1566 dequeue_pushable_task(rq, p);
1571 1567
1572 queue_push_tasks(rq); 1568 rt_queue_push_tasks(rq);
1573 1569
1574 return p; 1570 return p;
1575} 1571}
@@ -1596,12 +1592,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1592 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1593 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1594 return 1;
1595
1599 return 0; 1596 return 0;
1600} 1597}
1601 1598
1602/* 1599/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1600 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1601 * on the CPU, NULL otherwise
1605 */ 1602 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1603static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1604{
@@ -1639,11 +1636,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1636 return -1; /* No targets found */
1640 1637
1641 /* 1638 /*
1642 * At this point we have built a mask of cpus representing the 1639 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1640 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1641 * the best one based on our affinity and topology.
1645 * 1642 *
1646 * We prioritize the last cpu that the task executed on since 1643 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1644 * it is most likely cache-hot in that location.
1648 */ 1645 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1646 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1648,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1648
1652 /* 1649 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1650 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1651 * out which CPU is logically closest to our hot cache data.
1655 */ 1652 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1653 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1654 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1689,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1689 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1690 if (cpu < nr_cpu_ids)
1694 return cpu; 1691 return cpu;
1692
1695 return -1; 1693 return -1;
1696} 1694}
1697 1695
@@ -1827,7 +1825,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1825 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1826 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1827 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1828 * other CPUs will pull from us when ready.
1831 */ 1829 */
1832 goto out; 1830 goto out;
1833 } 1831 }
@@ -1919,7 +1917,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1917 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1918 * the rto_mask.
1921 * 1919 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1920 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1921 * will return the next CPU found in the rto_mask.
1924 * 1922 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1923 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1978,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1978 raw_spin_lock(&rq->rd->rto_lock);
1981 1979
1982 /* 1980 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1981 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1982 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1983 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1984 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2103,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2103
2106 /* 2104 /*
2107 * There's a chance that p is higher in priority 2105 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2106 * than what's currently running on its CPU.
2109 * This is just that p is wakeing up and hasn't 2107 * This is just that p is wakeing up and hasn't
2110 * had a chance to schedule. We only pull 2108 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2109 * p if it is lower in priority than the
@@ -2187,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
2187 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2185 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2188 return; 2186 return;
2189 2187
2190 queue_pull_task(rq); 2188 rt_queue_pull_task(rq);
2191} 2189}
2192 2190
2193void __init init_sched_rt_class(void) 2191void __init init_sched_rt_class(void)
@@ -2218,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
2218 if (task_on_rq_queued(p) && rq->curr != p) { 2216 if (task_on_rq_queued(p) && rq->curr != p) {
2219#ifdef CONFIG_SMP 2217#ifdef CONFIG_SMP
2220 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2218 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2221 queue_push_tasks(rq); 2219 rt_queue_push_tasks(rq);
2222#endif /* CONFIG_SMP */ 2220#endif /* CONFIG_SMP */
2223 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2221 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2224 resched_curr(rq); 2222 resched_curr(rq);
@@ -2242,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2242 * may need to pull tasks to this runqueue. 2240 * may need to pull tasks to this runqueue.
2243 */ 2241 */
2244 if (oldprio < p->prio) 2242 if (oldprio < p->prio)
2245 queue_pull_task(rq); 2243 rt_queue_pull_task(rq);
2246 2244
2247 /* 2245 /*
2248 * If there's a higher priority task waiting to run 2246 * If there's a higher priority task waiting to run
@@ -2292,6 +2290,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
2292static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2290static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2293#endif 2291#endif
2294 2292
2293/*
2294 * scheduler tick hitting a task of our scheduling class.
2295 *
2296 * NOTE: This function can be called remotely by the tick offload that
2297 * goes along full dynticks. Therefore no local assumption can be made
2298 * and everything must be accessed through the @rq and @curr passed in
2299 * parameters.
2300 */
2295static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2301static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2296{ 2302{
2297 struct sched_rt_entity *rt_se = &p->rt; 2303 struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,6 +2691,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2685 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2691 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2686 } 2692 }
2687 mutex_unlock(&mutex); 2693 mutex_unlock(&mutex);
2694
2688 return ret; 2695 return ret;
2689} 2696}
2690 2697
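
sched_rr_timeslice / sysctl_sched_rr_timeslice near the top of this file control the SCHED_RR quantum (exposed, assuming the usual sysctl name, as kernel.sched_rr_timeslice_ms). From userspace the effective value can be read back with sched_rr_get_interval(); a short sketch, which needs CAP_SYS_NICE to switch to SCHED_RR:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 10 };
		struct timespec ts;

		if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		if (sched_rr_get_interval(0, &ts) == -1) {
			perror("sched_rr_get_interval");
			return 1;
		}
		printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}
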
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..23ba4dd76ac4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
6
4#include <linux/sched/autogroup.h> 7#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h>
6#include <linux/sched/topology.h>
7#include <linux/sched/rt.h>
8#include <linux/sched/deadline.h>
9#include <linux/sched/clock.h> 8#include <linux/sched/clock.h>
10#include <linux/sched/wake_q.h> 9#include <linux/sched/coredump.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/numa_balancing.h>
13#include <linux/sched/mm.h>
14#include <linux/sched/cpufreq.h> 10#include <linux/sched/cpufreq.h>
15#include <linux/sched/stat.h> 11#include <linux/sched/cputime.h>
16#include <linux/sched/nohz.h> 12#include <linux/sched/deadline.h>
17#include <linux/sched/debug.h> 13#include <linux/sched/debug.h>
18#include <linux/sched/hotplug.h> 14#include <linux/sched/hotplug.h>
15#include <linux/sched/idle.h>
16#include <linux/sched/init.h>
17#include <linux/sched/isolation.h>
18#include <linux/sched/jobctl.h>
19#include <linux/sched/loadavg.h>
20#include <linux/sched/mm.h>
21#include <linux/sched/nohz.h>
22#include <linux/sched/numa_balancing.h>
23#include <linux/sched/prio.h>
24#include <linux/sched/rt.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/stat.h>
27#include <linux/sched/sysctl.h>
19#include <linux/sched/task.h> 28#include <linux/sched/task.h>
20#include <linux/sched/task_stack.h> 29#include <linux/sched/task_stack.h>
21#include <linux/sched/cputime.h> 30#include <linux/sched/topology.h>
22#include <linux/sched/init.h> 31#include <linux/sched/user.h>
32#include <linux/sched/wake_q.h>
33#include <linux/sched/xacct.h>
34
35#include <uapi/linux/sched/types.h>
23 36
24#include <linux/u64_stats_sync.h>
25#include <linux/kernel_stat.h>
26#include <linux/binfmts.h> 37#include <linux/binfmts.h>
27#include <linux/mutex.h> 38#include <linux/blkdev.h>
28#include <linux/spinlock.h> 39#include <linux/compat.h>
40#include <linux/context_tracking.h>
41#include <linux/cpufreq.h>
42#include <linux/cpuidle.h>
43#include <linux/cpuset.h>
44#include <linux/ctype.h>
45#include <linux/debugfs.h>
46#include <linux/delayacct.h>
47#include <linux/init_task.h>
48#include <linux/kprobes.h>
49#include <linux/kthread.h>
50#include <linux/membarrier.h>
51#include <linux/migrate.h>
52#include <linux/mmu_context.h>
53#include <linux/nmi.h>
54#include <linux/proc_fs.h>
55#include <linux/prefetch.h>
56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h>
58#include <linux/security.h>
59#include <linux/stackprotector.h>
29#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
30#include <linux/irq_work.h> 61#include <linux/suspend.h>
31#include <linux/tick.h> 62#include <linux/swait.h>
32#include <linux/slab.h> 63#include <linux/syscalls.h>
33#include <linux/cgroup.h> 64#include <linux/task_work.h>
65#include <linux/tsacct_kern.h>
66
67#include <asm/tlb.h>
34 68
35#ifdef CONFIG_PARAVIRT 69#ifdef CONFIG_PARAVIRT
36#include <asm/paravirt.h> 70# include <asm/paravirt.h>
37#endif 71#endif
38 72
39#include "cpupri.h" 73#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 113 * and does not change the user-interface for setting shares/weights.
80 * 114 *
81 * We increase resolution only if we have enough bits to allow this increased 115 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 116 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 117 * are pretty high and the returns do not justify the increased costs.
84 * 118 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 119 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 120 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 121 */
88#ifdef CONFIG_64BIT 122#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 123# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
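
The comment above explains why task weights carry an extra fixed-point shift on 64-bit. A quick worked sketch, assuming SCHED_FIXEDPOINT_SHIFT is 10 as in include/linux/sched.h, of what that scaling does to the user-visible nice-0 weight of 1024:

	#include <stdio.h>

	#define SCHED_FIXEDPOINT_SHIFT	10
	#define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)	/* 64-bit case */
	#define scale_load(w)		((unsigned long)(w) << SCHED_FIXEDPOINT_SHIFT)
	#define scale_load_down(w)	((unsigned long)(w) >> SCHED_FIXEDPOINT_SHIFT)

	int main(void)
	{
		unsigned long nice0 = 1024;	/* user-visible nice-0 weight */

		printf("internal weight:   %lu (1 << %d)\n",
		       scale_load(nice0), NICE_0_LOAD_SHIFT);
		printf("back to user view: %lu\n",
		       scale_load_down(scale_load(nice0)));
		return 0;
	}
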
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 145 * 10 -> just above 1us
112 * 9 -> just above 0.5us 146 * 9 -> just above 0.5us
113 */ 147 */
114#define DL_SCALE (10) 148#define DL_SCALE 10
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 */
119 149
120/* 150/*
121 * single value that denotes runtime == period, ie unlimited time. 151 * Single value that denotes runtime == period, ie unlimited time.
122 */ 152 */
123#define RUNTIME_INF ((u64)~0ULL) 153#define RUNTIME_INF ((u64)~0ULL)
124 154
125static inline int idle_policy(int policy) 155static inline int idle_policy(int policy)
126{ 156{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 265 * control.
236 */ 266 */
237struct dl_bandwidth { 267struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 268 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 269 u64 dl_runtime;
240 u64 dl_period; 270 u64 dl_period;
241}; 271};
242 272
243static inline int dl_bandwidth_enabled(void) 273static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
246} 276}
247 277
248struct dl_bw { 278struct dl_bw {
249 raw_spinlock_t lock; 279 raw_spinlock_t lock;
250 u64 bw, total_bw; 280 u64 bw;
281 u64 total_bw;
251}; 282};
252 283
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 284static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 305}
275 306
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 307extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 308extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 309extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 310extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 311extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 312extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 313extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 314extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 315extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 316extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 317extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 318extern bool dl_cpu_busy(unsigned int cpu);
291 319
292#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
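
The dl_bw fields track admitted SCHED_DEADLINE bandwidth, and the __dl_overflow() check visible above is the admission test new reservations must pass. From userspace such a reservation is requested with sched_setattr(); a hedged sketch follows, with struct sched_attr declared locally because glibc ships no wrapper (the field layout is assumed to match the UAPI definition):

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6
	#endif

	struct sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of every... */
			.sched_deadline	= 100 * 1000 * 1000,	/* ...100ms window */
			.sched_period	= 100 * 1000 * 1000,
		};

		if (syscall(SYS_sched_setattr, 0, &attr, 0)) {	/* needs CAP_SYS_NICE */
			perror("sched_setattr");	/* EBUSY: admission test failed */
			return 1;
		}
		puts("deadline reservation granted");
		return 0;
	}

The runtime/period pair requested here is what ends up accounted into total_bw and compared against the per-root-domain cap.
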
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
300 328
301struct cfs_bandwidth { 329struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 330#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 331 raw_spinlock_t lock;
304 ktime_t period; 332 ktime_t period;
305 u64 quota, runtime; 333 u64 quota;
306 s64 hierarchical_quota; 334 u64 runtime;
307 u64 runtime_expires; 335 s64 hierarchical_quota;
308 336 u64 runtime_expires;
309 int idle, period_active; 337
310 struct hrtimer period_timer, slack_timer; 338 int idle;
311 struct list_head throttled_cfs_rq; 339 int period_active;
312 340 struct hrtimer period_timer;
313 /* statistics */ 341 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 342 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
316#endif 348#endif
317}; 349};
318 350
319/* task group related information */ 351/* Task group related information */
320struct task_group { 352struct task_group {
321 struct cgroup_subsys_state css; 353 struct cgroup_subsys_state css;
322 354
323#ifdef CONFIG_FAIR_GROUP_SCHED 355#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 356 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 357 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 358 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 359 struct cfs_rq **cfs_rq;
328 unsigned long shares; 360 unsigned long shares;
329 361
330#ifdef CONFIG_SMP 362#ifdef CONFIG_SMP
331 /* 363 /*
@@ -333,29 +365,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 365 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 366 * will also be accessed at each tick.
335 */ 367 */
336 atomic_long_t load_avg ____cacheline_aligned; 368 atomic_long_t load_avg ____cacheline_aligned;
337#endif 369#endif
338#endif 370#endif
339 371
340#ifdef CONFIG_RT_GROUP_SCHED 372#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 373 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 374 struct rt_rq **rt_rq;
343 375
344 struct rt_bandwidth rt_bandwidth; 376 struct rt_bandwidth rt_bandwidth;
345#endif 377#endif
346 378
347 struct rcu_head rcu; 379 struct rcu_head rcu;
348 struct list_head list; 380 struct list_head list;
349 381
350 struct task_group *parent; 382 struct task_group *parent;
351 struct list_head siblings; 383 struct list_head siblings;
352 struct list_head children; 384 struct list_head children;
353 385
354#ifdef CONFIG_SCHED_AUTOGROUP 386#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 387 struct autogroup *autogroup;
356#endif 388#endif
357 389
358 struct cfs_bandwidth cfs_bandwidth; 390 struct cfs_bandwidth cfs_bandwidth;
359}; 391};
360 392
361#ifdef CONFIG_FAIR_GROUP_SCHED 393#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +401,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 401 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 402 * limitation from this.)
371 */ 403 */
372#define MIN_SHARES (1UL << 1) 404#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 405#define MAX_SHARES (1UL << 18)
374#endif 406#endif
375 407
376typedef int (*tg_visitor)(struct task_group *, void *); 408typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
443 475
444/* CFS-related fields in a runqueue */ 476/* CFS-related fields in a runqueue */
445struct cfs_rq { 477struct cfs_rq {
446 struct load_weight load; 478 struct load_weight load;
447 unsigned long runnable_weight; 479 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 480 unsigned int nr_running;
481 unsigned int h_nr_running;
449 482
450 u64 exec_clock; 483 u64 exec_clock;
451 u64 min_vruntime; 484 u64 min_vruntime;
452#ifndef CONFIG_64BIT 485#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 486 u64 min_vruntime_copy;
454#endif 487#endif
455 488
456 struct rb_root_cached tasks_timeline; 489 struct rb_root_cached tasks_timeline;
457 490
458 /* 491 /*
459 * 'curr' points to currently running entity on this cfs_rq. 492 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e when none are currently running). 493 * It is set to NULL otherwise (i.e when none are currently running).
461 */ 494 */
462 struct sched_entity *curr, *next, *last, *skip; 495 struct sched_entity *curr;
496 struct sched_entity *next;
497 struct sched_entity *last;
498 struct sched_entity *skip;
463 499
464#ifdef CONFIG_SCHED_DEBUG 500#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 501 unsigned int nr_spread_over;
466#endif 502#endif
467 503
468#ifdef CONFIG_SMP 504#ifdef CONFIG_SMP
469 /* 505 /*
470 * CFS load tracking 506 * CFS load tracking
471 */ 507 */
472 struct sched_avg avg; 508 struct sched_avg avg;
473#ifndef CONFIG_64BIT 509#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 510 u64 load_last_update_time_copy;
475#endif 511#endif
476 struct { 512 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 513 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
482 } removed; 518 } removed;
483 519
484#ifdef CONFIG_FAIR_GROUP_SCHED 520#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 521 unsigned long tg_load_avg_contrib;
486 long propagate; 522 long propagate;
487 long prop_runnable_sum; 523 long prop_runnable_sum;
488 524
489 /* 525 /*
490 * h_load = weight * f(tg) 526 * h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 528 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 529 * this group.
494 */ 530 */
495 unsigned long h_load; 531 unsigned long h_load;
496 u64 last_h_load_update; 532 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 533 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 534#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 535#endif /* CONFIG_SMP */
500 536
501#ifdef CONFIG_FAIR_GROUP_SCHED 537#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 538 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 539
504 /* 540 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 541 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 542 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 543 * (like users, containers etc.)
508 * 544 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 545 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 546 * This list is used during load balance.
511 */ 547 */
512 int on_list; 548 int on_list;
513 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 550 struct task_group *tg; /* group that "owns" this runqueue */
515 551
516#ifdef CONFIG_CFS_BANDWIDTH 552#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 553 int runtime_enabled;
518 u64 runtime_expires; 554 u64 runtime_expires;
519 s64 runtime_remaining; 555 s64 runtime_remaining;
520 556
521 u64 throttled_clock, throttled_clock_task; 557 u64 throttled_clock;
522 u64 throttled_clock_task_time; 558 u64 throttled_clock_task;
523 int throttled, throttle_count; 559 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 560 int throttled;
561 int throttle_count;
562 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 563#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 564#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 565};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
538 576
539/* Real-Time classes' related field in a runqueue: */ 577/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 578struct rt_rq {
541 struct rt_prio_array active; 579 struct rt_prio_array active;
542 unsigned int rt_nr_running; 580 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 581 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 582#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 583 struct {
546 int curr; /* highest queued rt task prio */ 584 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 585#ifdef CONFIG_SMP
548 int next; /* next highest */ 586 int next; /* next highest */
549#endif 587#endif
550 } highest_prio; 588 } highest_prio;
551#endif 589#endif
552#ifdef CONFIG_SMP 590#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 591 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 592 unsigned long rt_nr_total;
555 int overloaded; 593 int overloaded;
556 struct plist_head pushable_tasks; 594 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 595#endif /* CONFIG_SMP */
558 int rt_queued; 596 int rt_queued;
559 597
560 int rt_throttled; 598 int rt_throttled;
561 u64 rt_time; 599 u64 rt_time;
562 u64 rt_runtime; 600 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 601 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 602 raw_spinlock_t rt_runtime_lock;
565 603
566#ifdef CONFIG_RT_GROUP_SCHED 604#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 605 unsigned long rt_nr_boosted;
568 606
569 struct rq *rq; 607 struct rq *rq;
570 struct task_group *tg; 608 struct task_group *tg;
571#endif 609#endif
572}; 610};
573 611
574/* Deadline class' related fields in a runqueue */ 612/* Deadline class' related fields in a runqueue */
575struct dl_rq { 613struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 614 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 615 struct rb_root_cached root;
578 616
579 unsigned long dl_nr_running; 617 unsigned long dl_nr_running;
580 618
581#ifdef CONFIG_SMP 619#ifdef CONFIG_SMP
582 /* 620 /*
@@ -586,28 +624,28 @@ struct dl_rq {
586 * should migrate somewhere else. 624 * should migrate somewhere else.
587 */ 625 */
588 struct { 626 struct {
589 u64 curr; 627 u64 curr;
590 u64 next; 628 u64 next;
591 } earliest_dl; 629 } earliest_dl;
592 630
593 unsigned long dl_nr_migratory; 631 unsigned long dl_nr_migratory;
594 int overloaded; 632 int overloaded;
595 633
596 /* 634 /*
597 * Tasks on this rq that can be pushed away. They are kept in 635 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 636 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 637 * of the leftmost (earliest deadline) element.
600 */ 638 */
601 struct rb_root_cached pushable_dl_tasks_root; 639 struct rb_root_cached pushable_dl_tasks_root;
602#else 640#else
603 struct dl_bw dl_bw; 641 struct dl_bw dl_bw;
604#endif 642#endif
605 /* 643 /*
606 * "Active utilization" for this runqueue: increased when a 644 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 645 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 646 * task blocks
609 */ 647 */
610 u64 running_bw; 648 u64 running_bw;
611 649
612 /* 650 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 651 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 656 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 657 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 658 */
621 u64 this_bw; 659 u64 this_bw;
622 u64 extra_bw; 660 u64 extra_bw;
623 661
624 /* 662 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 663 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 664 * by the GRUB algorithm.
627 */ 665 */
628 u64 bw_ratio; 666 u64 bw_ratio;
629}; 667};
630 668
631#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 676/*
639 * We add the notion of a root-domain which will be used to define per-domain 677 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 678 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 679 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 680 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 681 * object.
644 * 682 *
645 */ 683 */
646struct root_domain { 684struct root_domain {
647 atomic_t refcount; 685 atomic_t refcount;
648 atomic_t rto_count; 686 atomic_t rto_count;
649 struct rcu_head rcu; 687 struct rcu_head rcu;
650 cpumask_var_t span; 688 cpumask_var_t span;
651 cpumask_var_t online; 689 cpumask_var_t online;
652 690
653 /* Indicate more than one runnable task for any CPU */ 691 /* Indicate more than one runnable task for any CPU */
654 bool overload; 692 bool overload;
655 693
656 /* 694 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 695 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 696 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 697 */
660 cpumask_var_t dlo_mask; 698 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 699 atomic_t dlo_count;
662 struct dl_bw dl_bw; 700 struct dl_bw dl_bw;
663 struct cpudl cpudl; 701 struct cpudl cpudl;
664 702
665#ifdef HAVE_RT_PUSH_IPI 703#ifdef HAVE_RT_PUSH_IPI
666 /* 704 /*
667 * For IPI pull requests, loop across the rto_mask. 705 * For IPI pull requests, loop across the rto_mask.
668 */ 706 */
669 struct irq_work rto_push_work; 707 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 708 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 709 /* These are only updated and read within rto_lock */
672 int rto_loop; 710 int rto_loop;
673 int rto_cpu; 711 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 712 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 713 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 714 atomic_t rto_loop_start;
677#endif 715#endif
678 /* 716 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 717 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 718 * one runnable RT task.
681 */ 719 */
682 cpumask_var_t rto_mask; 720 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 721 struct cpupri cpupri;
684 722
685 unsigned long max_cpu_capacity; 723 unsigned long max_cpu_capacity;
686}; 724};
687 725
688extern struct root_domain def_root_domain; 726extern struct root_domain def_root_domain;
@@ -708,41 +746,39 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 746 */
709struct rq { 747struct rq {
710 /* runqueue lock: */ 748 /* runqueue lock: */
711 raw_spinlock_t lock; 749 raw_spinlock_t lock;
712 750
713 /* 751 /*
714 * nr_running and cpu_load should be in the same cacheline because 752 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 753 * remote CPUs use both these fields when doing load calculation.
716 */ 754 */
717 unsigned int nr_running; 755 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 756#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 757 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 758 unsigned int nr_preferred_running;
721#endif 759#endif
722 #define CPU_LOAD_IDX_MAX 5 760 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 761 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 762#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 763#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 764 unsigned long last_load_update_tick;
727#endif /* CONFIG_SMP */ 765#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 766 unsigned long nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 767#endif /* CONFIG_NO_HZ_COMMON */
730#ifdef CONFIG_NO_HZ_FULL
731 unsigned long last_sched_tick;
732#endif
733 /* capture load from *all* tasks on this cpu: */
734 struct load_weight load;
735 unsigned long nr_load_updates;
736 u64 nr_switches;
737 768
738 struct cfs_rq cfs; 769 /* capture load from *all* tasks on this CPU: */
739 struct rt_rq rt; 770 struct load_weight load;
740 struct dl_rq dl; 771 unsigned long nr_load_updates;
772 u64 nr_switches;
773
774 struct cfs_rq cfs;
775 struct rt_rq rt;
776 struct dl_rq dl;
741 777
742#ifdef CONFIG_FAIR_GROUP_SCHED 778#ifdef CONFIG_FAIR_GROUP_SCHED
743 /* list of leaf cfs_rq on this cpu: */ 779 /* list of leaf cfs_rq on this CPU: */
744 struct list_head leaf_cfs_rq_list; 780 struct list_head leaf_cfs_rq_list;
745 struct list_head *tmp_alone_branch; 781 struct list_head *tmp_alone_branch;
746#endif /* CONFIG_FAIR_GROUP_SCHED */ 782#endif /* CONFIG_FAIR_GROUP_SCHED */
747 783
748 /* 784 /*
@@ -751,94 +787,98 @@ struct rq {
751 * one CPU and if it got migrated afterwards it may decrease 787 * one CPU and if it got migrated afterwards it may decrease
752 * it on another CPU. Always updated under the runqueue lock: 788 * it on another CPU. Always updated under the runqueue lock:
753 */ 789 */
754 unsigned long nr_uninterruptible; 790 unsigned long nr_uninterruptible;
755 791
756 struct task_struct *curr, *idle, *stop; 792 struct task_struct *curr;
757 unsigned long next_balance; 793 struct task_struct *idle;
758 struct mm_struct *prev_mm; 794 struct task_struct *stop;
795 unsigned long next_balance;
796 struct mm_struct *prev_mm;
759 797
760 unsigned int clock_update_flags; 798 unsigned int clock_update_flags;
761 u64 clock; 799 u64 clock;
762 u64 clock_task; 800 u64 clock_task;
763 801
764 atomic_t nr_iowait; 802 atomic_t nr_iowait;
765 803
766#ifdef CONFIG_SMP 804#ifdef CONFIG_SMP
767 struct root_domain *rd; 805 struct root_domain *rd;
768 struct sched_domain *sd; 806 struct sched_domain *sd;
807
808 unsigned long cpu_capacity;
809 unsigned long cpu_capacity_orig;
769 810
770 unsigned long cpu_capacity; 811 struct callback_head *balance_callback;
771 unsigned long cpu_capacity_orig;
772 812
773 struct callback_head *balance_callback; 813 unsigned char idle_balance;
774 814
775 unsigned char idle_balance;
776 /* For active balancing */ 815 /* For active balancing */
777 int active_balance; 816 int active_balance;
778 int push_cpu; 817 int push_cpu;
779 struct cpu_stop_work active_balance_work; 818 struct cpu_stop_work active_balance_work;
780 /* cpu of this runqueue: */ 819
781 int cpu; 820 /* CPU of this runqueue: */
782 int online; 821 int cpu;
822 int online;
783 823
784 struct list_head cfs_tasks; 824 struct list_head cfs_tasks;
785 825
786 u64 rt_avg; 826 u64 rt_avg;
787 u64 age_stamp; 827 u64 age_stamp;
788 u64 idle_stamp; 828 u64 idle_stamp;
789 u64 avg_idle; 829 u64 avg_idle;
790 830
791 /* This is used to determine avg_idle's max value */ 831 /* This is used to determine avg_idle's max value */
792 u64 max_idle_balance_cost; 832 u64 max_idle_balance_cost;
793#endif 833#endif
794 834
795#ifdef CONFIG_IRQ_TIME_ACCOUNTING 835#ifdef CONFIG_IRQ_TIME_ACCOUNTING
796 u64 prev_irq_time; 836 u64 prev_irq_time;
797#endif 837#endif
798#ifdef CONFIG_PARAVIRT 838#ifdef CONFIG_PARAVIRT
799 u64 prev_steal_time; 839 u64 prev_steal_time;
800#endif 840#endif
801#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 841#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
802 u64 prev_steal_time_rq; 842 u64 prev_steal_time_rq;
803#endif 843#endif
804 844
805 /* calc_load related fields */ 845 /* calc_load related fields */
806 unsigned long calc_load_update; 846 unsigned long calc_load_update;
807 long calc_load_active; 847 long calc_load_active;
808 848
809#ifdef CONFIG_SCHED_HRTICK 849#ifdef CONFIG_SCHED_HRTICK
810#ifdef CONFIG_SMP 850#ifdef CONFIG_SMP
811 int hrtick_csd_pending; 851 int hrtick_csd_pending;
812 call_single_data_t hrtick_csd; 852 call_single_data_t hrtick_csd;
813#endif 853#endif
814 struct hrtimer hrtick_timer; 854 struct hrtimer hrtick_timer;
815#endif 855#endif
816 856
817#ifdef CONFIG_SCHEDSTATS 857#ifdef CONFIG_SCHEDSTATS
818 /* latency stats */ 858 /* latency stats */
819 struct sched_info rq_sched_info; 859 struct sched_info rq_sched_info;
820 unsigned long long rq_cpu_time; 860 unsigned long long rq_cpu_time;
821 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 861 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
822 862
823 /* sys_sched_yield() stats */ 863 /* sys_sched_yield() stats */
824 unsigned int yld_count; 864 unsigned int yld_count;
825 865
826 /* schedule() stats */ 866 /* schedule() stats */
827 unsigned int sched_count; 867 unsigned int sched_count;
828 unsigned int sched_goidle; 868 unsigned int sched_goidle;
829 869
830 /* try_to_wake_up() stats */ 870 /* try_to_wake_up() stats */
831 unsigned int ttwu_count; 871 unsigned int ttwu_count;
832 unsigned int ttwu_local; 872 unsigned int ttwu_local;
833#endif 873#endif
834 874
835#ifdef CONFIG_SMP 875#ifdef CONFIG_SMP
836 struct llist_head wake_list; 876 struct llist_head wake_list;
837#endif 877#endif
838 878
839#ifdef CONFIG_CPU_IDLE 879#ifdef CONFIG_CPU_IDLE
840 /* Must be inspected within a rcu lock section */ 880 /* Must be inspected within a rcu lock section */
841 struct cpuidle_state *idle_state; 881 struct cpuidle_state *idle_state;
842#endif 882#endif
843}; 883};
844 884
@@ -904,9 +944,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
904 * one position though, because the next rq_unpin_lock() will shift it 944 * one position though, because the next rq_unpin_lock() will shift it
905 * back. 945 * back.
906 */ 946 */
907#define RQCF_REQ_SKIP 0x01 947#define RQCF_REQ_SKIP 0x01
908#define RQCF_ACT_SKIP 0x02 948#define RQCF_ACT_SKIP 0x02
909#define RQCF_UPDATED 0x04 949#define RQCF_UPDATED 0x04
910 950
911static inline void assert_clock_updated(struct rq *rq) 951static inline void assert_clock_updated(struct rq *rq)
912{ 952{
@@ -1059,12 +1099,12 @@ extern void sched_ttwu_pending(void);
1059 1099
1060/** 1100/**
1061 * highest_flag_domain - Return highest sched_domain containing flag. 1101 * highest_flag_domain - Return highest sched_domain containing flag.
1062 * @cpu: The cpu whose highest level of sched domain is to 1102 * @cpu: The CPU whose highest level of sched domain is to
1063 * be returned. 1103 * be returned.
1064 * @flag: The flag to check for the highest sched_domain 1104 * @flag: The flag to check for the highest sched_domain
1065 * for the given cpu. 1105 * for the given CPU.
1066 * 1106 *
1067 * Returns the highest sched_domain of a cpu which contains the given flag. 1107 * Returns the highest sched_domain of a CPU which contains the given flag.
1068 */ 1108 */
1069static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1109static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1070{ 1110{
@@ -1099,30 +1139,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1099DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1139DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1100 1140
1101struct sched_group_capacity { 1141struct sched_group_capacity {
1102 atomic_t ref; 1142 atomic_t ref;
1103 /* 1143 /*
1104 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1144 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1105 * for a single CPU. 1145 * for a single CPU.
1106 */ 1146 */
1107 unsigned long capacity; 1147 unsigned long capacity;
1108 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1148 unsigned long min_capacity; /* Min per-CPU capacity in group */
1109 unsigned long next_update; 1149 unsigned long next_update;
1110 int imbalance; /* XXX unrelated to capacity but shared group state */ 1150 int imbalance; /* XXX unrelated to capacity but shared group state */
1111 1151
1112#ifdef CONFIG_SCHED_DEBUG 1152#ifdef CONFIG_SCHED_DEBUG
1113 int id; 1153 int id;
1114#endif 1154#endif
1115 1155
1116 unsigned long cpumask[0]; /* balance mask */ 1156 unsigned long cpumask[0]; /* Balance mask */
1117}; 1157};
1118 1158
1119struct sched_group { 1159struct sched_group {
1120 struct sched_group *next; /* Must be a circular list */ 1160 struct sched_group *next; /* Must be a circular list */
1121 atomic_t ref; 1161 atomic_t ref;
1122 1162
1123 unsigned int group_weight; 1163 unsigned int group_weight;
1124 struct sched_group_capacity *sgc; 1164 struct sched_group_capacity *sgc;
1125 int asym_prefer_cpu; /* cpu of highest priority in group */ 1165 int asym_prefer_cpu; /* CPU of highest priority in group */
1126 1166
1127 /* 1167 /*
1128 * The CPUs this group covers. 1168 * The CPUs this group covers.
@@ -1131,7 +1171,7 @@ struct sched_group {
1131 * by attaching extra space to the end of the structure, 1171 * by attaching extra space to the end of the structure,
1132 * depending on how many CPUs the kernel has booted up with) 1172 * depending on how many CPUs the kernel has booted up with)
1133 */ 1173 */
1134 unsigned long cpumask[0]; 1174 unsigned long cpumask[0];
1135}; 1175};
1136 1176
1137static inline struct cpumask *sched_group_span(struct sched_group *sg) 1177static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1188,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1148} 1188}
1149 1189
1150/** 1190/**
1151 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1191 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1152 * @group: The group whose first cpu is to be returned. 1192 * @group: The group whose first CPU is to be returned.
1153 */ 1193 */
1154static inline unsigned int group_first_cpu(struct sched_group *group) 1194static inline unsigned int group_first_cpu(struct sched_group *group)
1155{ 1195{
@@ -1349,19 +1389,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1349 return p->on_rq == TASK_ON_RQ_MIGRATING; 1389 return p->on_rq == TASK_ON_RQ_MIGRATING;
1350} 1390}
1351 1391
1352#ifndef prepare_arch_switch
1353# define prepare_arch_switch(next) do { } while (0)
1354#endif
1355#ifndef finish_arch_post_lock_switch
1356# define finish_arch_post_lock_switch() do { } while (0)
1357#endif
1358
1359/* 1392/*
1360 * wake flags 1393 * wake flags
1361 */ 1394 */
1362#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1395#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1363#define WF_FORK 0x02 /* child wakeup after fork */ 1396#define WF_FORK 0x02 /* Child wakeup after fork */
1364#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1397#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1365 1398
1366/* 1399/*
1367 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1400 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1405,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1372 * slice expiry etc. 1405 * slice expiry etc.
1373 */ 1406 */
1374 1407
1375#define WEIGHT_IDLEPRIO 3 1408#define WEIGHT_IDLEPRIO 3
1376#define WMULT_IDLEPRIO 1431655765 1409#define WMULT_IDLEPRIO 1431655765
1377 1410
1378extern const int sched_prio_to_weight[40]; 1411extern const int sched_prio_to_weight[40];
1379extern const u32 sched_prio_to_wmult[40]; 1412extern const u32 sched_prio_to_wmult[40];
1380 1413
1381/* 1414/*
1382 * {de,en}queue flags: 1415 * {de,en}queue flags:
@@ -1398,9 +1431,9 @@ extern const u32 sched_prio_to_wmult[40];
1398 */ 1431 */
1399 1432
1400#define DEQUEUE_SLEEP 0x01 1433#define DEQUEUE_SLEEP 0x01
1401#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1434#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1402#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1435#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1403#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1436#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1404 1437
1405#define ENQUEUE_WAKEUP 0x01 1438#define ENQUEUE_WAKEUP 0x01
1406#define ENQUEUE_RESTORE 0x02 1439#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1455,10 @@ struct sched_class {
1422 1455
1423 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1456 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1424 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1457 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1425 void (*yield_task) (struct rq *rq); 1458 void (*yield_task) (struct rq *rq);
1426 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1459 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1427 1460
1428 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1461 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1429 1462
1430 /* 1463 /*
1431 * It is the responsibility of the pick_next_task() method that will 1464 * It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1468,16 @@ struct sched_class {
1435 * May return RETRY_TASK when it finds a higher prio class has runnable 1468 * May return RETRY_TASK when it finds a higher prio class has runnable
1436 * tasks. 1469 * tasks.
1437 */ 1470 */
1438 struct task_struct * (*pick_next_task) (struct rq *rq, 1471 struct task_struct * (*pick_next_task)(struct rq *rq,
1439 struct task_struct *prev, 1472 struct task_struct *prev,
1440 struct rq_flags *rf); 1473 struct rq_flags *rf);
1441 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1474 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1442 1475
1443#ifdef CONFIG_SMP 1476#ifdef CONFIG_SMP
1444 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1477 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1445 void (*migrate_task_rq)(struct task_struct *p); 1478 void (*migrate_task_rq)(struct task_struct *p);
1446 1479
1447 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1480 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1448 1481
1449 void (*set_cpus_allowed)(struct task_struct *p, 1482 void (*set_cpus_allowed)(struct task_struct *p,
1450 const struct cpumask *newmask); 1483 const struct cpumask *newmask);
@@ -1453,31 +1486,31 @@ struct sched_class {
1453 void (*rq_offline)(struct rq *rq); 1486 void (*rq_offline)(struct rq *rq);
1454#endif 1487#endif
1455 1488
1456 void (*set_curr_task) (struct rq *rq); 1489 void (*set_curr_task)(struct rq *rq);
1457 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1490 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1458 void (*task_fork) (struct task_struct *p); 1491 void (*task_fork)(struct task_struct *p);
1459 void (*task_dead) (struct task_struct *p); 1492 void (*task_dead)(struct task_struct *p);
1460 1493
1461 /* 1494 /*
1462 * The switched_from() call is allowed to drop rq->lock, therefore we 1495 * The switched_from() call is allowed to drop rq->lock, therefore we
1463 * cannot assume the switched_from/switched_to pair is serialized by 1496
1464 * rq->lock. They are however serialized by p->pi_lock. 1497 * rq->lock. They are however serialized by p->pi_lock.
1465 */ 1498 */
1466 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1499 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1467 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1500 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1468 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1501 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1469 int oldprio); 1502 int oldprio);
1470 1503
1471 unsigned int (*get_rr_interval) (struct rq *rq, 1504 unsigned int (*get_rr_interval)(struct rq *rq,
1472 struct task_struct *task); 1505 struct task_struct *task);
1473 1506
1474 void (*update_curr) (struct rq *rq); 1507 void (*update_curr)(struct rq *rq);
1475 1508
1476#define TASK_SET_GROUP 0 1509#define TASK_SET_GROUP 0
1477#define TASK_MOVE_GROUP 1 1510#define TASK_MOVE_GROUP 1
1478 1511
1479#ifdef CONFIG_FAIR_GROUP_SCHED 1512#ifdef CONFIG_FAIR_GROUP_SCHED
1480 void (*task_change_group) (struct task_struct *p, int type); 1513 void (*task_change_group)(struct task_struct *p, int type);
1481#endif 1514#endif
1482}; 1515};
1483 1516
@@ -1526,6 +1559,7 @@ static inline void idle_set_state(struct rq *rq,
1526static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1559static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1527{ 1560{
1528 SCHED_WARN_ON(!rcu_read_lock_held()); 1561 SCHED_WARN_ON(!rcu_read_lock_held());
1562
1529 return rq->idle_state; 1563 return rq->idle_state;
1530} 1564}
1531#else 1565#else
@@ -1564,9 +1598,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1564extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1598extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1565extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1599extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1566 1600
1567#define BW_SHIFT 20 1601#define BW_SHIFT 20
1568#define BW_UNIT (1 << BW_SHIFT) 1602#define BW_UNIT (1 << BW_SHIFT)
1569#define RATIO_SHIFT 8 1603#define RATIO_SHIFT 8
1570unsigned long to_ratio(u64 period, u64 runtime); 1604unsigned long to_ratio(u64 period, u64 runtime);
1571 1605
1572extern void init_entity_runnable_average(struct sched_entity *se); 1606extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1574,6 +1608,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
1574 1608
1575#ifdef CONFIG_NO_HZ_FULL 1609#ifdef CONFIG_NO_HZ_FULL
1576extern bool sched_can_stop_tick(struct rq *rq); 1610extern bool sched_can_stop_tick(struct rq *rq);
1611extern int __init sched_tick_offload_init(void);
1577 1612
1578/* 1613/*
1579 * Tick may be needed by tasks in the runqueue depending on their policy and 1614 * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1633,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
1598 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 1633 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1599} 1634}
1600#else 1635#else
1636static inline int sched_tick_offload_init(void) { return 0; }
1601static inline void sched_update_tick_dependency(struct rq *rq) { } 1637static inline void sched_update_tick_dependency(struct rq *rq) { }
1602#endif 1638#endif
1603 1639
@@ -1624,13 +1660,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1624 sched_update_tick_dependency(rq); 1660 sched_update_tick_dependency(rq);
1625} 1661}
1626 1662
1627static inline void rq_last_tick_reset(struct rq *rq)
1628{
1629#ifdef CONFIG_NO_HZ_FULL
1630 rq->last_sched_tick = jiffies;
1631#endif
1632}
1633
1634extern void update_rq_clock(struct rq *rq); 1663extern void update_rq_clock(struct rq *rq);
1635 1664
1636extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1665extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1850,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1821/* 1850/*
1822 * Unfair double_lock_balance: Optimizes throughput at the expense of 1851 * Unfair double_lock_balance: Optimizes throughput at the expense of
1823 * latency by eliminating extra atomic operations when the locks are 1852 * latency by eliminating extra atomic operations when the locks are
1824 * already in proper order on entry. This favors lower cpu-ids and will 1853 * already in proper order on entry. This favors lower CPU-ids and will
1825 * grant the double lock to lower cpus over higher ids under contention, 1854 * grant the double lock to lower CPUs over higher ids under contention,
1826 * regardless of entry order into the function. 1855 * regardless of entry order into the function.
1827 */ 1856 */
1828static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1857static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1854,7 +1883,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1854static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1883static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1855{ 1884{
1856 if (unlikely(!irqs_disabled())) { 1885 if (unlikely(!irqs_disabled())) {
1857 /* printk() doesn't work good under rq->lock */ 1886 /* printk() doesn't work well under rq->lock */
1858 raw_spin_unlock(&this_rq->lock); 1887 raw_spin_unlock(&this_rq->lock);
1859 BUG_ON(1); 1888 BUG_ON(1);
1860 } 1889 }
@@ -2113,15 +2142,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2113#endif /* CONFIG_CPU_FREQ */ 2142#endif /* CONFIG_CPU_FREQ */
2114 2143
2115#ifdef arch_scale_freq_capacity 2144#ifdef arch_scale_freq_capacity
2116#ifndef arch_scale_freq_invariant 2145# ifndef arch_scale_freq_invariant
2117#define arch_scale_freq_invariant() (true) 2146# define arch_scale_freq_invariant() true
2118#endif 2147# endif
2119#else /* arch_scale_freq_capacity */ 2148#else
2120#define arch_scale_freq_invariant() (false) 2149# define arch_scale_freq_invariant() false
2121#endif 2150#endif
2122 2151
2123#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2152#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2124
2125static inline unsigned long cpu_util_dl(struct rq *rq) 2153static inline unsigned long cpu_util_dl(struct rq *rq)
2126{ 2154{
2127 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2155 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2131,5 +2159,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
2131{ 2159{
2132 return rq->cfs.avg.util_avg; 2160 return rq->cfs.avg.util_avg;
2133} 2161}
2134
2135#endif 2162#endif
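The BW_SHIFT/BW_UNIT definitions and the cpu_util_dl() helper in the kernel/sched/sched.h hunk above are plain fixed-point arithmetic: deadline bandwidth (running_bw, this_bw, dl_bw) is tracked in 1/2^20 units and converted to the scheduler's capacity scale for schedutil. A minimal standalone sketch of that conversion follows; it assumes SCHED_CAPACITY_SCALE is 1024 (defined elsewhere in the kernel, not in this diff), simplifies to_ratio() by ignoring the RUNTIME_INF case, and uses made-up runtime/period values.

/*
 * Standalone sketch of the deadline-bandwidth fixed-point math above.
 * BW_SHIFT/BW_UNIT and the cpu_util_dl() conversion mirror the hunk;
 * the example runtime/period values are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT		20
#define BW_UNIT			(1 << BW_SHIFT)
#define SCHED_CAPACITY_SCALE	1024	/* assumed: 1 << SCHED_CAPACITY_SHIFT */

/* simplified to_ratio(): runtime/period expressed in BW_UNIT fixed point */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (!period)
		return 0;
	return (runtime << BW_SHIFT) / period;	/* div64_u64() in the kernel */
}

/* cpu_util_dl(): scale the accumulated running_bw to capacity units */
static unsigned long cpu_util_dl(uint64_t running_bw)
{
	return (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

int main(void)
{
	/* hypothetical task: 3ms runtime every 10ms period => 30% bandwidth */
	uint64_t bw = to_ratio(10000000ULL, 3000000ULL);

	printf("dl_bw       = %llu / %d\n", (unsigned long long)bw, BW_UNIT);
	printf("cpu_util_dl = %lu / %d\n", cpu_util_dl(bw), SCHED_CAPACITY_SCALE);
	return 0;
}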
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2 2/*
3#include <linux/slab.h> 3 * /proc/schedstat implementation
4#include <linux/fs.h> 4 */
5#include <linux/seq_file.h>
6#include <linux/proc_fs.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10/* 7/*
11 * bump this up when changing the output format or the meaning of an existing 8 * Current schedstat API version.
9 *
10 * Bump this up when changing the output format or the meaning of an existing
12 * format, so that tools can adapt (or abort) 11 * format, so that tools can adapt (or abort)
13 */ 12 */
14#define SCHEDSTAT_VERSION 15 13#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This iterator needs some explanation. 77
79 * It returns 1 for the header position. 78 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 79 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 80 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 81 * to use cpumask_* to iterate over the CPUs.
83 */ 82 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 83static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 84{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 98
100 if (n < nr_cpu_ids) 99 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 100 return (void *)(unsigned long)(n + 2);
101
102 return NULL; 102 return NULL;
103} 103}
104 104
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 106{
107 (*offset)++; 107 (*offset)++;
108
108 return schedstat_start(file, offset); 109 return schedstat_start(file, offset);
109} 110}
110 111
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 135static int __init proc_schedstat_init(void)
135{ 136{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 137 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
138
137 return 0; 139 return 0;
138} 140}
139subsys_initcall(proc_schedstat_init); 141subsys_initcall(proc_schedstat_init);
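The SCHEDSTAT_VERSION comment above asks consumers of /proc/schedstat to adapt or abort when the version is bumped. A minimal userspace sketch of that check, assuming the header line emitted by show_schedstat() reads "version <N>":

/* Consumer-side sketch for the "adapt (or abort)" rule above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	int version = 0;

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	if (fscanf(f, "version %d", &version) != 1 || version != 15) {
		fprintf(stderr, "unsupported schedstat version %d, aborting\n",
			version);
		fclose(f);
		return 1;
	}
	/* ...parse the per-CPU and per-domain lines here... */
	fclose(f);
	return 0;
}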
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
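The schedstat_*() macros rewritten above all follow one shape: each update is either compiled out entirely (!CONFIG_SCHEDSTATS) or guarded by the sched_schedstats static branch, with the __schedstat_*() forms reserved for paths that have already tested schedstat_enabled() once. A runnable userspace model of that shape follows; the names are invented and a plain bool stands in for the static key and for the kernel.sched_schedstats sysctl.

/*
 * Userspace model of the schedstat_*() pattern: one global switch and
 * macros that either update a counter or do nothing.  Only the shape
 * matches the kernel macros; none of this is kernel code.
 */
#include <stdio.h>
#include <stdbool.h>

static bool stats_enabled = true;	/* stand-in for the static branch */

#define stat_enabled()		(stats_enabled)
#define __stat_inc(var)		do { (var)++; } while (0)
#define stat_inc(var)		do { if (stat_enabled()) (var)++; } while (0)
#define stat_set(var, val)	do { if (stat_enabled()) (var) = (val); } while (0)

struct rq_stats {
	unsigned int		yld_count;
	unsigned int		ttwu_count;
	unsigned long long	last_arrival;
};

int main(void)
{
	struct rq_stats rs = { 0 };

	stat_inc(rs.ttwu_count);		/* guarded update */
	stat_set(rs.last_arrival, 12345ULL);

	stats_enabled = false;			/* like disabling schedstats */
	stat_inc(rs.yld_count);			/* now a no-op */

	printf("ttwu=%u yld=%u arrival=%llu\n",
	       rs.ttwu_count, rs.yld_count, rs.last_arrival);
	return 0;
}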
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
75 cgroup_account_cputime(curr, delta_exec); 74 cgroup_account_cputime(curr, delta_exec);
76} 75}
77 76
77/*
78 * scheduler tick hitting a task of our scheduling class.
79 *
80 * NOTE: This function can be called remotely by the tick offload that
81 * goes along full dynticks. Therefore no local assumption can be made
82 * and everything must be accessed through the @rq and @curr passed in
83 * parameters.
84 */
78static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 85static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
79{ 86{
80} 87}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/sched/signal.h> 2/*
3#include <linux/swait.h> 3 * <linux/swait.h> (simple wait queues) implementation:
4 */
5#include "sched.h"
4 6
5void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
6 struct lock_class_key *key) 8 struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
2/* 2/*
3 * Scheduler topology setup/handling methods 3 * Scheduler topology setup/handling methods
4 */ 4 */
5#include <linux/sched.h>
6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
8
9#include "sched.h" 5#include "sched.h"
10 6
11DEFINE_MUTEX(sched_domains_mutex); 7DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 37 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 38 printk("does not load-balance\n");
43 if (sd->parent) 39 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 41 return -1;
47 } 42 }
48 43
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 46
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 49 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 50 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 52 }
60 53
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 108
116 if (sd->parent && 109 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 112 return 0;
121} 113}
122 114
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 587 * are not.
596 * 588 *
597 * This leads to a few particularly weird cases where the sched_domain's are 589 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 590 * not of the same number for each CPU. Consider:
599 * 591 *
600 * NUMA-2 0-3 0-3 592 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 593 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
780 * ^ ^ ^ ^ 772 * ^ ^ ^ ^
781 * `-' `-' 773 * `-' `-'
782 * 774 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 775 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 776 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 777 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 778 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1013 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1014 if (!d->rd)
1023 return sa_sd; 1015 return sa_sd;
1016
1024 return sa_rootdomain; 1017 return sa_rootdomain;
1025} 1018}
1026 1019
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1040}
1048 1041
1049#ifdef CONFIG_NUMA 1042#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1043enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1044
1053int sched_max_numa_distance; 1045static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1046static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1047
1048int sched_max_numa_distance;
1049static int *sched_domains_numa_distance;
1050static struct cpumask ***sched_domains_numa_masks;
1056#endif 1051#endif
1057 1052
1058/* 1053/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1069 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1070 */
1076#define TOPOLOGY_SD_FLAGS \ 1071#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1072 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1073 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1074 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1075 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1076 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1077 SD_SHARE_POWERDOMAIN)
1083 1078
1084static struct sched_domain * 1079static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1623 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1624 child->name, sd->name);
1630#endif 1625#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1626 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1627 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1628 sched_domain_span(sd),
1634 sched_domain_span(child)); 1629 sched_domain_span(child));
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1715 ret = 0;
1721error: 1716error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1717 __free_domain_allocs(&d, alloc_state, cpu_map);
1718
1723 return ret; 1719 return ret;
1724} 1720}
1725 1721
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1820 return 1;
1825 1821
1826 tmp = SD_ATTR_INIT; 1822 tmp = SD_ATTR_INIT;
1823
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1824 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1825 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1826 sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
1929 1926
1930 mutex_unlock(&sched_domains_mutex); 1927 mutex_unlock(&sched_domains_mutex);
1931} 1928}
1932
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include "sched.h"
7#include <linux/export.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/debug.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13#include <linux/kthread.h>
14 7
15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 8void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 9{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 100 break;
108 } 101 }
109 } 102 }
103
110 return nr_exclusive; 104 return nr_exclusive;
111} 105}
112 106
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 311 spin_unlock(&wq->lock);
318 schedule(); 312 schedule();
319 spin_lock(&wq->lock); 313 spin_lock(&wq->lock);
314
320 return 0; 315 return 0;
321} 316}
322EXPORT_SYMBOL(do_wait_intr); 317EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 328 spin_unlock_irq(&wq->lock);
334 schedule(); 329 schedule();
335 spin_lock_irq(&wq->lock); 330 spin_lock_irq(&wq->lock);
331
336 return 0; 332 return 0;
337} 333}
338EXPORT_SYMBOL(do_wait_intr_irq); 334EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 374
379 if (ret) 375 if (ret)
380 list_del_init(&wq_entry->entry); 376 list_del_init(&wq_entry->entry);
377
381 return ret; 378 return ret;
382} 379}
383EXPORT_SYMBOL(autoremove_wake_function); 380EXPORT_SYMBOL(autoremove_wake_function);
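__wake_up_common(), do_wait_intr*() and autoremove_wake_function() in the kernel/sched/wait.c hunk above are the machinery behind the standard open-coded wait loop. The following out-of-tree module is a hedged sketch of that loop, not anything from this series: the module name, flag and kthread are invented; DEFINE_WAIT() installs autoremove_wake_function() as the wake callback, and wake_up() ends up in __wake_up_common().

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/delay.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;
static struct task_struct *demo_waker;

static int demo_waker_fn(void *unused)
{
	msleep(100);			/* pretend some work happens here */
	demo_ready = true;
	wake_up(&demo_wq);		/* scans demo_wq via __wake_up_common() */
	while (!kthread_should_stop())
		msleep(50);
	return 0;
}

static int __init wait_demo_init(void)
{
	DEFINE_WAIT(wait);		/* entry using autoremove_wake_function() */

	demo_waker = kthread_run(demo_waker_fn, NULL, "wait_demo");
	if (IS_ERR(demo_waker))
		return PTR_ERR(demo_waker);

	for (;;) {
		prepare_to_wait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (demo_ready)
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wait);
	pr_info("wait_demo: condition observed\n");
	return 0;
}

static void __exit wait_demo_exit(void)
{
	kthread_stop(demo_waker);
}

module_init(wait_demo_init);
module_exit(wait_demo_exit);
MODULE_LICENSE("GPL");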
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..ed84ab245a05 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
1/* 1/*
2 * The implementation of the wait_bit*() and related waiting APIs: 2 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 3 */
4#include <linux/wait_bit.h> 4#include "sched.h"
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8 5
9#define WAIT_TABLE_BITS 8 6#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 7#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 26 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 27 test_bit(key->bit_nr, key->flags))
31 return 0; 28 return 0;
32 else 29
33 return autoremove_wake_function(wq_entry, mode, sync, key); 30 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 31}
35EXPORT_SYMBOL(wake_bit_function); 32EXPORT_SYMBOL(wake_bit_function);
36 33
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 47 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 48 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 49 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
50
53 finish_wait(wq_head, &wbq_entry->wq_entry); 51 finish_wait(wq_head, &wbq_entry->wq_entry);
52
54 return ret; 53 return ret;
55} 54}
56EXPORT_SYMBOL(__wait_on_bit); 55EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 72 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 73
75 wq_entry.key.timeout = jiffies + timeout; 74 wq_entry.key.timeout = jiffies + timeout;
75
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123
123 if (waitqueue_active(wq_head)) 124 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 125 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 126}
@@ -148,6 +149,54 @@ void wake_up_bit(void *word, int bit)
148} 149}
149EXPORT_SYMBOL(wake_up_bit); 150EXPORT_SYMBOL(wake_up_bit);
150 151
152wait_queue_head_t *__var_waitqueue(void *p)
153{
154 if (BITS_PER_LONG == 64) {
155 unsigned long q = (unsigned long)p;
156
157 return bit_waitqueue((void *)(q & ~1), q & 1);
158 }
159 return bit_waitqueue(p, 0);
160}
161EXPORT_SYMBOL(__var_waitqueue);
162
163static int
164var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
165 int sync, void *arg)
166{
167 struct wait_bit_key *key = arg;
168 struct wait_bit_queue_entry *wbq_entry =
169 container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
170
171 if (wbq_entry->key.flags != key->flags ||
172 wbq_entry->key.bit_nr != key->bit_nr)
173 return 0;
174
175 return autoremove_wake_function(wq_entry, mode, sync, key);
176}
177
178void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
179{
180 *wbq_entry = (struct wait_bit_queue_entry){
181 .key = {
182 .flags = (var),
183 .bit_nr = -1,
184 },
185 .wq_entry = {
186 .private = current,
187 .func = var_wake_function,
188 .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
189 },
190 };
191}
192EXPORT_SYMBOL(init_wait_var_entry);
193
194void wake_up_var(void *var)
195{
196 __wake_up_bit(__var_waitqueue(var), var, -1);
197}
198EXPORT_SYMBOL(wake_up_var);
199
151/* 200/*
152 * Manipulate the atomic_t address to produce a better bit waitqueue table hash 201 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
153 * index (we're keying off bit -1, but that would produce a horrible hash 202 * index (we're keying off bit -1, but that would produce a horrible hash
@@ -157,6 +206,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 206{
158 if (BITS_PER_LONG == 64) { 207 if (BITS_PER_LONG == 64) {
159 unsigned long q = (unsigned long)p; 208 unsigned long q = (unsigned long)p;
209
160 return bit_waitqueue((void *)(q & ~1), q & 1); 210 return bit_waitqueue((void *)(q & ~1), q & 1);
161 } 211 }
162 return bit_waitqueue(p, 0); 212 return bit_waitqueue(p, 0);
@@ -173,6 +223,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
173 wait_bit->key.bit_nr != key->bit_nr || 223 wait_bit->key.bit_nr != key->bit_nr ||
174 atomic_read(val) != 0) 224 atomic_read(val) != 0)
175 return 0; 225 return 0;
226
176 return autoremove_wake_function(wq_entry, mode, sync, key); 227 return autoremove_wake_function(wq_entry, mode, sync, key);
177} 228}
178 229
@@ -196,6 +247,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
196 ret = (*action)(val, mode); 247 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0); 248 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry); 249 finish_wait(wq_head, &wbq_entry->wq_entry);
250
199 return ret; 251 return ret;
200} 252}
201 253
@@ -226,6 +278,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
226 schedule(); 278 schedule();
227 if (signal_pending_state(mode, current)) 279 if (signal_pending_state(mode, current))
228 return -EINTR; 280 return -EINTR;
281
229 return 0; 282 return 0;
230} 283}
231EXPORT_SYMBOL(atomic_t_wait); 284EXPORT_SYMBOL(atomic_t_wait);
@@ -250,6 +303,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
250 schedule(); 303 schedule();
251 if (signal_pending_state(mode, current)) 304 if (signal_pending_state(mode, current))
252 return -EINTR; 305 return -EINTR;
306
253 return 0; 307 return 0;
254} 308}
255EXPORT_SYMBOL(bit_wait); 309EXPORT_SYMBOL(bit_wait);
@@ -259,6 +313,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 313 io_schedule();
260 if (signal_pending_state(mode, current)) 314 if (signal_pending_state(mode, current))
261 return -EINTR; 315 return -EINTR;
316
262 return 0; 317 return 0;
263} 318}
264EXPORT_SYMBOL(bit_wait_io); 319EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +321,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 321__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 322{
268 unsigned long now = READ_ONCE(jiffies); 323 unsigned long now = READ_ONCE(jiffies);
324
269 if (time_after_eq(now, word->timeout)) 325 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 326 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 327 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 328 if (signal_pending_state(mode, current))
273 return -EINTR; 329 return -EINTR;
330
274 return 0; 331 return 0;
275} 332}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 333EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +335,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 335__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 336{
280 unsigned long now = READ_ONCE(jiffies); 337 unsigned long now = READ_ONCE(jiffies);
338
281 if (time_after_eq(now, word->timeout)) 339 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 340 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 341 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 342 if (signal_pending_state(mode, current))
285 return -EINTR; 343 return -EINTR;
344
286 return 0; 345 return 0;
287} 346}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 347EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
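The new __var_waitqueue()/init_wait_var_entry()/wake_up_var() code added above lets any variable borrow a hashed bit-waitqueue (keyed on bit -1). A hedged usage sketch follows, assuming the matching wait_var_event() macro lands in <linux/wait_bit.h> as part of the same series (the diffstat touches that header); the module, counter and timings are invented for illustration.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/delay.h>

static atomic_t demo_refs = ATOMIC_INIT(1);
static struct task_struct *worker;

static int demo_worker(void *unused)
{
	msleep(100);				/* pretend to do work */
	if (atomic_dec_and_test(&demo_refs))
		wake_up_var(&demo_refs);	/* -> __wake_up_bit() on &demo_refs */
	while (!kthread_should_stop())
		msleep(50);
	return 0;
}

static int __init waitvar_demo_init(void)
{
	worker = kthread_run(demo_worker, NULL, "waitvar_demo");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* sleeps on __var_waitqueue(&demo_refs) until the condition holds */
	wait_var_event(&demo_refs, !atomic_read(&demo_refs));
	pr_info("waitvar_demo: refs dropped to zero\n");
	return 0;
}

static void __exit waitvar_demo_exit(void)
{
	kthread_stop(worker);
}

module_init(waitvar_demo_init);
module_exit(waitvar_demo_exit);
MODULE_LICENSE("GPL");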
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..f2fa2e940fe5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -481,11 +481,18 @@ static int __init setup_tick_nohz(char *str)
481 481
482__setup("nohz=", setup_tick_nohz); 482__setup("nohz=", setup_tick_nohz);
483 483
484int tick_nohz_tick_stopped(void) 484bool tick_nohz_tick_stopped(void)
485{ 485{
486 return __this_cpu_read(tick_cpu_sched.tick_stopped); 486 return __this_cpu_read(tick_cpu_sched.tick_stopped);
487} 487}
488 488
489bool tick_nohz_tick_stopped_cpu(int cpu)
490{
491 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
492
493 return ts->tick_stopped;
494}
495
489/** 496/**
490 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 497 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
491 * 498 *
@@ -741,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
741 delta = KTIME_MAX; 748 delta = KTIME_MAX;
742 } 749 }
743 750
744#ifdef CONFIG_NO_HZ_FULL
745 /* Limit the tick delta to the maximum scheduler deferment */
746 if (!ts->inidle)
747 delta = min(delta, scheduler_tick_max_deferment());
748#endif
749
750 /* Calculate the next expiry time */ 751 /* Calculate the next expiry time */
751 if (delta < (KTIME_MAX - basemono)) 752 if (delta < (KTIME_MAX - basemono))
752 expires = basemono + delta; 753 expires = basemono + delta;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6ec6ba65127b..254e636a3d6b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
5573int __init workqueue_init_early(void) 5573int __init workqueue_init_early(void)
5574{ 5574{
5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5576 int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
5576 int i, cpu; 5577 int i, cpu;
5577 5578
5578 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5579 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5579 5580
5580 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5581 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5581 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); 5582 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
5582 5583
5583 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5584 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5584 5585