author		Dan Williams <dan.j.williams@intel.com>	2018-04-09 13:50:17 -0400
committer	Dan Williams <dan.j.williams@intel.com>	2018-04-09 13:50:17 -0400
commit		e13e75b86ef2f88e3a47d672dd4c52a293efb95b (patch)
tree		2617aebd952d1aec09d323f6b2484b93f659e753
parent		1ed41b5696ccc3ff40a1dee39fe14eff273faf82 (diff)
parent		976431b02c2ef92ae3f8b6a7d699fc554025e118 (diff)

Merge branch 'for-4.17/dax' into libnvdimm-for-next

60 files changed, 1637 insertions, 1298 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..50b9837e985b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@ | |||
1766 | 1766 | ||
1767 | nohz | 1767 | nohz |
1768 | Disable the tick when a single task runs. | 1768 | Disable the tick when a single task runs. |
1769 | |||
1770 | A residual 1Hz tick is offloaded to workqueues, which you | ||
1771 | need to affine to housekeeping through the global | ||
1772 | workqueue's affinity configured via the | ||
1773 | /sys/devices/virtual/workqueue/cpumask sysfs file, or | ||
1774 | by using the 'domain' flag described below. | ||
1775 | |||
1776 | NOTE: by default the global workqueue runs on all CPUs, | ||
1777 | so to protect individual CPUs the 'cpumask' file has to | ||
1778 | be configured manually after bootup. | ||
1779 | |||
1769 | domain | 1780 | domain |
1770 | Isolate from the general SMP balancing and scheduling | 1781 | Isolate from the general SMP balancing and scheduling |
1771 | algorithms. Note that performing domain isolation this way | 1782 | algorithms. Note that performing domain isolation this way |
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index b79aa8f7a497..e0700bf4893a 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,3 +1,7 @@ | |||
1 | config DAX_DRIVER | ||
2 | select DAX | ||
3 | bool | ||
4 | |||
1 | menuconfig DAX | 5 | menuconfig DAX |
2 | tristate "DAX: direct access to differentiated memory" | 6 | tristate "DAX: direct access to differentiated memory" |
3 | select SRCU | 7 | select SRCU |
@@ -16,7 +20,6 @@ config DEV_DAX | |||
16 | baseline memory pool. Mappings of a /dev/daxX.Y device impose | 20 | baseline memory pool. Mappings of a /dev/daxX.Y device impose |
17 | restrictions that make the mapping behavior deterministic. | 21 | restrictions that make the mapping behavior deterministic. |
18 | 22 | ||
19 | |||
20 | config DEV_DAX_PMEM | 23 | config DEV_DAX_PMEM |
21 | tristate "PMEM DAX: direct access to persistent memory" | 24 | tristate "PMEM DAX: direct access to persistent memory" |
22 | depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX | 25 | depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX |
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index ecdc292aa4e4..2b2332b605e4 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -124,10 +124,19 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize) | |||
124 | return len < 0 ? len : -EIO; | 124 | return len < 0 ? len : -EIO; |
125 | } | 125 | } |
126 | 126 | ||
127 | if ((IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) | 127 | if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) { |
128 | || pfn_t_devmap(pfn)) | 128 | /* |
129 | * An arch that has enabled the pmem api should also | ||
130 | * have its drivers support pfn_t_devmap() | ||
131 | * | ||
132 | * This is a developer warning and should not trigger in | ||
133 | * production. dax_flush() will crash since it depends | ||
134 | * on being able to do (page_address(pfn_to_page())). | ||
135 | */ | ||
136 | WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API)); | ||
137 | } else if (pfn_t_devmap(pfn)) { | ||
129 | /* pass */; | 138 | /* pass */; |
130 | else { | 139 | } else { |
131 | pr_debug("VFS (%s): error: dax support not enabled\n", | 140 | pr_debug("VFS (%s): error: dax support not enabled\n", |
132 | sb->s_id); | 141 | sb->s_id); |
133 | return -EOPNOTSUPP; | 142 | return -EOPNOTSUPP; |
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2c8ac3688815..edff083f7c4e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN | |||
201 | config BLK_DEV_DM | 201 | config BLK_DEV_DM |
202 | tristate "Device mapper support" | 202 | tristate "Device mapper support" |
203 | select BLK_DEV_DM_BUILTIN | 203 | select BLK_DEV_DM_BUILTIN |
204 | select DAX | 204 | depends on DAX || DAX=n |
205 | ---help--- | 205 | ---help--- |
206 | Device-mapper is a low level volume manager. It works by allowing | 206 | Device-mapper is a low level volume manager. It works by allowing |
207 | people to specify mappings for ranges of logical sectors. Various | 207 | people to specify mappings for ranges of logical sectors. Various |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index d5f8eff7c11d..89443e0ededa 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti, | |||
154 | return fn(ti, lc->dev, lc->start, ti->len, data); | 154 | return fn(ti, lc->dev, lc->start, ti->len, data); |
155 | } | 155 | } |
156 | 156 | ||
157 | #if IS_ENABLED(CONFIG_DAX_DRIVER) | ||
157 | static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, | 158 | static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, |
158 | long nr_pages, void **kaddr, pfn_t *pfn) | 159 | long nr_pages, void **kaddr, pfn_t *pfn) |
159 | { | 160 | { |
@@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, | |||
184 | return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); | 185 | return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); |
185 | } | 186 | } |
186 | 187 | ||
188 | #else | ||
189 | #define linear_dax_direct_access NULL | ||
190 | #define linear_dax_copy_from_iter NULL | ||
191 | #endif | ||
192 | |||
187 | static struct target_type linear_target = { | 193 | static struct target_type linear_target = { |
188 | .name = "linear", | 194 | .name = "linear", |
189 | .version = {1, 4, 0}, | 195 | .version = {1, 4, 0}, |
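[Editor's note, not part of the merge above] The dm-linear hunk wraps its DAX handlers in #if IS_ENABLED(CONFIG_DAX_DRIVER) and #defines them to NULL otherwise, so the target_type table can reference the names unconditionally. A self-contained userspace sketch of that stub pattern follows; DEMO_FEATURE, demo_ops, and demo_read are invented names, not kernel APIs.

/*
 * Stub pattern sketch: when the feature is compiled out, the handler name
 * becomes NULL, so the ops table needs no #ifdef of its own.
 */
#include <stddef.h>
#include <stdio.h>

#define DEMO_FEATURE 1			/* flip to 0 to "compile out" the feature */

#if DEMO_FEATURE
static long demo_read(void *buf, long len)
{
	/* real implementation would go here */
	(void)buf;
	return len;
}
#else
#define demo_read NULL			/* slot stays NULL, table unchanged */
#endif

struct demo_ops {
	const char *name;
	long (*read)(void *buf, long len);
};

static const struct demo_ops demo = {
	.name = "demo",
	.read = demo_read,		/* either the function or NULL */
};

int main(void)
{
	if (demo.read)
		printf("read -> %ld\n", demo.read(NULL, 42));
	else
		printf("feature compiled out, .read is NULL\n");
	return 0;
}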
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 3362d866793b..7fcb4216973f 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -610,51 +610,6 @@ static int log_mark(struct log_writes_c *lc, char *data) | |||
610 | return 0; | 610 | return 0; |
611 | } | 611 | } |
612 | 612 | ||
613 | static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, | ||
614 | struct iov_iter *i) | ||
615 | { | ||
616 | struct pending_block *block; | ||
617 | |||
618 | if (!bytes) | ||
619 | return 0; | ||
620 | |||
621 | block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); | ||
622 | if (!block) { | ||
623 | DMERR("Error allocating dax pending block"); | ||
624 | return -ENOMEM; | ||
625 | } | ||
626 | |||
627 | block->data = kzalloc(bytes, GFP_KERNEL); | ||
628 | if (!block->data) { | ||
629 | DMERR("Error allocating dax data space"); | ||
630 | kfree(block); | ||
631 | return -ENOMEM; | ||
632 | } | ||
633 | |||
634 | /* write data provided via the iterator */ | ||
635 | if (!copy_from_iter(block->data, bytes, i)) { | ||
636 | DMERR("Error copying dax data"); | ||
637 | kfree(block->data); | ||
638 | kfree(block); | ||
639 | return -EIO; | ||
640 | } | ||
641 | |||
642 | /* rewind the iterator so that the block driver can use it */ | ||
643 | iov_iter_revert(i, bytes); | ||
644 | |||
645 | block->datalen = bytes; | ||
646 | block->sector = bio_to_dev_sectors(lc, sector); | ||
647 | block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; | ||
648 | |||
649 | atomic_inc(&lc->pending_blocks); | ||
650 | spin_lock_irq(&lc->blocks_lock); | ||
651 | list_add_tail(&block->list, &lc->unflushed_blocks); | ||
652 | spin_unlock_irq(&lc->blocks_lock); | ||
653 | wake_up_process(lc->log_kthread); | ||
654 | |||
655 | return 0; | ||
656 | } | ||
657 | |||
658 | static void log_writes_dtr(struct dm_target *ti) | 613 | static void log_writes_dtr(struct dm_target *ti) |
659 | { | 614 | { |
660 | struct log_writes_c *lc = ti->private; | 615 | struct log_writes_c *lc = ti->private; |
@@ -920,6 +875,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit | |||
920 | limits->io_min = limits->physical_block_size; | 875 | limits->io_min = limits->physical_block_size; |
921 | } | 876 | } |
922 | 877 | ||
878 | #if IS_ENABLED(CONFIG_DAX_DRIVER) | ||
879 | static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes, | ||
880 | struct iov_iter *i) | ||
881 | { | ||
882 | struct pending_block *block; | ||
883 | |||
884 | if (!bytes) | ||
885 | return 0; | ||
886 | |||
887 | block = kzalloc(sizeof(struct pending_block), GFP_KERNEL); | ||
888 | if (!block) { | ||
889 | DMERR("Error allocating dax pending block"); | ||
890 | return -ENOMEM; | ||
891 | } | ||
892 | |||
893 | block->data = kzalloc(bytes, GFP_KERNEL); | ||
894 | if (!block->data) { | ||
895 | DMERR("Error allocating dax data space"); | ||
896 | kfree(block); | ||
897 | return -ENOMEM; | ||
898 | } | ||
899 | |||
900 | /* write data provided via the iterator */ | ||
901 | if (!copy_from_iter(block->data, bytes, i)) { | ||
902 | DMERR("Error copying dax data"); | ||
903 | kfree(block->data); | ||
904 | kfree(block); | ||
905 | return -EIO; | ||
906 | } | ||
907 | |||
908 | /* rewind the iterator so that the block driver can use it */ | ||
909 | iov_iter_revert(i, bytes); | ||
910 | |||
911 | block->datalen = bytes; | ||
912 | block->sector = bio_to_dev_sectors(lc, sector); | ||
913 | block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift; | ||
914 | |||
915 | atomic_inc(&lc->pending_blocks); | ||
916 | spin_lock_irq(&lc->blocks_lock); | ||
917 | list_add_tail(&block->list, &lc->unflushed_blocks); | ||
918 | spin_unlock_irq(&lc->blocks_lock); | ||
919 | wake_up_process(lc->log_kthread); | ||
920 | |||
921 | return 0; | ||
922 | } | ||
923 | |||
923 | static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, | 924 | static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, |
924 | long nr_pages, void **kaddr, pfn_t *pfn) | 925 | long nr_pages, void **kaddr, pfn_t *pfn) |
925 | { | 926 | { |
@@ -956,6 +957,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti, | |||
956 | dax_copy: | 957 | dax_copy: |
957 | return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); | 958 | return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i); |
958 | } | 959 | } |
960 | #else | ||
961 | #define log_writes_dax_direct_access NULL | ||
962 | #define log_writes_dax_copy_from_iter NULL | ||
963 | #endif | ||
959 | 964 | ||
960 | static struct target_type log_writes_target = { | 965 | static struct target_type log_writes_target = { |
961 | .name = "log-writes", | 966 | .name = "log-writes", |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b5e892149c54..ac2e8ee9d586 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -311,6 +311,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio) | |||
311 | return DM_MAPIO_REMAPPED; | 311 | return DM_MAPIO_REMAPPED; |
312 | } | 312 | } |
313 | 313 | ||
314 | #if IS_ENABLED(CONFIG_DAX_DRIVER) | ||
314 | static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, | 315 | static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff, |
315 | long nr_pages, void **kaddr, pfn_t *pfn) | 316 | long nr_pages, void **kaddr, pfn_t *pfn) |
316 | { | 317 | { |
@@ -351,6 +352,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff, | |||
351 | return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); | 352 | return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i); |
352 | } | 353 | } |
353 | 354 | ||
355 | #else | ||
356 | #define stripe_dax_direct_access NULL | ||
357 | #define stripe_dax_copy_from_iter NULL | ||
358 | #endif | ||
359 | |||
354 | /* | 360 | /* |
355 | * Stripe status: | 361 | * Stripe status: |
356 | * | 362 | * |
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 45328d8b2859..bac79f40f3cb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1805,7 +1805,7 @@ static void cleanup_mapped_device(struct mapped_device *md) | |||
1805 | static struct mapped_device *alloc_dev(int minor) | 1805 | static struct mapped_device *alloc_dev(int minor) |
1806 | { | 1806 | { |
1807 | int r, numa_node_id = dm_get_numa_node(); | 1807 | int r, numa_node_id = dm_get_numa_node(); |
1808 | struct dax_device *dax_dev; | 1808 | struct dax_device *dax_dev = NULL; |
1809 | struct mapped_device *md; | 1809 | struct mapped_device *md; |
1810 | void *old_md; | 1810 | void *old_md; |
1811 | 1811 | ||
@@ -1871,9 +1871,11 @@ static struct mapped_device *alloc_dev(int minor) | |||
1871 | md->disk->private_data = md; | 1871 | md->disk->private_data = md; |
1872 | sprintf(md->disk->disk_name, "dm-%d", minor); | 1872 | sprintf(md->disk->disk_name, "dm-%d", minor); |
1873 | 1873 | ||
1874 | dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); | 1874 | if (IS_ENABLED(CONFIG_DAX_DRIVER)) { |
1875 | if (!dax_dev) | 1875 | dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops); |
1876 | goto bad; | 1876 | if (!dax_dev) |
1877 | goto bad; | ||
1878 | } | ||
1877 | md->dax_dev = dax_dev; | 1879 | md->dax_dev = dax_dev; |
1878 | 1880 | ||
1879 | add_disk_no_queue_reg(md->disk); | 1881 | add_disk_no_queue_reg(md->disk); |
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index f6c533c4d09b..85997184e047 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -20,7 +20,7 @@ if LIBNVDIMM | |||
20 | config BLK_DEV_PMEM | 20 | config BLK_DEV_PMEM |
21 | tristate "PMEM: Persistent memory block device support" | 21 | tristate "PMEM: Persistent memory block device support" |
22 | default LIBNVDIMM | 22 | default LIBNVDIMM |
23 | select DAX | 23 | select DAX_DRIVER |
24 | select ND_BTT if BTT | 24 | select ND_BTT if BTT |
25 | select ND_PFN if NVDIMM_PFN | 25 | select ND_PFN if NVDIMM_PFN |
26 | help | 26 | help |
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 1444333210c7..9ac7574e3cfb 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -15,8 +15,8 @@ config BLK_DEV_XPRAM | |||
15 | 15 | ||
16 | config DCSSBLK | 16 | config DCSSBLK |
17 | def_tristate m | 17 | def_tristate m |
18 | select DAX | ||
19 | select FS_DAX_LIMITED | 18 | select FS_DAX_LIMITED |
19 | select DAX_DRIVER | ||
20 | prompt "DCSSBLK support" | 20 | prompt "DCSSBLK support" |
21 | depends on S390 && BLOCK | 21 | depends on S390 && BLOCK |
22 | help | 22 | help |
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe09ef9c21f3..846ee2d31781 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1946,11 +1946,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) | |||
1946 | static int blkdev_writepages(struct address_space *mapping, | 1946 | static int blkdev_writepages(struct address_space *mapping, |
1947 | struct writeback_control *wbc) | 1947 | struct writeback_control *wbc) |
1948 | { | 1948 | { |
1949 | if (dax_mapping(mapping)) { | ||
1950 | struct block_device *bdev = I_BDEV(mapping->host); | ||
1951 | |||
1952 | return dax_writeback_mapping_range(mapping, bdev, wbc); | ||
1953 | } | ||
1954 | return generic_writepages(mapping, wbc); | 1949 | return generic_writepages(mapping, wbc); |
1955 | } | 1950 | } |
1956 | 1951 | ||
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table);
73 | #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) | 73 | #define RADIX_DAX_ZERO_PAGE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) |
74 | #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) | 74 | #define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3)) |
75 | 75 | ||
76 | static unsigned long dax_radix_sector(void *entry) | 76 | static unsigned long dax_radix_pfn(void *entry) |
77 | { | 77 | { |
78 | return (unsigned long)entry >> RADIX_DAX_SHIFT; | 78 | return (unsigned long)entry >> RADIX_DAX_SHIFT; |
79 | } | 79 | } |
80 | 80 | ||
81 | static void *dax_radix_locked_entry(sector_t sector, unsigned long flags) | 81 | static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags) |
82 | { | 82 | { |
83 | return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | | 83 | return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags | |
84 | ((unsigned long)sector << RADIX_DAX_SHIFT) | | 84 | (pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK); |
85 | RADIX_DAX_ENTRY_LOCK); | ||
86 | } | 85 | } |
87 | 86 | ||
88 | static unsigned int dax_radix_order(void *entry) | 87 | static unsigned int dax_radix_order(void *entry) |
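[Editor's note, not part of the merge above] The hunk above switches the radix-tree entry encoding from a sector to a pfn shifted past a few flag bits in one unsigned long. A standalone sketch of that encode/decode scheme follows; DEMO_SHIFT and the DEMO_* flag values are illustrative only and differ from the kernel's RADIX_DAX_* definitions earlier in fs/dax.c.

/*
 * Encode a pfn plus flags into one word, the way dax_radix_locked_entry()
 * and dax_radix_pfn() do, with made-up constants.
 */
#include <assert.h>
#include <stdio.h>

#define DEMO_SHIFT	6UL			/* low 6 bits reserved for flags */
#define DEMO_LOCK	(1UL << 0)
#define DEMO_PMD	(1UL << 1)

static unsigned long demo_mk_entry(unsigned long pfn, unsigned long flags)
{
	return (pfn << DEMO_SHIFT) | flags | DEMO_LOCK;
}

static unsigned long demo_entry_pfn(unsigned long entry)
{
	return entry >> DEMO_SHIFT;
}

int main(void)
{
	unsigned long entry = demo_mk_entry(0x12345, DEMO_PMD);

	assert(demo_entry_pfn(entry) == 0x12345);
	printf("entry=%#lx pfn=%#lx pmd=%d locked=%d\n",
	       entry, demo_entry_pfn(entry),
	       !!(entry & DEMO_PMD), !!(entry & DEMO_LOCK));
	return 0;
}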
@@ -299,6 +298,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping, | |||
299 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); | 298 | dax_wake_mapping_entry_waiter(mapping, index, entry, false); |
300 | } | 299 | } |
301 | 300 | ||
301 | static unsigned long dax_entry_size(void *entry) | ||
302 | { | ||
303 | if (dax_is_zero_entry(entry)) | ||
304 | return 0; | ||
305 | else if (dax_is_empty_entry(entry)) | ||
306 | return 0; | ||
307 | else if (dax_is_pmd_entry(entry)) | ||
308 | return PMD_SIZE; | ||
309 | else | ||
310 | return PAGE_SIZE; | ||
311 | } | ||
312 | |||
313 | static unsigned long dax_radix_end_pfn(void *entry) | ||
314 | { | ||
315 | return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; | ||
316 | } | ||
317 | |||
318 | /* | ||
319 | * Iterate through all mapped pfns represented by an entry, i.e. skip | ||
320 | * 'empty' and 'zero' entries. | ||
321 | */ | ||
322 | #define for_each_mapped_pfn(entry, pfn) \ | ||
323 | for (pfn = dax_radix_pfn(entry); \ | ||
324 | pfn < dax_radix_end_pfn(entry); pfn++) | ||
325 | |||
326 | static void dax_associate_entry(void *entry, struct address_space *mapping) | ||
327 | { | ||
328 | unsigned long pfn; | ||
329 | |||
330 | if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) | ||
331 | return; | ||
332 | |||
333 | for_each_mapped_pfn(entry, pfn) { | ||
334 | struct page *page = pfn_to_page(pfn); | ||
335 | |||
336 | WARN_ON_ONCE(page->mapping); | ||
337 | page->mapping = mapping; | ||
338 | } | ||
339 | } | ||
340 | |||
341 | static void dax_disassociate_entry(void *entry, struct address_space *mapping, | ||
342 | bool trunc) | ||
343 | { | ||
344 | unsigned long pfn; | ||
345 | |||
346 | if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) | ||
347 | return; | ||
348 | |||
349 | for_each_mapped_pfn(entry, pfn) { | ||
350 | struct page *page = pfn_to_page(pfn); | ||
351 | |||
352 | WARN_ON_ONCE(trunc && page_ref_count(page) > 1); | ||
353 | WARN_ON_ONCE(page->mapping && page->mapping != mapping); | ||
354 | page->mapping = NULL; | ||
355 | } | ||
356 | } | ||
357 | |||
302 | /* | 358 | /* |
303 | * Find radix tree entry at given index. If it points to an exceptional entry, | 359 | * Find radix tree entry at given index. If it points to an exceptional entry, |
304 | * return it with the radix tree entry locked. If the radix tree doesn't | 360 | * return it with the radix tree entry locked. If the radix tree doesn't |
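[Editor's note, not part of the merge above] The hunk above adds for_each_mapped_pfn(), which derives an end pfn from the entry's size and walks every pfn the entry covers so pages can be associated with (or disassociated from) the mapping. A compact sketch of that iteration-macro idea follows; the demo_* names are invented, and the kernel version additionally skips zero and empty entries.

/*
 * Range-iteration macro sketch: an entry records a first pfn and a page
 * count, and the macro visits each covered pfn.
 */
#include <stdio.h>

struct demo_entry {
	unsigned long pfn;		/* first pfn covered by the mapping */
	unsigned long npages;		/* 1 for a PTE, 512 for a 2M PMD, ... */
};

#define demo_end_pfn(e)		((e)->pfn + (e)->npages)

#define for_each_demo_pfn(e, pfn) \
	for (pfn = (e)->pfn; pfn < demo_end_pfn(e); pfn++)

int main(void)
{
	struct demo_entry e = { .pfn = 0x1000, .npages = 4 };
	unsigned long pfn;

	for_each_demo_pfn(&e, pfn)
		printf("pfn %#lx -> associate with mapping\n", pfn);
	return 0;
}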
@@ -405,6 +461,7 @@ restart: | |||
405 | } | 461 | } |
406 | 462 | ||
407 | if (pmd_downgrade) { | 463 | if (pmd_downgrade) { |
464 | dax_disassociate_entry(entry, mapping, false); | ||
408 | radix_tree_delete(&mapping->page_tree, index); | 465 | radix_tree_delete(&mapping->page_tree, index); |
409 | mapping->nrexceptional--; | 466 | mapping->nrexceptional--; |
410 | dax_wake_mapping_entry_waiter(mapping, index, entry, | 467 | dax_wake_mapping_entry_waiter(mapping, index, entry, |
@@ -454,6 +511,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping, | |||
454 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || | 511 | (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || |
455 | radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) | 512 | radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) |
456 | goto out; | 513 | goto out; |
514 | dax_disassociate_entry(entry, mapping, trunc); | ||
457 | radix_tree_delete(page_tree, index); | 515 | radix_tree_delete(page_tree, index); |
458 | mapping->nrexceptional--; | 516 | mapping->nrexceptional--; |
459 | ret = 1; | 517 | ret = 1; |
@@ -526,12 +584,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, | |||
526 | */ | 584 | */ |
527 | static void *dax_insert_mapping_entry(struct address_space *mapping, | 585 | static void *dax_insert_mapping_entry(struct address_space *mapping, |
528 | struct vm_fault *vmf, | 586 | struct vm_fault *vmf, |
529 | void *entry, sector_t sector, | 587 | void *entry, pfn_t pfn_t, |
530 | unsigned long flags, bool dirty) | 588 | unsigned long flags, bool dirty) |
531 | { | 589 | { |
532 | struct radix_tree_root *page_tree = &mapping->page_tree; | 590 | struct radix_tree_root *page_tree = &mapping->page_tree; |
533 | void *new_entry; | 591 | unsigned long pfn = pfn_t_to_pfn(pfn_t); |
534 | pgoff_t index = vmf->pgoff; | 592 | pgoff_t index = vmf->pgoff; |
593 | void *new_entry; | ||
535 | 594 | ||
536 | if (dirty) | 595 | if (dirty) |
537 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); | 596 | __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); |
@@ -546,7 +605,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, | |||
546 | } | 605 | } |
547 | 606 | ||
548 | spin_lock_irq(&mapping->tree_lock); | 607 | spin_lock_irq(&mapping->tree_lock); |
549 | new_entry = dax_radix_locked_entry(sector, flags); | 608 | new_entry = dax_radix_locked_entry(pfn, flags); |
609 | if (dax_entry_size(entry) != dax_entry_size(new_entry)) { | ||
610 | dax_disassociate_entry(entry, mapping, false); | ||
611 | dax_associate_entry(new_entry, mapping); | ||
612 | } | ||
550 | 613 | ||
551 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { | 614 | if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { |
552 | /* | 615 | /* |
@@ -657,17 +720,14 @@ unlock_pte: | |||
657 | i_mmap_unlock_read(mapping); | 720 | i_mmap_unlock_read(mapping); |
658 | } | 721 | } |
659 | 722 | ||
660 | static int dax_writeback_one(struct block_device *bdev, | 723 | static int dax_writeback_one(struct dax_device *dax_dev, |
661 | struct dax_device *dax_dev, struct address_space *mapping, | 724 | struct address_space *mapping, pgoff_t index, void *entry) |
662 | pgoff_t index, void *entry) | ||
663 | { | 725 | { |
664 | struct radix_tree_root *page_tree = &mapping->page_tree; | 726 | struct radix_tree_root *page_tree = &mapping->page_tree; |
665 | void *entry2, **slot, *kaddr; | 727 | void *entry2, **slot; |
666 | long ret = 0, id; | 728 | unsigned long pfn; |
667 | sector_t sector; | 729 | long ret = 0; |
668 | pgoff_t pgoff; | ||
669 | size_t size; | 730 | size_t size; |
670 | pfn_t pfn; | ||
671 | 731 | ||
672 | /* | 732 | /* |
673 | * A page got tagged dirty in DAX mapping? Something is seriously | 733 | * A page got tagged dirty in DAX mapping? Something is seriously |
@@ -683,10 +743,10 @@ static int dax_writeback_one(struct block_device *bdev, | |||
683 | goto put_unlocked; | 743 | goto put_unlocked; |
684 | /* | 744 | /* |
685 | * Entry got reallocated elsewhere? No need to writeback. We have to | 745 | * Entry got reallocated elsewhere? No need to writeback. We have to |
686 | * compare sectors as we must not bail out due to difference in lockbit | 746 | * compare pfns as we must not bail out due to difference in lockbit |
687 | * or entry type. | 747 | * or entry type. |
688 | */ | 748 | */ |
689 | if (dax_radix_sector(entry2) != dax_radix_sector(entry)) | 749 | if (dax_radix_pfn(entry2) != dax_radix_pfn(entry)) |
690 | goto put_unlocked; | 750 | goto put_unlocked; |
691 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || | 751 | if (WARN_ON_ONCE(dax_is_empty_entry(entry) || |
692 | dax_is_zero_entry(entry))) { | 752 | dax_is_zero_entry(entry))) { |
@@ -712,33 +772,15 @@ static int dax_writeback_one(struct block_device *bdev, | |||
712 | /* | 772 | /* |
713 | * Even if dax_writeback_mapping_range() was given a wbc->range_start | 773 | * Even if dax_writeback_mapping_range() was given a wbc->range_start |
714 | * in the middle of a PMD, the 'index' we are given will be aligned to | 774 | * in the middle of a PMD, the 'index' we are given will be aligned to |
715 | * the start index of the PMD, as will the sector we pull from | 775 | * the start index of the PMD, as will the pfn we pull from 'entry'. |
716 | * 'entry'. This allows us to flush for PMD_SIZE and not have to | 776 | * This allows us to flush for PMD_SIZE and not have to worry about |
717 | * worry about partial PMD writebacks. | 777 | * partial PMD writebacks. |
718 | */ | 778 | */ |
719 | sector = dax_radix_sector(entry); | 779 | pfn = dax_radix_pfn(entry); |
720 | size = PAGE_SIZE << dax_radix_order(entry); | 780 | size = PAGE_SIZE << dax_radix_order(entry); |
721 | 781 | ||
722 | id = dax_read_lock(); | 782 | dax_mapping_entry_mkclean(mapping, index, pfn); |
723 | ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); | 783 | dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size); |
724 | if (ret) | ||
725 | goto dax_unlock; | ||
726 | |||
727 | /* | ||
728 | * dax_direct_access() may sleep, so cannot hold tree_lock over | ||
729 | * its invocation. | ||
730 | */ | ||
731 | ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn); | ||
732 | if (ret < 0) | ||
733 | goto dax_unlock; | ||
734 | |||
735 | if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) { | ||
736 | ret = -EIO; | ||
737 | goto dax_unlock; | ||
738 | } | ||
739 | |||
740 | dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn)); | ||
741 | dax_flush(dax_dev, kaddr, size); | ||
742 | /* | 784 | /* |
743 | * After we have flushed the cache, we can clear the dirty tag. There | 785 | * After we have flushed the cache, we can clear the dirty tag. There |
744 | * cannot be new dirty data in the pfn after the flush has completed as | 786 | * cannot be new dirty data in the pfn after the flush has completed as |
@@ -749,8 +791,6 @@ static int dax_writeback_one(struct block_device *bdev, | |||
749 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); | 791 | radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); |
750 | spin_unlock_irq(&mapping->tree_lock); | 792 | spin_unlock_irq(&mapping->tree_lock); |
751 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); | 793 | trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT); |
752 | dax_unlock: | ||
753 | dax_read_unlock(id); | ||
754 | put_locked_mapping_entry(mapping, index); | 794 | put_locked_mapping_entry(mapping, index); |
755 | return ret; | 795 | return ret; |
756 | 796 | ||
@@ -808,8 +848,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, | |||
808 | break; | 848 | break; |
809 | } | 849 | } |
810 | 850 | ||
811 | ret = dax_writeback_one(bdev, dax_dev, mapping, | 851 | ret = dax_writeback_one(dax_dev, mapping, indices[i], |
812 | indices[i], pvec.pages[i]); | 852 | pvec.pages[i]); |
813 | if (ret < 0) { | 853 | if (ret < 0) { |
814 | mapping_set_error(mapping, ret); | 854 | mapping_set_error(mapping, ret); |
815 | goto out; | 855 | goto out; |
@@ -877,6 +917,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, | |||
877 | int ret = VM_FAULT_NOPAGE; | 917 | int ret = VM_FAULT_NOPAGE; |
878 | struct page *zero_page; | 918 | struct page *zero_page; |
879 | void *entry2; | 919 | void *entry2; |
920 | pfn_t pfn; | ||
880 | 921 | ||
881 | zero_page = ZERO_PAGE(0); | 922 | zero_page = ZERO_PAGE(0); |
882 | if (unlikely(!zero_page)) { | 923 | if (unlikely(!zero_page)) { |
@@ -884,14 +925,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry, | |||
884 | goto out; | 925 | goto out; |
885 | } | 926 | } |
886 | 927 | ||
887 | entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, | 928 | pfn = page_to_pfn_t(zero_page); |
929 | entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn, | ||
888 | RADIX_DAX_ZERO_PAGE, false); | 930 | RADIX_DAX_ZERO_PAGE, false); |
889 | if (IS_ERR(entry2)) { | 931 | if (IS_ERR(entry2)) { |
890 | ret = VM_FAULT_SIGBUS; | 932 | ret = VM_FAULT_SIGBUS; |
891 | goto out; | 933 | goto out; |
892 | } | 934 | } |
893 | 935 | ||
894 | vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page)); | 936 | vm_insert_mixed(vmf->vma, vaddr, pfn); |
895 | out: | 937 | out: |
896 | trace_dax_load_hole(inode, vmf, ret); | 938 | trace_dax_load_hole(inode, vmf, ret); |
897 | return ret; | 939 | return ret; |
@@ -1200,8 +1242,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
1200 | if (error < 0) | 1242 | if (error < 0) |
1201 | goto error_finish_iomap; | 1243 | goto error_finish_iomap; |
1202 | 1244 | ||
1203 | entry = dax_insert_mapping_entry(mapping, vmf, entry, | 1245 | entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
1204 | dax_iomap_sector(&iomap, pos), | ||
1205 | 0, write && !sync); | 1246 | 0, write && !sync); |
1206 | if (IS_ERR(entry)) { | 1247 | if (IS_ERR(entry)) { |
1207 | error = PTR_ERR(entry); | 1248 | error = PTR_ERR(entry); |
@@ -1280,13 +1321,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, | |||
1280 | void *ret = NULL; | 1321 | void *ret = NULL; |
1281 | spinlock_t *ptl; | 1322 | spinlock_t *ptl; |
1282 | pmd_t pmd_entry; | 1323 | pmd_t pmd_entry; |
1324 | pfn_t pfn; | ||
1283 | 1325 | ||
1284 | zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); | 1326 | zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); |
1285 | 1327 | ||
1286 | if (unlikely(!zero_page)) | 1328 | if (unlikely(!zero_page)) |
1287 | goto fallback; | 1329 | goto fallback; |
1288 | 1330 | ||
1289 | ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, | 1331 | pfn = page_to_pfn_t(zero_page); |
1332 | ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn, | ||
1290 | RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); | 1333 | RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false); |
1291 | if (IS_ERR(ret)) | 1334 | if (IS_ERR(ret)) |
1292 | goto fallback; | 1335 | goto fallback; |
@@ -1409,8 +1452,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, | |||
1409 | if (error < 0) | 1452 | if (error < 0) |
1410 | goto finish_iomap; | 1453 | goto finish_iomap; |
1411 | 1454 | ||
1412 | entry = dax_insert_mapping_entry(mapping, vmf, entry, | 1455 | entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn, |
1413 | dax_iomap_sector(&iomap, pos), | ||
1414 | RADIX_DAX_PMD, write && !sync); | 1456 | RADIX_DAX_PMD, write && !sync); |
1415 | if (IS_ERR(entry)) | 1457 | if (IS_ERR(entry)) |
1416 | goto finish_iomap; | 1458 | goto finish_iomap; |
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 032295e1d386..cc40802ddfa8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations; | |||
814 | extern const struct file_operations ext2_file_operations; | 814 | extern const struct file_operations ext2_file_operations; |
815 | 815 | ||
816 | /* inode.c */ | 816 | /* inode.c */ |
817 | extern void ext2_set_file_ops(struct inode *inode); | ||
817 | extern const struct address_space_operations ext2_aops; | 818 | extern const struct address_space_operations ext2_aops; |
818 | extern const struct address_space_operations ext2_nobh_aops; | 819 | extern const struct address_space_operations ext2_nobh_aops; |
819 | extern const struct iomap_ops ext2_iomap_ops; | 820 | extern const struct iomap_ops ext2_iomap_ops; |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9b2ac55ac34f..1e01fabef130 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) | |||
940 | loff_t offset = iocb->ki_pos; | 940 | loff_t offset = iocb->ki_pos; |
941 | ssize_t ret; | 941 | ssize_t ret; |
942 | 942 | ||
943 | if (WARN_ON_ONCE(IS_DAX(inode))) | ||
944 | return -EIO; | ||
945 | |||
946 | ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); | 943 | ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block); |
947 | if (ret < 0 && iov_iter_rw(iter) == WRITE) | 944 | if (ret < 0 && iov_iter_rw(iter) == WRITE) |
948 | ext2_write_failed(mapping, offset + count); | 945 | ext2_write_failed(mapping, offset + count); |
@@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) | |||
952 | static int | 949 | static int |
953 | ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) | 950 | ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) |
954 | { | 951 | { |
955 | #ifdef CONFIG_FS_DAX | ||
956 | if (dax_mapping(mapping)) { | ||
957 | return dax_writeback_mapping_range(mapping, | ||
958 | mapping->host->i_sb->s_bdev, | ||
959 | wbc); | ||
960 | } | ||
961 | #endif | ||
962 | |||
963 | return mpage_writepages(mapping, wbc, ext2_get_block); | 952 | return mpage_writepages(mapping, wbc, ext2_get_block); |
964 | } | 953 | } |
965 | 954 | ||
955 | static int | ||
956 | ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc) | ||
957 | { | ||
958 | return dax_writeback_mapping_range(mapping, | ||
959 | mapping->host->i_sb->s_bdev, wbc); | ||
960 | } | ||
961 | |||
966 | const struct address_space_operations ext2_aops = { | 962 | const struct address_space_operations ext2_aops = { |
967 | .readpage = ext2_readpage, | 963 | .readpage = ext2_readpage, |
968 | .readpages = ext2_readpages, | 964 | .readpages = ext2_readpages, |
@@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = { | |||
990 | .error_remove_page = generic_error_remove_page, | 986 | .error_remove_page = generic_error_remove_page, |
991 | }; | 987 | }; |
992 | 988 | ||
989 | static const struct address_space_operations ext2_dax_aops = { | ||
990 | .writepages = ext2_dax_writepages, | ||
991 | .direct_IO = noop_direct_IO, | ||
992 | .set_page_dirty = noop_set_page_dirty, | ||
993 | .invalidatepage = noop_invalidatepage, | ||
994 | }; | ||
995 | |||
993 | /* | 996 | /* |
994 | * Probably it should be a library function... search for first non-zero word | 997 | * Probably it should be a library function... search for first non-zero word |
995 | * or memcmp with zero_page, whatever is better for particular architecture. | 998 | * or memcmp with zero_page, whatever is better for particular architecture. |
@@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode) | |||
1388 | inode->i_flags |= S_DAX; | 1391 | inode->i_flags |= S_DAX; |
1389 | } | 1392 | } |
1390 | 1393 | ||
1394 | void ext2_set_file_ops(struct inode *inode) | ||
1395 | { | ||
1396 | inode->i_op = &ext2_file_inode_operations; | ||
1397 | inode->i_fop = &ext2_file_operations; | ||
1398 | if (IS_DAX(inode)) | ||
1399 | inode->i_mapping->a_ops = &ext2_dax_aops; | ||
1400 | else if (test_opt(inode->i_sb, NOBH)) | ||
1401 | inode->i_mapping->a_ops = &ext2_nobh_aops; | ||
1402 | else | ||
1403 | inode->i_mapping->a_ops = &ext2_aops; | ||
1404 | } | ||
1405 | |||
1391 | struct inode *ext2_iget (struct super_block *sb, unsigned long ino) | 1406 | struct inode *ext2_iget (struct super_block *sb, unsigned long ino) |
1392 | { | 1407 | { |
1393 | struct ext2_inode_info *ei; | 1408 | struct ext2_inode_info *ei; |
@@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) | |||
1480 | ei->i_data[n] = raw_inode->i_block[n]; | 1495 | ei->i_data[n] = raw_inode->i_block[n]; |
1481 | 1496 | ||
1482 | if (S_ISREG(inode->i_mode)) { | 1497 | if (S_ISREG(inode->i_mode)) { |
1483 | inode->i_op = &ext2_file_inode_operations; | 1498 | ext2_set_file_ops(inode); |
1484 | if (test_opt(inode->i_sb, NOBH)) { | ||
1485 | inode->i_mapping->a_ops = &ext2_nobh_aops; | ||
1486 | inode->i_fop = &ext2_file_operations; | ||
1487 | } else { | ||
1488 | inode->i_mapping->a_ops = &ext2_aops; | ||
1489 | inode->i_fop = &ext2_file_operations; | ||
1490 | } | ||
1491 | } else if (S_ISDIR(inode->i_mode)) { | 1499 | } else if (S_ISDIR(inode->i_mode)) { |
1492 | inode->i_op = &ext2_dir_inode_operations; | 1500 | inode->i_op = &ext2_dir_inode_operations; |
1493 | inode->i_fop = &ext2_dir_operations; | 1501 | inode->i_fop = &ext2_dir_operations; |
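[Editor's note, not part of the merge above] The inode.c hunks introduce ext2_set_file_ops(), so the DAX/NOBH/default address_space_operations choice lives in one helper that every creation path calls (the namei.c hunks below switch to it). A trivial userspace sketch of that "pick the ops table in one place" refactor follows; all demo_* names are invented.

/*
 * Centralized ops selection: flags on the object decide which operations
 * table it gets, and callers never open-code the choice.
 */
#include <stdio.h>

struct demo_aops {
	const char *name;
};

static const struct demo_aops demo_dax_aops  = { .name = "dax"  };
static const struct demo_aops demo_nobh_aops = { .name = "nobh" };
static const struct demo_aops demo_def_aops  = { .name = "default" };

struct demo_inode {
	unsigned int flags;
#define DEMO_DAX	(1u << 0)
#define DEMO_NOBH	(1u << 1)
	const struct demo_aops *a_ops;
};

static void demo_set_file_ops(struct demo_inode *inode)
{
	if (inode->flags & DEMO_DAX)
		inode->a_ops = &demo_dax_aops;
	else if (inode->flags & DEMO_NOBH)
		inode->a_ops = &demo_nobh_aops;
	else
		inode->a_ops = &demo_def_aops;
}

int main(void)
{
	struct demo_inode i = { .flags = DEMO_DAX };

	demo_set_file_ops(&i);
	printf("a_ops = %s\n", i.a_ops->name);
	return 0;
}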
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e078075dc66f..55f7caadb093 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode | |||
107 | if (IS_ERR(inode)) | 107 | if (IS_ERR(inode)) |
108 | return PTR_ERR(inode); | 108 | return PTR_ERR(inode); |
109 | 109 | ||
110 | inode->i_op = &ext2_file_inode_operations; | 110 | ext2_set_file_ops(inode); |
111 | if (test_opt(inode->i_sb, NOBH)) { | ||
112 | inode->i_mapping->a_ops = &ext2_nobh_aops; | ||
113 | inode->i_fop = &ext2_file_operations; | ||
114 | } else { | ||
115 | inode->i_mapping->a_ops = &ext2_aops; | ||
116 | inode->i_fop = &ext2_file_operations; | ||
117 | } | ||
118 | mark_inode_dirty(inode); | 111 | mark_inode_dirty(inode); |
119 | return ext2_add_nondir(dentry, inode); | 112 | return ext2_add_nondir(dentry, inode); |
120 | } | 113 | } |
@@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
125 | if (IS_ERR(inode)) | 118 | if (IS_ERR(inode)) |
126 | return PTR_ERR(inode); | 119 | return PTR_ERR(inode); |
127 | 120 | ||
128 | inode->i_op = &ext2_file_inode_operations; | 121 | ext2_set_file_ops(inode); |
129 | if (test_opt(inode->i_sb, NOBH)) { | ||
130 | inode->i_mapping->a_ops = &ext2_nobh_aops; | ||
131 | inode->i_fop = &ext2_file_operations; | ||
132 | } else { | ||
133 | inode->i_mapping->a_ops = &ext2_aops; | ||
134 | inode->i_fop = &ext2_file_operations; | ||
135 | } | ||
136 | mark_inode_dirty(inode); | 122 | mark_inode_dirty(inode); |
137 | d_tmpfile(dentry, inode); | 123 | d_tmpfile(dentry, inode); |
138 | unlock_new_inode(inode); | 124 | unlock_new_inode(inode); |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c94780075b04..249a97b19181 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2725,12 +2725,6 @@ static int ext4_writepages(struct address_space *mapping, | |||
2725 | percpu_down_read(&sbi->s_journal_flag_rwsem); | 2725 | percpu_down_read(&sbi->s_journal_flag_rwsem); |
2726 | trace_ext4_writepages(inode, wbc); | 2726 | trace_ext4_writepages(inode, wbc); |
2727 | 2727 | ||
2728 | if (dax_mapping(mapping)) { | ||
2729 | ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, | ||
2730 | wbc); | ||
2731 | goto out_writepages; | ||
2732 | } | ||
2733 | |||
2734 | /* | 2728 | /* |
2735 | * No pages to write? This is mainly a kludge to avoid starting | 2729 | * No pages to write? This is mainly a kludge to avoid starting |
2736 | * a transaction for special inodes like journal inode on last iput() | 2730 | * a transaction for special inodes like journal inode on last iput() |
@@ -2955,6 +2949,27 @@ out_writepages: | |||
2955 | return ret; | 2949 | return ret; |
2956 | } | 2950 | } |
2957 | 2951 | ||
2952 | static int ext4_dax_writepages(struct address_space *mapping, | ||
2953 | struct writeback_control *wbc) | ||
2954 | { | ||
2955 | int ret; | ||
2956 | long nr_to_write = wbc->nr_to_write; | ||
2957 | struct inode *inode = mapping->host; | ||
2958 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | ||
2959 | |||
2960 | if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) | ||
2961 | return -EIO; | ||
2962 | |||
2963 | percpu_down_read(&sbi->s_journal_flag_rwsem); | ||
2964 | trace_ext4_writepages(inode, wbc); | ||
2965 | |||
2966 | ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc); | ||
2967 | trace_ext4_writepages_result(inode, wbc, ret, | ||
2968 | nr_to_write - wbc->nr_to_write); | ||
2969 | percpu_up_read(&sbi->s_journal_flag_rwsem); | ||
2970 | return ret; | ||
2971 | } | ||
2972 | |||
2958 | static int ext4_nonda_switch(struct super_block *sb) | 2973 | static int ext4_nonda_switch(struct super_block *sb) |
2959 | { | 2974 | { |
2960 | s64 free_clusters, dirty_clusters; | 2975 | s64 free_clusters, dirty_clusters; |
@@ -3857,10 +3872,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) | |||
3857 | if (ext4_has_inline_data(inode)) | 3872 | if (ext4_has_inline_data(inode)) |
3858 | return 0; | 3873 | return 0; |
3859 | 3874 | ||
3860 | /* DAX uses iomap path now */ | ||
3861 | if (WARN_ON_ONCE(IS_DAX(inode))) | ||
3862 | return 0; | ||
3863 | |||
3864 | trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); | 3875 | trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); |
3865 | if (iov_iter_rw(iter) == READ) | 3876 | if (iov_iter_rw(iter) == READ) |
3866 | ret = ext4_direct_IO_read(iocb, iter); | 3877 | ret = ext4_direct_IO_read(iocb, iter); |
@@ -3946,6 +3957,13 @@ static const struct address_space_operations ext4_da_aops = { | |||
3946 | .error_remove_page = generic_error_remove_page, | 3957 | .error_remove_page = generic_error_remove_page, |
3947 | }; | 3958 | }; |
3948 | 3959 | ||
3960 | static const struct address_space_operations ext4_dax_aops = { | ||
3961 | .writepages = ext4_dax_writepages, | ||
3962 | .direct_IO = noop_direct_IO, | ||
3963 | .set_page_dirty = noop_set_page_dirty, | ||
3964 | .invalidatepage = noop_invalidatepage, | ||
3965 | }; | ||
3966 | |||
3949 | void ext4_set_aops(struct inode *inode) | 3967 | void ext4_set_aops(struct inode *inode) |
3950 | { | 3968 | { |
3951 | switch (ext4_inode_journal_mode(inode)) { | 3969 | switch (ext4_inode_journal_mode(inode)) { |
@@ -3958,7 +3976,9 @@ void ext4_set_aops(struct inode *inode) | |||
3958 | default: | 3976 | default: |
3959 | BUG(); | 3977 | BUG(); |
3960 | } | 3978 | } |
3961 | if (test_opt(inode->i_sb, DELALLOC)) | 3979 | if (IS_DAX(inode)) |
3980 | inode->i_mapping->a_ops = &ext4_dax_aops; | ||
3981 | else if (test_opt(inode->i_sb, DELALLOC)) | ||
3962 | inode->i_mapping->a_ops = &ext4_da_aops; | 3982 | inode->i_mapping->a_ops = &ext4_da_aops; |
3963 | else | 3983 | else |
3964 | inode->i_mapping->a_ops = &ext4_aops; | 3984 | inode->i_mapping->a_ops = &ext4_aops; |
diff --git a/fs/libfs.c b/fs/libfs.c
index 7ff3cb904acd..0fb590d79f30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) | |||
1060 | } | 1060 | } |
1061 | EXPORT_SYMBOL(noop_fsync); | 1061 | EXPORT_SYMBOL(noop_fsync); |
1062 | 1062 | ||
1063 | int noop_set_page_dirty(struct page *page) | ||
1064 | { | ||
1065 | /* | ||
1066 | * Unlike __set_page_dirty_no_writeback that handles dirty page | ||
1067 | * tracking in the page object, dax does all dirty tracking in | ||
1068 | * the inode address_space in response to mkwrite faults. In the | ||
1069 | * dax case we only need to worry about potentially dirty CPU | ||
1070 | * caches, not dirty page cache pages to write back. | ||
1071 | * | ||
1072 | * This callback is defined to prevent fallback to | ||
1073 | * __set_page_dirty_buffers() in set_page_dirty(). | ||
1074 | */ | ||
1075 | return 0; | ||
1076 | } | ||
1077 | EXPORT_SYMBOL_GPL(noop_set_page_dirty); | ||
1078 | |||
1079 | void noop_invalidatepage(struct page *page, unsigned int offset, | ||
1080 | unsigned int length) | ||
1081 | { | ||
1082 | /* | ||
1083 | * There is no page cache to invalidate in the dax case, however | ||
1084 | * we need this callback defined to prevent falling back to | ||
1085 | * block_invalidatepage() in do_invalidatepage(). | ||
1086 | */ | ||
1087 | } | ||
1088 | EXPORT_SYMBOL_GPL(noop_invalidatepage); | ||
1089 | |||
1090 | ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) | ||
1091 | { | ||
1092 | /* | ||
1093 | * iomap based filesystems support direct I/O without need for | ||
1094 | * this callback. However, it still needs to be set in | ||
1095 | * inode->a_ops so that open/fcntl know that direct I/O is | ||
1096 | * generally supported. | ||
1097 | */ | ||
1098 | return -EINVAL; | ||
1099 | } | ||
1100 | EXPORT_SYMBOL_GPL(noop_direct_IO); | ||
1101 | |||
1063 | /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ | 1102 | /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ |
1064 | void kfree_link(void *p) | 1103 | void kfree_link(void *p) |
1065 | { | 1104 | { |
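[Editor's note, not part of the merge above] The libfs.c hunk adds noop_set_page_dirty(), noop_invalidatepage(), and noop_direct_IO(): callbacks that do nothing themselves but, by being non-NULL, keep the core from falling back to generic buffer-head based handlers (and let open/fcntl accept O_DIRECT). A small sketch of why an explicit no-op changes behavior follows; the demo_* names are illustrative only.

/*
 * Fallback-suppression sketch: the caller uses a generic handler only when
 * the slot is NULL, so installing a do-nothing callback is meaningful.
 */
#include <stdio.h>

struct demo_aops {
	void (*invalidatepage)(void);
};

static void generic_invalidate(void)
{
	printf("generic (buffer-head based) invalidate ran\n");
}

static void noop_invalidate(void)
{
	/* nothing to do: no page cache state to tear down */
}

static void do_invalidate(const struct demo_aops *aops)
{
	if (aops->invalidatepage)
		aops->invalidatepage();	/* explicit no-op wins */
	else
		generic_invalidate();	/* fallback only when slot is NULL */
}

int main(void)
{
	struct demo_aops plain = { .invalidatepage = NULL };
	struct demo_aops dax   = { .invalidatepage = noop_invalidate };

	do_invalidate(&plain);	/* prints the generic message */
	do_invalidate(&dax);	/* silently does nothing */
	return 0;
}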
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9c6a830da0ee..e7a56c4786ff 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1194,16 +1194,22 @@ xfs_vm_writepages( | |||
1194 | int ret; | 1194 | int ret; |
1195 | 1195 | ||
1196 | xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); | 1196 | xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); |
1197 | if (dax_mapping(mapping)) | ||
1198 | return dax_writeback_mapping_range(mapping, | ||
1199 | xfs_find_bdev_for_inode(mapping->host), wbc); | ||
1200 | |||
1201 | ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); | 1197 | ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc); |
1202 | if (wpc.ioend) | 1198 | if (wpc.ioend) |
1203 | ret = xfs_submit_ioend(wbc, wpc.ioend, ret); | 1199 | ret = xfs_submit_ioend(wbc, wpc.ioend, ret); |
1204 | return ret; | 1200 | return ret; |
1205 | } | 1201 | } |
1206 | 1202 | ||
1203 | STATIC int | ||
1204 | xfs_dax_writepages( | ||
1205 | struct address_space *mapping, | ||
1206 | struct writeback_control *wbc) | ||
1207 | { | ||
1208 | xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); | ||
1209 | return dax_writeback_mapping_range(mapping, | ||
1210 | xfs_find_bdev_for_inode(mapping->host), wbc); | ||
1211 | } | ||
1212 | |||
1207 | /* | 1213 | /* |
1208 | * Called to move a page into cleanable state - and from there | 1214 | * Called to move a page into cleanable state - and from there |
1209 | * to be released. The page should already be clean. We always | 1215 | * to be released. The page should already be clean. We always |
@@ -1367,17 +1373,6 @@ out_unlock: | |||
1367 | return error; | 1373 | return error; |
1368 | } | 1374 | } |
1369 | 1375 | ||
1370 | STATIC ssize_t | ||
1371 | xfs_vm_direct_IO( | ||
1372 | struct kiocb *iocb, | ||
1373 | struct iov_iter *iter) | ||
1374 | { | ||
1375 | /* | ||
1376 | * We just need the method present so that open/fcntl allow direct I/O. | ||
1377 | */ | ||
1378 | return -EINVAL; | ||
1379 | } | ||
1380 | |||
1381 | STATIC sector_t | 1376 | STATIC sector_t |
1382 | xfs_vm_bmap( | 1377 | xfs_vm_bmap( |
1383 | struct address_space *mapping, | 1378 | struct address_space *mapping, |
@@ -1500,8 +1495,15 @@ const struct address_space_operations xfs_address_space_operations = { | |||
1500 | .releasepage = xfs_vm_releasepage, | 1495 | .releasepage = xfs_vm_releasepage, |
1501 | .invalidatepage = xfs_vm_invalidatepage, | 1496 | .invalidatepage = xfs_vm_invalidatepage, |
1502 | .bmap = xfs_vm_bmap, | 1497 | .bmap = xfs_vm_bmap, |
1503 | .direct_IO = xfs_vm_direct_IO, | 1498 | .direct_IO = noop_direct_IO, |
1504 | .migratepage = buffer_migrate_page, | 1499 | .migratepage = buffer_migrate_page, |
1505 | .is_partially_uptodate = block_is_partially_uptodate, | 1500 | .is_partially_uptodate = block_is_partially_uptodate, |
1506 | .error_remove_page = generic_error_remove_page, | 1501 | .error_remove_page = generic_error_remove_page, |
1507 | }; | 1502 | }; |
1503 | |||
1504 | const struct address_space_operations xfs_dax_aops = { | ||
1505 | .writepages = xfs_dax_writepages, | ||
1506 | .direct_IO = noop_direct_IO, | ||
1507 | .set_page_dirty = noop_set_page_dirty, | ||
1508 | .invalidatepage = noop_invalidatepage, | ||
1509 | }; | ||
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 88c85ea63da0..69346d460dfa 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -54,6 +54,7 @@ struct xfs_ioend { | |||
54 | }; | 54 | }; |
55 | 55 | ||
56 | extern const struct address_space_operations xfs_address_space_operations; | 56 | extern const struct address_space_operations xfs_address_space_operations; |
57 | extern const struct address_space_operations xfs_dax_aops; | ||
57 | 58 | ||
58 | int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); | 59 | int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); |
59 | 60 | ||
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 56475fcd76f2..951e84df5576 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1272,7 +1272,10 @@ xfs_setup_iops( | |||
1272 | case S_IFREG: | 1272 | case S_IFREG: |
1273 | inode->i_op = &xfs_inode_operations; | 1273 | inode->i_op = &xfs_inode_operations; |
1274 | inode->i_fop = &xfs_file_operations; | 1274 | inode->i_fop = &xfs_file_operations; |
1275 | inode->i_mapping->a_ops = &xfs_address_space_operations; | 1275 | if (IS_DAX(inode)) |
1276 | inode->i_mapping->a_ops = &xfs_dax_aops; | ||
1277 | else | ||
1278 | inode->i_mapping->a_ops = &xfs_address_space_operations; | ||
1276 | break; | 1279 | break; |
1277 | case S_IFDIR: | 1280 | case S_IFDIR: |
1278 | if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) | 1281 | if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb)) |
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0185ecdae135..f9eb22ad341e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -26,18 +26,42 @@ extern struct attribute_group dax_attribute_group; | |||
26 | 26 | ||
27 | #if IS_ENABLED(CONFIG_DAX) | 27 | #if IS_ENABLED(CONFIG_DAX) |
28 | struct dax_device *dax_get_by_host(const char *host); | 28 | struct dax_device *dax_get_by_host(const char *host); |
29 | struct dax_device *alloc_dax(void *private, const char *host, | ||
30 | const struct dax_operations *ops); | ||
29 | void put_dax(struct dax_device *dax_dev); | 31 | void put_dax(struct dax_device *dax_dev); |
32 | void kill_dax(struct dax_device *dax_dev); | ||
33 | void dax_write_cache(struct dax_device *dax_dev, bool wc); | ||
34 | bool dax_write_cache_enabled(struct dax_device *dax_dev); | ||
30 | #else | 35 | #else |
31 | static inline struct dax_device *dax_get_by_host(const char *host) | 36 | static inline struct dax_device *dax_get_by_host(const char *host) |
32 | { | 37 | { |
33 | return NULL; | 38 | return NULL; |
34 | } | 39 | } |
35 | 40 | static inline struct dax_device *alloc_dax(void *private, const char *host, | |
41 | const struct dax_operations *ops) | ||
42 | { | ||
43 | /* | ||
44 | * Callers should check IS_ENABLED(CONFIG_DAX) to know if this | ||
45 | * NULL is an error or expected. | ||
46 | */ | ||
47 | return NULL; | ||
48 | } | ||
36 | static inline void put_dax(struct dax_device *dax_dev) | 49 | static inline void put_dax(struct dax_device *dax_dev) |
37 | { | 50 | { |
38 | } | 51 | } |
52 | static inline void kill_dax(struct dax_device *dax_dev) | ||
53 | { | ||
54 | } | ||
55 | static inline void dax_write_cache(struct dax_device *dax_dev, bool wc) | ||
56 | { | ||
57 | } | ||
58 | static inline bool dax_write_cache_enabled(struct dax_device *dax_dev) | ||
59 | { | ||
60 | return false; | ||
61 | } | ||
39 | #endif | 62 | #endif |
40 | 63 | ||
64 | struct writeback_control; | ||
41 | int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); | 65 | int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff); |
42 | #if IS_ENABLED(CONFIG_FS_DAX) | 66 | #if IS_ENABLED(CONFIG_FS_DAX) |
43 | int __bdev_dax_supported(struct super_block *sb, int blocksize); | 67 | int __bdev_dax_supported(struct super_block *sb, int blocksize); |
@@ -57,6 +81,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev) | |||
57 | } | 81 | } |
58 | 82 | ||
59 | struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); | 83 | struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev); |
84 | int dax_writeback_mapping_range(struct address_space *mapping, | ||
85 | struct block_device *bdev, struct writeback_control *wbc); | ||
60 | #else | 86 | #else |
61 | static inline int bdev_dax_supported(struct super_block *sb, int blocksize) | 87 | static inline int bdev_dax_supported(struct super_block *sb, int blocksize) |
62 | { | 88 | { |
@@ -76,22 +102,23 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev) | |||
76 | { | 102 | { |
77 | return NULL; | 103 | return NULL; |
78 | } | 104 | } |
105 | |||
106 | static inline int dax_writeback_mapping_range(struct address_space *mapping, | ||
107 | struct block_device *bdev, struct writeback_control *wbc) | ||
108 | { | ||
109 | return -EOPNOTSUPP; | ||
110 | } | ||
79 | #endif | 111 | #endif |
80 | 112 | ||
81 | int dax_read_lock(void); | 113 | int dax_read_lock(void); |
82 | void dax_read_unlock(int id); | 114 | void dax_read_unlock(int id); |
83 | struct dax_device *alloc_dax(void *private, const char *host, | ||
84 | const struct dax_operations *ops); | ||
85 | bool dax_alive(struct dax_device *dax_dev); | 115 | bool dax_alive(struct dax_device *dax_dev); |
86 | void kill_dax(struct dax_device *dax_dev); | ||
87 | void *dax_get_private(struct dax_device *dax_dev); | 116 | void *dax_get_private(struct dax_device *dax_dev); |
88 | long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, | 117 | long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages, |
89 | void **kaddr, pfn_t *pfn); | 118 | void **kaddr, pfn_t *pfn); |
90 | size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, | 119 | size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr, |
91 | size_t bytes, struct iov_iter *i); | 120 | size_t bytes, struct iov_iter *i); |
92 | void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); | 121 | void dax_flush(struct dax_device *dax_dev, void *addr, size_t size); |
93 | void dax_write_cache(struct dax_device *dax_dev, bool wc); | ||
94 | bool dax_write_cache_enabled(struct dax_device *dax_dev); | ||
95 | 122 | ||
96 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, | 123 | ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, |
97 | const struct iomap_ops *ops); | 124 | const struct iomap_ops *ops); |
@@ -121,7 +148,4 @@ static inline bool dax_mapping(struct address_space *mapping) | |||
121 | return mapping->host && IS_DAX(mapping->host); | 148 | return mapping->host && IS_DAX(mapping->host); |
122 | } | 149 | } |
123 | 150 | ||
124 | struct writeback_control; | ||
125 | int dax_writeback_mapping_range(struct address_space *mapping, | ||
126 | struct block_device *bdev, struct writeback_control *wbc); | ||
127 | #endif | 151 | #endif |
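[Editor's note, not part of the merge above] The dax.h hunk moves alloc_dax(), kill_dax(), and the write-cache helpers under the CONFIG_DAX #if and supplies empty static inline stubs in the #else branch, so callers compile without their own #ifdefs and dead calls vanish. A single-file sketch of that header-stub idiom follows; DEMO_CONFIG_FEATURE and the demo_* functions are invented for illustration.

/*
 * Header stub idiom: real prototypes when the option is on, inert static
 * inline stubs otherwise. Here the option is "off" so the stubs are used.
 */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_CONFIG_FEATURE 0		/* pretend the option is disabled */

#if DEMO_CONFIG_FEATURE
void demo_write_cache(bool wc);		/* provided by the real driver */
bool demo_write_cache_enabled(void);
#else
static inline void demo_write_cache(bool wc)
{
	/* feature compiled out: silently ignore */
	(void)wc;
}
static inline bool demo_write_cache_enabled(void)
{
	return false;
}
#endif

int main(void)
{
	demo_write_cache(true);		/* compiles either way */
	printf("write cache enabled: %d\n", demo_write_cache_enabled());
	return 0;
}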
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6baf767619e..a3bb2aedbc2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3130,6 +3130,10 @@ extern int simple_rmdir(struct inode *, struct dentry *); | |||
3130 | extern int simple_rename(struct inode *, struct dentry *, | 3130 | extern int simple_rename(struct inode *, struct dentry *, |
3131 | struct inode *, struct dentry *, unsigned int); | 3131 | struct inode *, struct dentry *, unsigned int); |
3132 | extern int noop_fsync(struct file *, loff_t, loff_t, int); | 3132 | extern int noop_fsync(struct file *, loff_t, loff_t, int); |
3133 | extern int noop_set_page_dirty(struct page *page); | ||
3134 | extern void noop_invalidatepage(struct page *page, unsigned int offset, | ||
3135 | unsigned int length); | ||
3136 | extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter); | ||
3133 | extern int simple_empty(struct dentry *); | 3137 | extern int simple_empty(struct dentry *); |
3134 | extern int simple_readpage(struct file *file, struct page *page); | 3138 | extern int simple_readpage(struct file *file, struct page *page); |
3135 | extern int simple_write_begin(struct file *file, struct address_space *mapping, | 3139 | extern int simple_write_begin(struct file *file, struct address_space *mapping, |
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_SCHED_DEADLINE_H | ||
3 | #define _LINUX_SCHED_DEADLINE_H | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | 2 | ||
7 | /* | 3 | /* |
8 | * SCHED_DEADLINE tasks has negative priorities, reflecting | 4 | * SCHED_DEADLINE tasks has negative priorities, reflecting |
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b) | |||
28 | { | 24 | { |
29 | return (s64)(a - b) < 0; | 25 | return (s64)(a - b) < 0; |
30 | } | 26 | } |
31 | |||
32 | #endif /* _LINUX_SCHED_DEADLINE_H */ | ||
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags { | |||
12 | HK_FLAG_SCHED = (1 << 3), | 12 | HK_FLAG_SCHED = (1 << 3), |
13 | HK_FLAG_TICK = (1 << 4), | 13 | HK_FLAG_TICK = (1 << 4), |
14 | HK_FLAG_DOMAIN = (1 << 5), | 14 | HK_FLAG_DOMAIN = (1 << 5), |
15 | HK_FLAG_WQ = (1 << 6), | ||
15 | }; | 16 | }; |
16 | 17 | ||
17 | #ifdef CONFIG_CPU_ISOLATION | 18 | #ifdef CONFIG_CPU_ISOLATION |
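Editor's note: HK_FLAG_WQ feeds the same housekeeping API as the existing flags. As a hedged sketch (example_queue_on_housekeeping() is a made-up illustration, not a kernel symbol), a subsystem that wants its deferred work kept off isolated CPUs could do:

#include <linux/sched/isolation.h>
#include <linux/workqueue.h>

/* Queue deferred work on a CPU reserved for housekeeping duties,
 * leaving isolated/nohz_full CPUs undisturbed. */
static void example_queue_on_housekeeping(struct work_struct *work)
{
	int cpu = housekeeping_any_cpu(HK_FLAG_WQ);

	queue_work_on(cpu, system_wq, work);
}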
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h index 3d3a97d9399d..094217273ff9 100644 --- a/include/linux/sched/nohz.h +++ b/include/linux/sched/nohz.h | |||
@@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu); | |||
37 | static inline void wake_up_nohz_cpu(int cpu) { } | 37 | static inline void wake_up_nohz_cpu(int cpu) { } |
38 | #endif | 38 | #endif |
39 | 39 | ||
40 | #ifdef CONFIG_NO_HZ_FULL | ||
41 | extern u64 scheduler_tick_max_deferment(void); | ||
42 | #endif | ||
43 | |||
44 | #endif /* _LINUX_SCHED_NOHZ_H */ | 40 | #endif /* _LINUX_SCHED_NOHZ_H */ |
diff --git a/include/linux/tick.h b/include/linux/tick.h index 7cc35921218e..7f8c9a127f5a 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h | |||
@@ -113,7 +113,8 @@ enum tick_dep_bits { | |||
113 | 113 | ||
114 | #ifdef CONFIG_NO_HZ_COMMON | 114 | #ifdef CONFIG_NO_HZ_COMMON |
115 | extern bool tick_nohz_enabled; | 115 | extern bool tick_nohz_enabled; |
116 | extern int tick_nohz_tick_stopped(void); | 116 | extern bool tick_nohz_tick_stopped(void); |
117 | extern bool tick_nohz_tick_stopped_cpu(int cpu); | ||
117 | extern void tick_nohz_idle_enter(void); | 118 | extern void tick_nohz_idle_enter(void); |
118 | extern void tick_nohz_idle_exit(void); | 119 | extern void tick_nohz_idle_exit(void); |
119 | extern void tick_nohz_irq_exit(void); | 120 | extern void tick_nohz_irq_exit(void); |
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time); | |||
125 | #else /* !CONFIG_NO_HZ_COMMON */ | 126 | #else /* !CONFIG_NO_HZ_COMMON */ |
126 | #define tick_nohz_enabled (0) | 127 | #define tick_nohz_enabled (0) |
127 | static inline int tick_nohz_tick_stopped(void) { return 0; } | 128 | static inline int tick_nohz_tick_stopped(void) { return 0; } |
129 | static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; } | ||
128 | static inline void tick_nohz_idle_enter(void) { } | 130 | static inline void tick_nohz_idle_enter(void) { } |
129 | static inline void tick_nohz_idle_exit(void) { } | 131 | static inline void tick_nohz_idle_exit(void) { } |
130 | 132 | ||
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h index 61b39eaf7cad..3fcdb75d69cf 100644 --- a/include/linux/wait_bit.h +++ b/include/linux/wait_bit.h | |||
@@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode | |||
262 | return out_of_line_wait_on_atomic_t(val, action, mode); | 262 | return out_of_line_wait_on_atomic_t(val, action, mode); |
263 | } | 263 | } |
264 | 264 | ||
265 | extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags); | ||
266 | extern void wake_up_var(void *var); | ||
267 | extern wait_queue_head_t *__var_waitqueue(void *p); | ||
268 | |||
269 | #define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \ | ||
270 | ({ \ | ||
271 | __label__ __out; \ | ||
272 | struct wait_queue_head *__wq_head = __var_waitqueue(var); \ | ||
273 | struct wait_bit_queue_entry __wbq_entry; \ | ||
274 | long __ret = ret; /* explicit shadow */ \ | ||
275 | \ | ||
276 | init_wait_var_entry(&__wbq_entry, var, \ | ||
277 | exclusive ? WQ_FLAG_EXCLUSIVE : 0); \ | ||
278 | for (;;) { \ | ||
279 | long __int = prepare_to_wait_event(__wq_head, \ | ||
280 | &__wbq_entry.wq_entry, \ | ||
281 | state); \ | ||
282 | if (condition) \ | ||
283 | break; \ | ||
284 | \ | ||
285 | if (___wait_is_interruptible(state) && __int) { \ | ||
286 | __ret = __int; \ | ||
287 | goto __out; \ | ||
288 | } \ | ||
289 | \ | ||
290 | cmd; \ | ||
291 | } \ | ||
292 | finish_wait(__wq_head, &__wbq_entry.wq_entry); \ | ||
293 | __out: __ret; \ | ||
294 | }) | ||
295 | |||
296 | #define __wait_var_event(var, condition) \ | ||
297 | ___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \ | ||
298 | schedule()) | ||
299 | |||
300 | #define wait_var_event(var, condition) \ | ||
301 | do { \ | ||
302 | might_sleep(); \ | ||
303 | if (condition) \ | ||
304 | break; \ | ||
305 | __wait_var_event(var, condition); \ | ||
306 | } while (0) | ||
307 | |||
308 | #define __wait_var_event_killable(var, condition) \ | ||
309 | ___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \ | ||
310 | schedule()) | ||
311 | |||
312 | #define wait_var_event_killable(var, condition) \ | ||
313 | ({ \ | ||
314 | int __ret = 0; \ | ||
315 | might_sleep(); \ | ||
316 | if (!(condition)) \ | ||
317 | __ret = __wait_var_event_killable(var, condition); \ | ||
318 | __ret; \ | ||
319 | }) | ||
320 | |||
321 | #define __wait_var_event_timeout(var, condition, timeout) \ | ||
322 | ___wait_var_event(var, ___wait_cond_timeout(condition), \ | ||
323 | TASK_UNINTERRUPTIBLE, 0, timeout, \ | ||
324 | __ret = schedule_timeout(__ret)) | ||
325 | |||
326 | #define wait_var_event_timeout(var, condition, timeout) \ | ||
327 | ({ \ | ||
328 | long __ret = timeout; \ | ||
329 | might_sleep(); \ | ||
330 | if (!___wait_cond_timeout(condition)) \ | ||
331 | __ret = __wait_var_event_timeout(var, condition, timeout); \ | ||
332 | __ret; \ | ||
333 | }) | ||
334 | |||
265 | #endif /* _LINUX_WAIT_BIT_H */ | 335 | #endif /* _LINUX_WAIT_BIT_H */ |
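Editor's note: the hunk above adds the wait_var_event()/wake_up_var() family, which lets callers sleep on the address of an arbitrary variable. A hedged usage sketch (struct foo and its helpers are hypothetical, not from this hunk): wait for an atomic reference count to drain, with the final put issuing the wakeup.

#include <linux/wait_bit.h>
#include <linux/atomic.h>

struct foo {
	atomic_t users;
};

static void foo_put(struct foo *f)
{
	/* The last reference wakes anyone sleeping on &f->users. */
	if (atomic_dec_and_test(&f->users))
		wake_up_var(&f->users);
}

static void foo_drain(struct foo *f)
{
	/* Sleep until all references have been dropped. */
	wait_var_event(&f->users, atomic_read(&f->users) == 0);
}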
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index e2f9d4feff40..d9a02b318108 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile | |||
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer | |||
17 | endif | 17 | endif |
18 | 18 | ||
19 | obj-y += core.o loadavg.o clock.o cputime.o | 19 | obj-y += core.o loadavg.o clock.o cputime.o |
20 | obj-y += idle_task.o fair.o rt.o deadline.o | 20 | obj-y += idle.o fair.o rt.o deadline.o |
21 | obj-y += wait.o wait_bit.o swait.o completion.o idle.o | 21 | obj-y += wait.o wait_bit.o swait.o completion.o |
22 | |||
22 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o | 23 | obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o |
23 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o | 24 | obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o |
24 | obj-$(CONFIG_SCHEDSTATS) += stats.o | 25 | obj-$(CONFIG_SCHEDSTATS) += stats.o |
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c index bb4b9fe026a1..6be6c575b6cd 100644 --- a/kernel/sched/autogroup.c +++ b/kernel/sched/autogroup.c | |||
@@ -1,10 +1,7 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/proc_fs.h> | 2 | /* |
3 | #include <linux/seq_file.h> | 3 | * Auto-group scheduling implementation: |
4 | #include <linux/utsname.h> | 4 | */ |
5 | #include <linux/security.h> | ||
6 | #include <linux/export.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; | 7 | unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; |
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag) | |||
168 | autogroup_kref_put(prev); | 165 | autogroup_kref_put(prev); |
169 | } | 166 | } |
170 | 167 | ||
171 | /* Allocates GFP_KERNEL, cannot be called under any spinlock */ | 168 | /* Allocates GFP_KERNEL, cannot be called under any spinlock: */ |
172 | void sched_autogroup_create_attach(struct task_struct *p) | 169 | void sched_autogroup_create_attach(struct task_struct *p) |
173 | { | 170 | { |
174 | struct autogroup *ag = autogroup_create(); | 171 | struct autogroup *ag = autogroup_create(); |
175 | 172 | ||
176 | autogroup_move_group(p, ag); | 173 | autogroup_move_group(p, ag); |
177 | /* drop extra reference added by autogroup_create() */ | 174 | |
175 | /* Drop extra reference added by autogroup_create(): */ | ||
178 | autogroup_kref_put(ag); | 176 | autogroup_kref_put(ag); |
179 | } | 177 | } |
180 | EXPORT_SYMBOL(sched_autogroup_create_attach); | 178 | EXPORT_SYMBOL(sched_autogroup_create_attach); |
181 | 179 | ||
182 | /* Cannot be called under siglock. Currently has no users */ | 180 | /* Cannot be called under siglock. Currently has no users: */ |
183 | void sched_autogroup_detach(struct task_struct *p) | 181 | void sched_autogroup_detach(struct task_struct *p) |
184 | { | 182 | { |
185 | autogroup_move_group(p, &autogroup_default); | 183 | autogroup_move_group(p, &autogroup_default); |
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str) | |||
202 | 200 | ||
203 | return 1; | 201 | return 1; |
204 | } | 202 | } |
205 | |||
206 | __setup("noautogroup", setup_autogroup); | 203 | __setup("noautogroup", setup_autogroup); |
207 | 204 | ||
208 | #ifdef CONFIG_PROC_FS | 205 | #ifdef CONFIG_PROC_FS |
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) | |||
224 | if (nice < 0 && !can_nice(current, nice)) | 221 | if (nice < 0 && !can_nice(current, nice)) |
225 | return -EPERM; | 222 | return -EPERM; |
226 | 223 | ||
227 | /* this is a heavy operation taking global locks.. */ | 224 | /* This is a heavy operation, taking global locks.. */ |
228 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) | 225 | if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) |
229 | return -EAGAIN; | 226 | return -EAGAIN; |
230 | 227 | ||
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen) | |||
267 | 264 | ||
268 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); | 265 | return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); |
269 | } | 266 | } |
270 | #endif /* CONFIG_SCHED_DEBUG */ | 267 | #endif |
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h index 27cd22b89824..b96419974a1f 100644 --- a/kernel/sched/autogroup.h +++ b/kernel/sched/autogroup.h | |||
@@ -1,15 +1,11 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifdef CONFIG_SCHED_AUTOGROUP | 2 | #ifdef CONFIG_SCHED_AUTOGROUP |
3 | 3 | ||
4 | #include <linux/kref.h> | ||
5 | #include <linux/rwsem.h> | ||
6 | #include <linux/sched/autogroup.h> | ||
7 | |||
8 | struct autogroup { | 4 | struct autogroup { |
9 | /* | 5 | /* |
10 | * reference doesn't mean how many thread attach to this | 6 | * Reference doesn't mean how many threads attach to this |
11 | * autogroup now. It just stands for the number of task | 7 | * autogroup now. It just stands for the number of tasks |
12 | * could use this autogroup. | 8 | * which could use this autogroup. |
13 | */ | 9 | */ |
14 | struct kref kref; | 10 | struct kref kref; |
15 | struct task_group *tg; | 11 | struct task_group *tg; |
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg) | |||
56 | return tg; | 52 | return tg; |
57 | } | 53 | } |
58 | 54 | ||
59 | #ifdef CONFIG_SCHED_DEBUG | ||
60 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) | 55 | static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) |
61 | { | 56 | { |
62 | return 0; | 57 | return 0; |
63 | } | 58 | } |
64 | #endif | ||
65 | 59 | ||
66 | #endif /* CONFIG_SCHED_AUTOGROUP */ | 60 | #endif /* CONFIG_SCHED_AUTOGROUP */ |
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index e086babe6c61..10c83e73837a 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * sched_clock for unstable cpu clocks | 2 | * sched_clock() for unstable CPU clocks |
3 | * | 3 | * |
4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra | 4 | * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra |
5 | * | 5 | * |
@@ -11,7 +11,7 @@ | |||
11 | * Guillaume Chazarain <guichaz@gmail.com> | 11 | * Guillaume Chazarain <guichaz@gmail.com> |
12 | * | 12 | * |
13 | * | 13 | * |
14 | * What: | 14 | * What this file implements: |
15 | * | 15 | * |
16 | * cpu_clock(i) provides a fast (execution time) high resolution | 16 | * cpu_clock(i) provides a fast (execution time) high resolution |
17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) | 17 | * clock with bounded drift between CPUs. The value of cpu_clock(i) |
@@ -26,11 +26,11 @@ | |||
26 | * at 0 on boot (but people really shouldn't rely on that). | 26 | * at 0 on boot (but people really shouldn't rely on that). |
27 | * | 27 | * |
28 | * cpu_clock(i) -- can be used from any context, including NMI. | 28 | * cpu_clock(i) -- can be used from any context, including NMI. |
29 | * local_clock() -- is cpu_clock() on the current cpu. | 29 | * local_clock() -- is cpu_clock() on the current CPU. |
30 | * | 30 | * |
31 | * sched_clock_cpu(i) | 31 | * sched_clock_cpu(i) |
32 | * | 32 | * |
33 | * How: | 33 | * How it is implemented: |
34 | * | 34 | * |
35 | * The implementation either uses sched_clock() when | 35 | * The implementation either uses sched_clock() when |
36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the | 36 | * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the |
@@ -52,19 +52,7 @@ | |||
52 | * that is otherwise invisible (TSC gets stopped). | 52 | * that is otherwise invisible (TSC gets stopped). |
53 | * | 53 | * |
54 | */ | 54 | */ |
55 | #include <linux/spinlock.h> | 55 | #include "sched.h" |
56 | #include <linux/hardirq.h> | ||
57 | #include <linux/export.h> | ||
58 | #include <linux/percpu.h> | ||
59 | #include <linux/ktime.h> | ||
60 | #include <linux/sched.h> | ||
61 | #include <linux/nmi.h> | ||
62 | #include <linux/sched/clock.h> | ||
63 | #include <linux/static_key.h> | ||
64 | #include <linux/workqueue.h> | ||
65 | #include <linux/compiler.h> | ||
66 | #include <linux/tick.h> | ||
67 | #include <linux/init.h> | ||
68 | 56 | ||
69 | /* | 57 | /* |
70 | * Scheduler clock - returns current time in nanosec units. | 58 | * Scheduler clock - returns current time in nanosec units. |
@@ -302,21 +290,21 @@ again: | |||
302 | * cmpxchg64 below only protects one readout. | 290 | * cmpxchg64 below only protects one readout. |
303 | * | 291 | * |
304 | * We must reread via sched_clock_local() in the retry case on | 292 | * We must reread via sched_clock_local() in the retry case on |
305 | * 32bit as an NMI could use sched_clock_local() via the | 293 | * 32-bit kernels as an NMI could use sched_clock_local() via the |
306 | * tracer and hit between the readout of | 294 | * tracer and hit between the readout of |
307 | * the low32bit and the high 32bit portion. | 295 | * the low 32-bit and the high 32-bit portion. |
308 | */ | 296 | */ |
309 | this_clock = sched_clock_local(my_scd); | 297 | this_clock = sched_clock_local(my_scd); |
310 | /* | 298 | /* |
311 | * We must enforce atomic readout on 32bit, otherwise the | 299 | * We must enforce atomic readout on 32-bit, otherwise the |
312 | * update on the remote cpu can hit in between the readout of | 300 | * update on the remote CPU can hit in between the readout of |
313 | * the low32bit and the high 32bit portion. | 301 | * the low 32-bit and the high 32-bit portion. |
314 | */ | 302 | */ |
315 | remote_clock = cmpxchg64(&scd->clock, 0, 0); | 303 | remote_clock = cmpxchg64(&scd->clock, 0, 0); |
316 | #else | 304 | #else |
317 | /* | 305 | /* |
318 | * On 64bit the read of [my]scd->clock is atomic versus the | 306 | * On 64-bit kernels the read of [my]scd->clock is atomic versus the |
319 | * update, so we can avoid the above 32bit dance. | 307 | * update, so we can avoid the above 32-bit dance. |
320 | */ | 308 | */ |
321 | sched_clock_local(my_scd); | 309 | sched_clock_local(my_scd); |
322 | again: | 310 | again: |
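Editor's note: as a hedged illustration of the idiom the comments above rely on, on 32-bit kernels a 64-bit load is not atomic, so cmpxchg64(&var, 0, 0) is used purely for its atomic read (it only stores when the value is already 0, and then it stores 0 again). The example_ names below are placeholders, not from this file.

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/compiler.h>

static u64 example_clock;

static u64 example_read_u64_atomic(void)
{
#if BITS_PER_LONG == 32
	/* Atomic 64-bit read via a no-op compare-and-exchange. */
	return cmpxchg64(&example_clock, 0, 0);
#else
	/* Plain 64-bit loads are already atomic here. */
	return READ_ONCE(example_clock);
#endif
}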
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c index 0926aef10dad..5d2d56b0817a 100644 --- a/kernel/sched/completion.c +++ b/kernel/sched/completion.c | |||
@@ -11,10 +11,7 @@ | |||
11 | * typically be used for exclusion which gives rise to priority inversion. | 11 | * typically be used for exclusion which gives rise to priority inversion. |
12 | * Waiting for completion is typically a sync point, but not an exclusion point. | 12 | * Waiting for completion is typically a sync point, but not an exclusion point. |
13 | */ | 13 | */ |
14 | 14 | #include "sched.h" | |
15 | #include <linux/sched/signal.h> | ||
16 | #include <linux/sched/debug.h> | ||
17 | #include <linux/completion.h> | ||
18 | 15 | ||
19 | /** | 16 | /** |
20 | * complete: - signals a single thread waiting on this completion | 17 | * complete: - signals a single thread waiting on this completion |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c94895bc5a2c..74e750ffe64f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c | |||
@@ -5,37 +5,11 @@ | |||
5 | * | 5 | * |
6 | * Copyright (C) 1991-2002 Linus Torvalds | 6 | * Copyright (C) 1991-2002 Linus Torvalds |
7 | */ | 7 | */ |
8 | #include <linux/sched.h> | 8 | #include "sched.h" |
9 | #include <linux/sched/clock.h> | ||
10 | #include <uapi/linux/sched/types.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | #include <linux/sched/hotplug.h> | ||
13 | #include <linux/wait_bit.h> | ||
14 | #include <linux/cpuset.h> | ||
15 | #include <linux/delayacct.h> | ||
16 | #include <linux/init_task.h> | ||
17 | #include <linux/context_tracking.h> | ||
18 | #include <linux/rcupdate_wait.h> | ||
19 | #include <linux/compat.h> | ||
20 | |||
21 | #include <linux/blkdev.h> | ||
22 | #include <linux/kprobes.h> | ||
23 | #include <linux/mmu_context.h> | ||
24 | #include <linux/module.h> | ||
25 | #include <linux/nmi.h> | ||
26 | #include <linux/prefetch.h> | ||
27 | #include <linux/profile.h> | ||
28 | #include <linux/security.h> | ||
29 | #include <linux/syscalls.h> | ||
30 | #include <linux/sched/isolation.h> | ||
31 | 9 | ||
32 | #include <asm/switch_to.h> | 10 | #include <asm/switch_to.h> |
33 | #include <asm/tlb.h> | 11 | #include <asm/tlb.h> |
34 | #ifdef CONFIG_PARAVIRT | ||
35 | #include <asm/paravirt.h> | ||
36 | #endif | ||
37 | 12 | ||
38 | #include "sched.h" | ||
39 | #include "../workqueue_internal.h" | 13 | #include "../workqueue_internal.h" |
40 | #include "../smpboot.h" | 14 | #include "../smpboot.h" |
41 | 15 | ||
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf) | |||
135 | * [L] ->on_rq | 109 | * [L] ->on_rq |
136 | * RELEASE (rq->lock) | 110 | * RELEASE (rq->lock) |
137 | * | 111 | * |
138 | * If we observe the old cpu in task_rq_lock, the acquire of | 112 | * If we observe the old CPU in task_rq_lock, the acquire of |
139 | * the old rq->lock will fully serialize against the stores. | 113 | * the old rq->lock will fully serialize against the stores. |
140 | * | 114 | * |
141 | * If we observe the new CPU in task_rq_lock, the acquire will | 115 | * If we observe the new CPU in task_rq_lock, the acquire will |
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay) | |||
333 | } | 307 | } |
334 | #endif /* CONFIG_SMP */ | 308 | #endif /* CONFIG_SMP */ |
335 | 309 | ||
336 | static void init_rq_hrtick(struct rq *rq) | 310 | static void hrtick_rq_init(struct rq *rq) |
337 | { | 311 | { |
338 | #ifdef CONFIG_SMP | 312 | #ifdef CONFIG_SMP |
339 | rq->hrtick_csd_pending = 0; | 313 | rq->hrtick_csd_pending = 0; |
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq) | |||
351 | { | 325 | { |
352 | } | 326 | } |
353 | 327 | ||
354 | static inline void init_rq_hrtick(struct rq *rq) | 328 | static inline void hrtick_rq_init(struct rq *rq) |
355 | { | 329 | { |
356 | } | 330 | } |
357 | #endif /* CONFIG_SCHED_HRTICK */ | 331 | #endif /* CONFIG_SCHED_HRTICK */ |
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process); | |||
1457 | * | 1431 | * |
1458 | * - cpu_active must be a subset of cpu_online | 1432 | * - cpu_active must be a subset of cpu_online |
1459 | * | 1433 | * |
1460 | * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, | 1434 | * - on CPU-up we allow per-CPU kthreads on the online && !active CPU, |
1461 | * see __set_cpus_allowed_ptr(). At this point the newly online | 1435 | * see __set_cpus_allowed_ptr(). At this point the newly online |
1462 | * CPU isn't yet part of the sched domains, and balancing will not | 1436 | * CPU isn't yet part of the sched domains, and balancing will not |
1463 | * see it. | 1437 | * see it. |
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq) | |||
2629 | raw_spin_unlock_irq(&rq->lock); | 2603 | raw_spin_unlock_irq(&rq->lock); |
2630 | } | 2604 | } |
2631 | 2605 | ||
2606 | /* | ||
2607 | * NOP if the arch has not defined these: | ||
2608 | */ | ||
2609 | |||
2610 | #ifndef prepare_arch_switch | ||
2611 | # define prepare_arch_switch(next) do { } while (0) | ||
2612 | #endif | ||
2613 | |||
2614 | #ifndef finish_arch_post_lock_switch | ||
2615 | # define finish_arch_post_lock_switch() do { } while (0) | ||
2616 | #endif | ||
2617 | |||
2632 | /** | 2618 | /** |
2633 | * prepare_task_switch - prepare to switch tasks | 2619 | * prepare_task_switch - prepare to switch tasks |
2634 | * @rq: the runqueue preparing to switch | 2620 | * @rq: the runqueue preparing to switch |
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3037 | 3023 | ||
3038 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) | 3024 | #if defined(CONFIG_64BIT) && defined(CONFIG_SMP) |
3039 | /* | 3025 | /* |
3040 | * 64-bit doesn't need locks to atomically read a 64bit value. | 3026 | * 64-bit doesn't need locks to atomically read a 64-bit value. |
3041 | * So we have an optimization chance when the task's delta_exec is 0. | 3027 | * So we have an optimization chance when the task's delta_exec is 0. |
3042 | * Reading ->on_cpu is racy, but this is ok. | 3028 | * Reading ->on_cpu is racy, but this is ok. |
3043 | * | 3029 | * |
@@ -3096,35 +3082,99 @@ void scheduler_tick(void) | |||
3096 | rq->idle_balance = idle_cpu(cpu); | 3082 | rq->idle_balance = idle_cpu(cpu); |
3097 | trigger_load_balance(rq); | 3083 | trigger_load_balance(rq); |
3098 | #endif | 3084 | #endif |
3099 | rq_last_tick_reset(rq); | ||
3100 | } | 3085 | } |
3101 | 3086 | ||
3102 | #ifdef CONFIG_NO_HZ_FULL | 3087 | #ifdef CONFIG_NO_HZ_FULL |
3103 | /** | 3088 | |
3104 | * scheduler_tick_max_deferment | 3089 | struct tick_work { |
3105 | * | 3090 | int cpu; |
3106 | * Keep at least one tick per second when a single | 3091 | struct delayed_work work; |
3107 | * active task is running because the scheduler doesn't | 3092 | }; |
3108 | * yet completely support full dynticks environment. | 3093 | |
3109 | * | 3094 | static struct tick_work __percpu *tick_work_cpu; |
3110 | * This makes sure that uptime, CFS vruntime, load | 3095 | |
3111 | * balancing, etc... continue to move forward, even | 3096 | static void sched_tick_remote(struct work_struct *work) |
3112 | * with a very low granularity. | ||
3113 | * | ||
3114 | * Return: Maximum deferment in nanoseconds. | ||
3115 | */ | ||
3116 | u64 scheduler_tick_max_deferment(void) | ||
3117 | { | 3097 | { |
3118 | struct rq *rq = this_rq(); | 3098 | struct delayed_work *dwork = to_delayed_work(work); |
3119 | unsigned long next, now = READ_ONCE(jiffies); | 3099 | struct tick_work *twork = container_of(dwork, struct tick_work, work); |
3100 | int cpu = twork->cpu; | ||
3101 | struct rq *rq = cpu_rq(cpu); | ||
3102 | struct rq_flags rf; | ||
3120 | 3103 | ||
3121 | next = rq->last_sched_tick + HZ; | 3104 | /* |
3105 | * Handle the tick only if it appears the remote CPU is running in full | ||
3106 | * dynticks mode. The check is racy by nature, but missing a tick or | ||
3107 | * having one too much is no big deal because the scheduler tick updates | ||
3108 | * statistics and checks timeslices in a time-independent way, regardless | ||
3109 | * of when exactly it is running. | ||
3110 | */ | ||
3111 | if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) { | ||
3112 | struct task_struct *curr; | ||
3113 | u64 delta; | ||
3122 | 3114 | ||
3123 | if (time_before_eq(next, now)) | 3115 | rq_lock_irq(rq, &rf); |
3124 | return 0; | 3116 | update_rq_clock(rq); |
3117 | curr = rq->curr; | ||
3118 | delta = rq_clock_task(rq) - curr->se.exec_start; | ||
3125 | 3119 | ||
3126 | return jiffies_to_nsecs(next - now); | 3120 | /* |
3121 | * Make sure the next tick runs within a reasonable | ||
3122 | * amount of time. | ||
3123 | */ | ||
3124 | WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); | ||
3125 | curr->sched_class->task_tick(rq, curr, 0); | ||
3126 | rq_unlock_irq(rq, &rf); | ||
3127 | } | ||
3128 | |||
3129 | /* | ||
3130 | * Run the remote tick once per second (1Hz). This arbitrary | ||
3131 | * frequency is large enough to avoid overload but short enough | ||
3132 | * to keep scheduler internal stats reasonably up to date. | ||
3133 | */ | ||
3134 | queue_delayed_work(system_unbound_wq, dwork, HZ); | ||
3127 | } | 3135 | } |
3136 | |||
3137 | static void sched_tick_start(int cpu) | ||
3138 | { | ||
3139 | struct tick_work *twork; | ||
3140 | |||
3141 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3142 | return; | ||
3143 | |||
3144 | WARN_ON_ONCE(!tick_work_cpu); | ||
3145 | |||
3146 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3147 | twork->cpu = cpu; | ||
3148 | INIT_DELAYED_WORK(&twork->work, sched_tick_remote); | ||
3149 | queue_delayed_work(system_unbound_wq, &twork->work, HZ); | ||
3150 | } | ||
3151 | |||
3152 | #ifdef CONFIG_HOTPLUG_CPU | ||
3153 | static void sched_tick_stop(int cpu) | ||
3154 | { | ||
3155 | struct tick_work *twork; | ||
3156 | |||
3157 | if (housekeeping_cpu(cpu, HK_FLAG_TICK)) | ||
3158 | return; | ||
3159 | |||
3160 | WARN_ON_ONCE(!tick_work_cpu); | ||
3161 | |||
3162 | twork = per_cpu_ptr(tick_work_cpu, cpu); | ||
3163 | cancel_delayed_work_sync(&twork->work); | ||
3164 | } | ||
3165 | #endif /* CONFIG_HOTPLUG_CPU */ | ||
3166 | |||
3167 | int __init sched_tick_offload_init(void) | ||
3168 | { | ||
3169 | tick_work_cpu = alloc_percpu(struct tick_work); | ||
3170 | BUG_ON(!tick_work_cpu); | ||
3171 | |||
3172 | return 0; | ||
3173 | } | ||
3174 | |||
3175 | #else /* !CONFIG_NO_HZ_FULL */ | ||
3176 | static inline void sched_tick_start(int cpu) { } | ||
3177 | static inline void sched_tick_stop(int cpu) { } | ||
3128 | #endif | 3178 | #endif |
3129 | 3179 | ||
3130 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ | 3180 | #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ |
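Editor's note: to make the new remote-tick machinery above easier to read outside the diff markup, here is a hedged distillation of the pattern it uses, a per-CPU delayed work item on system_unbound_wq that re-arms itself once per second. The example_ names are placeholders; the real code additionally skips housekeeping CPUs and runs the task_tick hook under the runqueue lock.

#include <linux/workqueue.h>
#include <linux/percpu.h>
#include <linux/jiffies.h>
#include <linux/init.h>
#include <linux/errno.h>

struct example_tick {
	int cpu;
	struct delayed_work work;
};

static struct example_tick __percpu *example_tick_pcpu;

static void example_tick_fn(struct work_struct *work)
{
	struct delayed_work *dwork = to_delayed_work(work);
	struct example_tick *t = container_of(dwork, struct example_tick, work);

	/* Periodic bookkeeping for t->cpu would go here. */

	/* Re-arm at roughly 1Hz, like the offloaded scheduler tick. */
	queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void example_tick_start(int cpu)
{
	struct example_tick *t = per_cpu_ptr(example_tick_pcpu, cpu);

	t->cpu = cpu;
	INIT_DELAYED_WORK(&t->work, example_tick_fn);
	queue_delayed_work(system_unbound_wq, &t->work, HZ);
}

static int __init example_tick_init(void)
{
	example_tick_pcpu = alloc_percpu(struct example_tick);
	return example_tick_pcpu ? 0 : -ENOMEM;
}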
@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu) | |||
5786 | { | 5836 | { |
5787 | set_cpu_rq_start_time(cpu); | 5837 | set_cpu_rq_start_time(cpu); |
5788 | sched_rq_cpu_starting(cpu); | 5838 | sched_rq_cpu_starting(cpu); |
5839 | sched_tick_start(cpu); | ||
5789 | return 0; | 5840 | return 0; |
5790 | } | 5841 | } |
5791 | 5842 | ||
@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu) | |||
5797 | 5848 | ||
5798 | /* Handle pending wakeups and then migrate everything off */ | 5849 | /* Handle pending wakeups and then migrate everything off */ |
5799 | sched_ttwu_pending(); | 5850 | sched_ttwu_pending(); |
5851 | sched_tick_stop(cpu); | ||
5800 | 5852 | ||
5801 | rq_lock_irqsave(rq, &rf); | 5853 | rq_lock_irqsave(rq, &rf); |
5802 | if (rq->rd) { | 5854 | if (rq->rd) { |
@@ -6024,11 +6076,8 @@ void __init sched_init(void) | |||
6024 | rq->last_load_update_tick = jiffies; | 6076 | rq->last_load_update_tick = jiffies; |
6025 | rq->nohz_flags = 0; | 6077 | rq->nohz_flags = 0; |
6026 | #endif | 6078 | #endif |
6027 | #ifdef CONFIG_NO_HZ_FULL | ||
6028 | rq->last_sched_tick = 0; | ||
6029 | #endif | ||
6030 | #endif /* CONFIG_SMP */ | 6079 | #endif /* CONFIG_SMP */ |
6031 | init_rq_hrtick(rq); | 6080 | hrtick_rq_init(rq); |
6032 | atomic_set(&rq->nr_iowait, 0); | 6081 | atomic_set(&rq->nr_iowait, 0); |
6033 | } | 6082 | } |
6034 | 6083 | ||
@@ -7027,3 +7076,5 @@ const u32 sched_prio_to_wmult[40] = { | |||
7027 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, | 7076 | /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, |
7028 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, | 7077 | /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, |
7029 | }; | 7078 | }; |
7079 | |||
7080 | #undef CREATE_TRACE_POINTS | ||
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 44ab32a4fab6..9fbb10383434 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c | |||
@@ -1,24 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/cgroup.h> | ||
3 | #include <linux/slab.h> | ||
4 | #include <linux/percpu.h> | ||
5 | #include <linux/spinlock.h> | ||
6 | #include <linux/cpumask.h> | ||
7 | #include <linux/seq_file.h> | ||
8 | #include <linux/rcupdate.h> | ||
9 | #include <linux/kernel_stat.h> | ||
10 | #include <linux/err.h> | ||
11 | |||
12 | #include "sched.h" | ||
13 | |||
14 | /* | 2 | /* |
15 | * CPU accounting code for task groups. | 3 | * CPU accounting code for task groups. |
16 | * | 4 | * |
17 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | 5 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh |
18 | * (balbir@in.ibm.com). | 6 | * (balbir@in.ibm.com). |
19 | */ | 7 | */ |
8 | #include "sched.h" | ||
20 | 9 | ||
21 | /* Time spent by the tasks of the cpu accounting group executing in ... */ | 10 | /* Time spent by the tasks of the CPU accounting group executing in ... */ |
22 | enum cpuacct_stat_index { | 11 | enum cpuacct_stat_index { |
23 | CPUACCT_STAT_USER, /* ... user mode */ | 12 | CPUACCT_STAT_USER, /* ... user mode */ |
24 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ | 13 | CPUACCT_STAT_SYSTEM, /* ... kernel mode */ |
@@ -35,12 +24,12 @@ struct cpuacct_usage { | |||
35 | u64 usages[CPUACCT_STAT_NSTATS]; | 24 | u64 usages[CPUACCT_STAT_NSTATS]; |
36 | }; | 25 | }; |
37 | 26 | ||
38 | /* track cpu usage of a group of tasks and its child groups */ | 27 | /* track CPU usage of a group of tasks and its child groups */ |
39 | struct cpuacct { | 28 | struct cpuacct { |
40 | struct cgroup_subsys_state css; | 29 | struct cgroup_subsys_state css; |
41 | /* cpuusage holds pointer to a u64-type object on every cpu */ | 30 | /* cpuusage holds pointer to a u64-type object on every CPU */ |
42 | struct cpuacct_usage __percpu *cpuusage; | 31 | struct cpuacct_usage __percpu *cpuusage; |
43 | struct kernel_cpustat __percpu *cpustat; | 32 | struct kernel_cpustat __percpu *cpustat; |
44 | }; | 33 | }; |
45 | 34 | ||
46 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | 35 | static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) |
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) | |||
48 | return css ? container_of(css, struct cpuacct, css) : NULL; | 37 | return css ? container_of(css, struct cpuacct, css) : NULL; |
49 | } | 38 | } |
50 | 39 | ||
51 | /* return cpu accounting group to which this task belongs */ | 40 | /* Return CPU accounting group to which this task belongs */ |
52 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | 41 | static inline struct cpuacct *task_ca(struct task_struct *tsk) |
53 | { | 42 | { |
54 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); | 43 | return css_ca(task_css(tsk, cpuacct_cgrp_id)); |
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = { | |||
65 | .cpuusage = &root_cpuacct_cpuusage, | 54 | .cpuusage = &root_cpuacct_cpuusage, |
66 | }; | 55 | }; |
67 | 56 | ||
68 | /* create a new cpu accounting group */ | 57 | /* Create a new CPU accounting group */ |
69 | static struct cgroup_subsys_state * | 58 | static struct cgroup_subsys_state * |
70 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) | 59 | cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) |
71 | { | 60 | { |
@@ -96,7 +85,7 @@ out: | |||
96 | return ERR_PTR(-ENOMEM); | 85 | return ERR_PTR(-ENOMEM); |
97 | } | 86 | } |
98 | 87 | ||
99 | /* destroy an existing cpu accounting group */ | 88 | /* Destroy an existing CPU accounting group */ |
100 | static void cpuacct_css_free(struct cgroup_subsys_state *css) | 89 | static void cpuacct_css_free(struct cgroup_subsys_state *css) |
101 | { | 90 | { |
102 | struct cpuacct *ca = css_ca(css); | 91 | struct cpuacct *ca = css_ca(css); |
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) | |||
162 | #endif | 151 | #endif |
163 | } | 152 | } |
164 | 153 | ||
165 | /* return total cpu usage (in nanoseconds) of a group */ | 154 | /* Return total CPU usage (in nanoseconds) of a group */ |
166 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, | 155 | static u64 __cpuusage_read(struct cgroup_subsys_state *css, |
167 | enum cpuacct_stat_index index) | 156 | enum cpuacct_stat_index index) |
168 | { | 157 | { |
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8d9562d890d3..50316455ea66 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c | |||
@@ -10,11 +10,7 @@ | |||
10 | * as published by the Free Software Foundation; version 2 | 10 | * as published by the Free Software Foundation; version 2 |
11 | * of the License. | 11 | * of the License. |
12 | */ | 12 | */ |
13 | 13 | #include "sched.h" | |
14 | #include <linux/gfp.h> | ||
15 | #include <linux/kernel.h> | ||
16 | #include <linux/slab.h> | ||
17 | #include "cpudeadline.h" | ||
18 | 14 | ||
19 | static inline int parent(int i) | 15 | static inline int parent(int i) |
20 | { | 16 | { |
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx) | |||
42 | return; | 38 | return; |
43 | 39 | ||
44 | /* adapted from lib/prio_heap.c */ | 40 | /* adapted from lib/prio_heap.c */ |
45 | while(1) { | 41 | while (1) { |
46 | u64 largest_dl; | 42 | u64 largest_dl; |
43 | |||
47 | l = left_child(idx); | 44 | l = left_child(idx); |
48 | r = right_child(idx); | 45 | r = right_child(idx); |
49 | largest = idx; | 46 | largest = idx; |
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
131 | return 1; | 128 | return 1; |
132 | } else { | 129 | } else { |
133 | int best_cpu = cpudl_maximum(cp); | 130 | int best_cpu = cpudl_maximum(cp); |
131 | |||
134 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); | 132 | WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); |
135 | 133 | ||
136 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && | 134 | if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && |
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, | |||
145 | } | 143 | } |
146 | 144 | ||
147 | /* | 145 | /* |
148 | * cpudl_clear - remove a cpu from the cpudl max-heap | 146 | * cpudl_clear - remove a CPU from the cpudl max-heap |
149 | * @cp: the cpudl max-heap context | 147 | * @cp: the cpudl max-heap context |
150 | * @cpu: the target cpu | 148 | * @cpu: the target CPU |
151 | * | 149 | * |
152 | * Notes: assumes cpu_rq(cpu)->lock is locked | 150 | * Notes: assumes cpu_rq(cpu)->lock is locked |
153 | * | 151 | * |
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu) | |||
186 | /* | 184 | /* |
187 | * cpudl_set - update the cpudl max-heap | 185 | * cpudl_set - update the cpudl max-heap |
188 | * @cp: the cpudl max-heap context | 186 | * @cp: the cpudl max-heap context |
189 | * @cpu: the target cpu | 187 | * @cpu: the target CPU |
190 | * @dl: the new earliest deadline for this cpu | 188 | * @dl: the new earliest deadline for this CPU |
191 | * | 189 | * |
192 | * Notes: assumes cpu_rq(cpu)->lock is locked | 190 | * Notes: assumes cpu_rq(cpu)->lock is locked |
193 | * | 191 | * |
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
205 | old_idx = cp->elements[cpu].idx; | 203 | old_idx = cp->elements[cpu].idx; |
206 | if (old_idx == IDX_INVALID) { | 204 | if (old_idx == IDX_INVALID) { |
207 | int new_idx = cp->size++; | 205 | int new_idx = cp->size++; |
206 | |||
208 | cp->elements[new_idx].dl = dl; | 207 | cp->elements[new_idx].dl = dl; |
209 | cp->elements[new_idx].cpu = cpu; | 208 | cp->elements[new_idx].cpu = cpu; |
210 | cp->elements[cpu].idx = new_idx; | 209 | cp->elements[cpu].idx = new_idx; |
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl) | |||
221 | /* | 220 | /* |
222 | * cpudl_set_freecpu - Set the cpudl.free_cpus | 221 | * cpudl_set_freecpu - Set the cpudl.free_cpus |
223 | * @cp: the cpudl max-heap context | 222 | * @cp: the cpudl max-heap context |
224 | * @cpu: rd attached cpu | 223 | * @cpu: rd attached CPU |
225 | */ | 224 | */ |
226 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) | 225 | void cpudl_set_freecpu(struct cpudl *cp, int cpu) |
227 | { | 226 | { |
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu) | |||
231 | /* | 230 | /* |
232 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus | 231 | * cpudl_clear_freecpu - Clear the cpudl.free_cpus |
233 | * @cp: the cpudl max-heap context | 232 | * @cp: the cpudl max-heap context |
234 | * @cpu: rd attached cpu | 233 | * @cpu: rd attached CPU |
235 | */ | 234 | */ |
236 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) | 235 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu) |
237 | { | 236 | { |
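Editor's note: for reference, a hedged refresher on the 0-based binary max-heap index arithmetic that the heapify helpers above assume; these one-liners restate textbook heap math rather than quoting the file.

static inline int example_parent(int i)      { return (i - 1) >> 1; }
static inline int example_left_child(int i)  { return (i << 1) + 1; }
static inline int example_right_child(int i) { return (i << 1) + 2; }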
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h index b010d26e108e..0adeda93b5fb 100644 --- a/kernel/sched/cpudeadline.h +++ b/kernel/sched/cpudeadline.h | |||
@@ -1,35 +1,26 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUDL_H | ||
3 | #define _LINUX_CPUDL_H | ||
4 | 2 | ||
5 | #include <linux/sched.h> | 3 | #define IDX_INVALID -1 |
6 | #include <linux/sched/deadline.h> | ||
7 | |||
8 | #define IDX_INVALID -1 | ||
9 | 4 | ||
10 | struct cpudl_item { | 5 | struct cpudl_item { |
11 | u64 dl; | 6 | u64 dl; |
12 | int cpu; | 7 | int cpu; |
13 | int idx; | 8 | int idx; |
14 | }; | 9 | }; |
15 | 10 | ||
16 | struct cpudl { | 11 | struct cpudl { |
17 | raw_spinlock_t lock; | 12 | raw_spinlock_t lock; |
18 | int size; | 13 | int size; |
19 | cpumask_var_t free_cpus; | 14 | cpumask_var_t free_cpus; |
20 | struct cpudl_item *elements; | 15 | struct cpudl_item *elements; |
21 | }; | 16 | }; |
22 | 17 | ||
23 | |||
24 | #ifdef CONFIG_SMP | 18 | #ifdef CONFIG_SMP |
25 | int cpudl_find(struct cpudl *cp, struct task_struct *p, | 19 | int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask); |
26 | struct cpumask *later_mask); | ||
27 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); | 20 | void cpudl_set(struct cpudl *cp, int cpu, u64 dl); |
28 | void cpudl_clear(struct cpudl *cp, int cpu); | 21 | void cpudl_clear(struct cpudl *cp, int cpu); |
29 | int cpudl_init(struct cpudl *cp); | 22 | int cpudl_init(struct cpudl *cp); |
30 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); | 23 | void cpudl_set_freecpu(struct cpudl *cp, int cpu); |
31 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); | 24 | void cpudl_clear_freecpu(struct cpudl *cp, int cpu); |
32 | void cpudl_cleanup(struct cpudl *cp); | 25 | void cpudl_cleanup(struct cpudl *cp); |
33 | #endif /* CONFIG_SMP */ | 26 | #endif /* CONFIG_SMP */ |
34 | |||
35 | #endif /* _LINUX_CPUDL_H */ | ||
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c index dbc51442ecbc..5e54cbcae673 100644 --- a/kernel/sched/cpufreq.c +++ b/kernel/sched/cpufreq.c | |||
@@ -8,7 +8,6 @@ | |||
8 | * it under the terms of the GNU General Public License version 2 as | 8 | * it under the terms of the GNU General Public License version 2 as |
9 | * published by the Free Software Foundation. | 9 | * published by the Free Software Foundation. |
10 | */ | 10 | */ |
11 | |||
12 | #include "sched.h" | 11 | #include "sched.h" |
13 | 12 | ||
14 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); | 13 | DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); |
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 7936f548e071..feb5f89020f2 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c | |||
@@ -11,61 +11,57 @@ | |||
11 | 11 | ||
12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt | 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
13 | 13 | ||
14 | #include <linux/cpufreq.h> | ||
15 | #include <linux/kthread.h> | ||
16 | #include <uapi/linux/sched/types.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <trace/events/power.h> | ||
19 | |||
20 | #include "sched.h" | 14 | #include "sched.h" |
21 | 15 | ||
16 | #include <trace/events/power.h> | ||
17 | |||
22 | struct sugov_tunables { | 18 | struct sugov_tunables { |
23 | struct gov_attr_set attr_set; | 19 | struct gov_attr_set attr_set; |
24 | unsigned int rate_limit_us; | 20 | unsigned int rate_limit_us; |
25 | }; | 21 | }; |
26 | 22 | ||
27 | struct sugov_policy { | 23 | struct sugov_policy { |
28 | struct cpufreq_policy *policy; | 24 | struct cpufreq_policy *policy; |
29 | 25 | ||
30 | struct sugov_tunables *tunables; | 26 | struct sugov_tunables *tunables; |
31 | struct list_head tunables_hook; | 27 | struct list_head tunables_hook; |
32 | 28 | ||
33 | raw_spinlock_t update_lock; /* For shared policies */ | 29 | raw_spinlock_t update_lock; /* For shared policies */ |
34 | u64 last_freq_update_time; | 30 | u64 last_freq_update_time; |
35 | s64 freq_update_delay_ns; | 31 | s64 freq_update_delay_ns; |
36 | unsigned int next_freq; | 32 | unsigned int next_freq; |
37 | unsigned int cached_raw_freq; | 33 | unsigned int cached_raw_freq; |
38 | 34 | ||
39 | /* The next fields are only needed if fast switch cannot be used. */ | 35 | /* The next fields are only needed if fast switch cannot be used: */ |
40 | struct irq_work irq_work; | 36 | struct irq_work irq_work; |
41 | struct kthread_work work; | 37 | struct kthread_work work; |
42 | struct mutex work_lock; | 38 | struct mutex work_lock; |
43 | struct kthread_worker worker; | 39 | struct kthread_worker worker; |
44 | struct task_struct *thread; | 40 | struct task_struct *thread; |
45 | bool work_in_progress; | 41 | bool work_in_progress; |
46 | 42 | ||
47 | bool need_freq_update; | 43 | bool need_freq_update; |
48 | }; | 44 | }; |
49 | 45 | ||
50 | struct sugov_cpu { | 46 | struct sugov_cpu { |
51 | struct update_util_data update_util; | 47 | struct update_util_data update_util; |
52 | struct sugov_policy *sg_policy; | 48 | struct sugov_policy *sg_policy; |
53 | unsigned int cpu; | 49 | unsigned int cpu; |
54 | 50 | ||
55 | bool iowait_boost_pending; | 51 | bool iowait_boost_pending; |
56 | unsigned int iowait_boost; | 52 | unsigned int iowait_boost; |
57 | unsigned int iowait_boost_max; | 53 | unsigned int iowait_boost_max; |
58 | u64 last_update; | 54 | u64 last_update; |
59 | 55 | ||
60 | /* The fields below are only needed when sharing a policy. */ | 56 | /* The fields below are only needed when sharing a policy: */ |
61 | unsigned long util_cfs; | 57 | unsigned long util_cfs; |
62 | unsigned long util_dl; | 58 | unsigned long util_dl; |
63 | unsigned long max; | 59 | unsigned long max; |
64 | unsigned int flags; | 60 | unsigned int flags; |
65 | 61 | ||
66 | /* The field below is for single-CPU policies only. */ | 62 | /* The field below is for single-CPU policies only: */ |
67 | #ifdef CONFIG_NO_HZ_COMMON | 63 | #ifdef CONFIG_NO_HZ_COMMON |
68 | unsigned long saved_idle_calls; | 64 | unsigned long saved_idle_calls; |
69 | #endif | 65 | #endif |
70 | }; | 66 | }; |
71 | 67 | ||
@@ -79,9 +75,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
79 | 75 | ||
80 | /* | 76 | /* |
81 | * Since cpufreq_update_util() is called with rq->lock held for | 77 | * Since cpufreq_update_util() is called with rq->lock held for |
82 | * the @target_cpu, our per-cpu data is fully serialized. | 78 | * the @target_cpu, our per-CPU data is fully serialized. |
83 | * | 79 | * |
84 | * However, drivers cannot in general deal with cross-cpu | 80 | * However, drivers cannot in general deal with cross-CPU |
85 | * requests, so while get_next_freq() will work, our | 81 | * requests, so while get_next_freq() will work, our |
86 | * sugov_update_commit() call may not for the fast switching platforms. | 82 | * sugov_update_commit() call may not for the fast switching platforms. |
87 | * | 83 | * |
@@ -111,6 +107,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time) | |||
111 | } | 107 | } |
112 | 108 | ||
113 | delta_ns = time - sg_policy->last_freq_update_time; | 109 | delta_ns = time - sg_policy->last_freq_update_time; |
110 | |||
114 | return delta_ns >= sg_policy->freq_update_delay_ns; | 111 | return delta_ns >= sg_policy->freq_update_delay_ns; |
115 | } | 112 | } |
116 | 113 | ||
@@ -345,8 +342,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) | |||
345 | return get_next_freq(sg_policy, util, max); | 342 | return get_next_freq(sg_policy, util, max); |
346 | } | 343 | } |
347 | 344 | ||
348 | static void sugov_update_shared(struct update_util_data *hook, u64 time, | 345 | static void |
349 | unsigned int flags) | 346 | sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags) |
350 | { | 347 | { |
351 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); | 348 | struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); |
352 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; | 349 | struct sugov_policy *sg_policy = sg_cpu->sg_policy; |
@@ -423,8 +420,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf) | |||
423 | return sprintf(buf, "%u\n", tunables->rate_limit_us); | 420 | return sprintf(buf, "%u\n", tunables->rate_limit_us); |
424 | } | 421 | } |
425 | 422 | ||
426 | static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, | 423 | static ssize_t |
427 | size_t count) | 424 | rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count) |
428 | { | 425 | { |
429 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); | 426 | struct sugov_tunables *tunables = to_sugov_tunables(attr_set); |
430 | struct sugov_policy *sg_policy; | 427 | struct sugov_policy *sg_policy; |
@@ -479,11 +476,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) | |||
479 | { | 476 | { |
480 | struct task_struct *thread; | 477 | struct task_struct *thread; |
481 | struct sched_attr attr = { | 478 | struct sched_attr attr = { |
482 | .size = sizeof(struct sched_attr), | 479 | .size = sizeof(struct sched_attr), |
483 | .sched_policy = SCHED_DEADLINE, | 480 | .sched_policy = SCHED_DEADLINE, |
484 | .sched_flags = SCHED_FLAG_SUGOV, | 481 | .sched_flags = SCHED_FLAG_SUGOV, |
485 | .sched_nice = 0, | 482 | .sched_nice = 0, |
486 | .sched_priority = 0, | 483 | .sched_priority = 0, |
487 | /* | 484 | /* |
488 | * Fake (unused) bandwidth; workaround to "fix" | 485 | * Fake (unused) bandwidth; workaround to "fix" |
489 | * priority inheritance. | 486 | * priority inheritance. |
@@ -663,21 +660,21 @@ static int sugov_start(struct cpufreq_policy *policy) | |||
663 | struct sugov_policy *sg_policy = policy->governor_data; | 660 | struct sugov_policy *sg_policy = policy->governor_data; |
664 | unsigned int cpu; | 661 | unsigned int cpu; |
665 | 662 | ||
666 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; | 663 | sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; |
667 | sg_policy->last_freq_update_time = 0; | 664 | sg_policy->last_freq_update_time = 0; |
668 | sg_policy->next_freq = UINT_MAX; | 665 | sg_policy->next_freq = UINT_MAX; |
669 | sg_policy->work_in_progress = false; | 666 | sg_policy->work_in_progress = false; |
670 | sg_policy->need_freq_update = false; | 667 | sg_policy->need_freq_update = false; |
671 | sg_policy->cached_raw_freq = 0; | 668 | sg_policy->cached_raw_freq = 0; |
672 | 669 | ||
673 | for_each_cpu(cpu, policy->cpus) { | 670 | for_each_cpu(cpu, policy->cpus) { |
674 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); | 671 | struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); |
675 | 672 | ||
676 | memset(sg_cpu, 0, sizeof(*sg_cpu)); | 673 | memset(sg_cpu, 0, sizeof(*sg_cpu)); |
677 | sg_cpu->cpu = cpu; | 674 | sg_cpu->cpu = cpu; |
678 | sg_cpu->sg_policy = sg_policy; | 675 | sg_cpu->sg_policy = sg_policy; |
679 | sg_cpu->flags = 0; | 676 | sg_cpu->flags = 0; |
680 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; | 677 | sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; |
681 | } | 678 | } |
682 | 679 | ||
683 | for_each_cpu(cpu, policy->cpus) { | 680 | for_each_cpu(cpu, policy->cpus) { |
@@ -721,14 +718,14 @@ static void sugov_limits(struct cpufreq_policy *policy) | |||
721 | } | 718 | } |
722 | 719 | ||
723 | static struct cpufreq_governor schedutil_gov = { | 720 | static struct cpufreq_governor schedutil_gov = { |
724 | .name = "schedutil", | 721 | .name = "schedutil", |
725 | .owner = THIS_MODULE, | 722 | .owner = THIS_MODULE, |
726 | .dynamic_switching = true, | 723 | .dynamic_switching = true, |
727 | .init = sugov_init, | 724 | .init = sugov_init, |
728 | .exit = sugov_exit, | 725 | .exit = sugov_exit, |
729 | .start = sugov_start, | 726 | .start = sugov_start, |
730 | .stop = sugov_stop, | 727 | .stop = sugov_stop, |
731 | .limits = sugov_limits, | 728 | .limits = sugov_limits, |
732 | }; | 729 | }; |
733 | 730 | ||
734 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL | 731 | #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL |
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 2511aba36b89..daaadf939ccb 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c | |||
@@ -14,7 +14,7 @@ | |||
14 | * | 14 | * |
15 | * going from the lowest priority to the highest. CPUs in the INVALID state | 15 | * going from the lowest priority to the highest. CPUs in the INVALID state |
16 | * are not eligible for routing. The system maintains this state with | 16 | * are not eligible for routing. The system maintains this state with |
17 | * a 2 dimensional bitmap (the first for priority class, the second for cpus | 17 | * a 2 dimensional bitmap (the first for priority class, the second for CPUs |
18 | * in that class). Therefore a typical application without affinity | 18 | * in that class). Therefore a typical application without affinity |
19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit | 19 | * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit |
20 | * searches). For tasks with affinity restrictions, the algorithm has a | 20 | * searches). For tasks with affinity restrictions, the algorithm has a |
@@ -26,12 +26,7 @@ | |||
26 | * as published by the Free Software Foundation; version 2 | 26 | * as published by the Free Software Foundation; version 2 |
27 | * of the License. | 27 | * of the License. |
28 | */ | 28 | */ |
29 | 29 | #include "sched.h" | |
30 | #include <linux/gfp.h> | ||
31 | #include <linux/sched.h> | ||
32 | #include <linux/sched/rt.h> | ||
33 | #include <linux/slab.h> | ||
34 | #include "cpupri.h" | ||
35 | 30 | ||
36 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ | 31 | /* Convert between a 140 based task->prio, and our 102 based cpupri */ |
37 | static int convert_prio(int prio) | 32 | static int convert_prio(int prio) |
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p, | |||
128 | } | 123 | } |
129 | 124 | ||
130 | /** | 125 | /** |
131 | * cpupri_set - update the cpu priority setting | 126 | * cpupri_set - update the CPU priority setting |
132 | * @cp: The cpupri context | 127 | * @cp: The cpupri context |
133 | * @cpu: The target cpu | 128 | * @cpu: The target CPU |
134 | * @newpri: The priority (INVALID-RT99) to assign to this CPU | 129 | * @newpri: The priority (INVALID-RT99) to assign to this CPU |
135 | * | 130 | * |
136 | * Note: Assumes cpu_rq(cpu)->lock is locked | 131 | * Note: Assumes cpu_rq(cpu)->lock is locked |
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri) | |||
151 | return; | 146 | return; |
152 | 147 | ||
153 | /* | 148 | /* |
154 | * If the cpu was currently mapped to a different value, we | 149 | * If the CPU was currently mapped to a different value, we |
155 | * need to map it to the new value then remove the old value. | 150 | * need to map it to the new value then remove the old value. |
156 | * Note, we must add the new value first, otherwise we risk the | 151 | * Note, we must add the new value first, otherwise we risk the |
157 | * cpu being missed by the priority loop in cpupri_find. | 152 | * cpu being missed by the priority loop in cpupri_find. |
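Editor's note: a hedged sketch of the two-level lookup described in this file's header comment, walking the priority vectors from lowest to highest and stopping at the first one whose CPU mask overlaps the task's affinity. It mirrors the shape of cpupri_find() but omits the priority cut-off and the count/memory-ordering checks the real code needs, and it assumes the kernel/sched/sched.h context of this file.

static int example_cpupri_lookup(struct cpupri *cp, struct task_struct *p,
				 struct cpumask *lowest_mask)
{
	int idx;

	for (idx = 0; idx < CPUPRI_NR_PRIORITIES; idx++) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[idx];

		if (!cpumask_intersects(&p->cpus_allowed, vec->mask))
			continue;

		if (lowest_mask)
			cpumask_and(lowest_mask, &p->cpus_allowed, vec->mask);

		return 1;	/* CPUs found at this (lowest possible) priority */
	}

	return 0;
}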
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index bab050019071..7dc20a3232e7 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h | |||
@@ -1,32 +1,25 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #ifndef _LINUX_CPUPRI_H | ||
3 | #define _LINUX_CPUPRI_H | ||
4 | |||
5 | #include <linux/sched.h> | ||
6 | 2 | ||
7 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) | 3 | #define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) |
8 | 4 | ||
9 | #define CPUPRI_INVALID -1 | 5 | #define CPUPRI_INVALID -1 |
10 | #define CPUPRI_IDLE 0 | 6 | #define CPUPRI_IDLE 0 |
11 | #define CPUPRI_NORMAL 1 | 7 | #define CPUPRI_NORMAL 1 |
12 | /* values 2-101 are RT priorities 0-99 */ | 8 | /* values 2-101 are RT priorities 0-99 */ |
13 | 9 | ||
14 | struct cpupri_vec { | 10 | struct cpupri_vec { |
15 | atomic_t count; | 11 | atomic_t count; |
16 | cpumask_var_t mask; | 12 | cpumask_var_t mask; |
17 | }; | 13 | }; |
18 | 14 | ||
19 | struct cpupri { | 15 | struct cpupri { |
20 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; | 16 | struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; |
21 | int *cpu_to_pri; | 17 | int *cpu_to_pri; |
22 | }; | 18 | }; |
23 | 19 | ||
24 | #ifdef CONFIG_SMP | 20 | #ifdef CONFIG_SMP |
25 | int cpupri_find(struct cpupri *cp, | 21 | int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); |
26 | struct task_struct *p, struct cpumask *lowest_mask); | ||
27 | void cpupri_set(struct cpupri *cp, int cpu, int pri); | 22 | void cpupri_set(struct cpupri *cp, int cpu, int pri); |
28 | int cpupri_init(struct cpupri *cp); | 23 | int cpupri_init(struct cpupri *cp); |
29 | void cpupri_cleanup(struct cpupri *cp); | 24 | void cpupri_cleanup(struct cpupri *cp); |
30 | #endif | 25 | #endif |
31 | |||
32 | #endif /* _LINUX_CPUPRI_H */ | ||
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index bac6ac9a4ec7..0796f938c4f0 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c | |||
@@ -1,10 +1,6 @@ | |||
1 | #include <linux/export.h> | 1 | /* |
2 | #include <linux/sched.h> | 2 | * Simple CPU accounting cgroup controller |
3 | #include <linux/tsacct_kern.h> | 3 | */ |
4 | #include <linux/kernel_stat.h> | ||
5 | #include <linux/static_key.h> | ||
6 | #include <linux/context_tracking.h> | ||
7 | #include <linux/sched/cputime.h> | ||
8 | #include "sched.h" | 4 | #include "sched.h" |
9 | 5 | ||
10 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 6 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index, | |||
113 | } | 109 | } |
114 | 110 | ||
115 | /* | 111 | /* |
116 | * Account user cpu time to a process. | 112 | * Account user CPU time to a process. |
117 | * @p: the process that the cpu time gets accounted to | 113 | * @p: the process that the CPU time gets accounted to |
118 | * @cputime: the cpu time spent in user space since the last update | 114 | * @cputime: the CPU time spent in user space since the last update |
119 | */ | 115 | */ |
120 | void account_user_time(struct task_struct *p, u64 cputime) | 116 | void account_user_time(struct task_struct *p, u64 cputime) |
121 | { | 117 | { |
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime) | |||
135 | } | 131 | } |
136 | 132 | ||
137 | /* | 133 | /* |
138 | * Account guest cpu time to a process. | 134 | * Account guest CPU time to a process. |
139 | * @p: the process that the cpu time gets accounted to | 135 | * @p: the process that the CPU time gets accounted to |
140 | * @cputime: the cpu time spent in virtual machine since the last update | 136 | * @cputime: the CPU time spent in virtual machine since the last update |
141 | */ | 137 | */ |
142 | void account_guest_time(struct task_struct *p, u64 cputime) | 138 | void account_guest_time(struct task_struct *p, u64 cputime) |
143 | { | 139 | { |
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime) | |||
159 | } | 155 | } |
160 | 156 | ||
161 | /* | 157 | /* |
162 | * Account system cpu time to a process and desired cpustat field | 158 | * Account system CPU time to a process and desired cpustat field |
163 | * @p: the process that the cpu time gets accounted to | 159 | * @p: the process that the CPU time gets accounted to |
164 | * @cputime: the cpu time spent in kernel space since the last update | 160 | * @cputime: the CPU time spent in kernel space since the last update |
165 | * @index: pointer to cpustat field that has to be updated | 161 | * @index: pointer to cpustat field that has to be updated |
166 | */ | 162 | */ |
167 | void account_system_index_time(struct task_struct *p, | 163 | void account_system_index_time(struct task_struct *p, |
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p, | |||
179 | } | 175 | } |
180 | 176 | ||
181 | /* | 177 | /* |
182 | * Account system cpu time to a process. | 178 | * Account system CPU time to a process. |
183 | * @p: the process that the cpu time gets accounted to | 179 | * @p: the process that the CPU time gets accounted to |
184 | * @hardirq_offset: the offset to subtract from hardirq_count() | 180 | * @hardirq_offset: the offset to subtract from hardirq_count() |
185 | * @cputime: the cpu time spent in kernel space since the last update | 181 | * @cputime: the CPU time spent in kernel space since the last update |
186 | */ | 182 | */ |
187 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | 183 | void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) |
188 | { | 184 | { |
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) | |||
205 | 201 | ||
206 | /* | 202 | /* |
207 | * Account for involuntary wait time. | 203 | * Account for involuntary wait time. |
208 | * @cputime: the cpu time spent in involuntary wait | 204 | * @cputime: the CPU time spent in involuntary wait |
209 | */ | 205 | */ |
210 | void account_steal_time(u64 cputime) | 206 | void account_steal_time(u64 cputime) |
211 | { | 207 | { |
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime) | |||
216 | 212 | ||
217 | /* | 213 | /* |
218 | * Account for idle time. | 214 | * Account for idle time. |
219 | * @cputime: the cpu time spent in idle wait | 215 | * @cputime: the CPU time spent in idle wait |
220 | */ | 216 | */ |
221 | void account_idle_time(u64 cputime) | 217 | void account_idle_time(u64 cputime) |
222 | { | 218 | { |
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) | |||
338 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 334 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
339 | /* | 335 | /* |
340 | * Account a tick to a process and cpustat | 336 | * Account a tick to a process and cpustat |
341 | * @p: the process that the cpu time gets accounted to | 337 | * @p: the process that the CPU time gets accounted to |
342 | * @user_tick: is the tick from userspace | 338 | * @user_tick: is the tick from userspace |
343 | * @rq: the pointer to rq | 339 | * @rq: the pointer to rq |
344 | * | 340 | * |
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks) | |||
400 | irqtime_account_process_tick(current, 0, rq, ticks); | 396 | irqtime_account_process_tick(current, 0, rq, ticks); |
401 | } | 397 | } |
402 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ | 398 | #else /* CONFIG_IRQ_TIME_ACCOUNTING */ |
403 | static inline void irqtime_account_idle_ticks(int ticks) {} | 399 | static inline void irqtime_account_idle_ticks(int ticks) { } |
404 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, | 400 | static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, |
405 | struct rq *rq, int nr_ticks) {} | 401 | struct rq *rq, int nr_ticks) { } |
406 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ | 402 | #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ |
407 | 403 | ||
408 | /* | 404 | /* |
409 | * Use precise platform statistics if available: | 405 | * Use precise platform statistics if available: |
410 | */ | 406 | */ |
411 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING | 407 | #ifdef CONFIG_VIRT_CPU_ACCOUNTING |
412 | 408 | # ifndef __ARCH_HAS_VTIME_TASK_SWITCH | |
413 | #ifndef __ARCH_HAS_VTIME_TASK_SWITCH | ||
414 | void vtime_common_task_switch(struct task_struct *prev) | 409 | void vtime_common_task_switch(struct task_struct *prev) |
415 | { | 410 | { |
416 | if (is_idle_task(prev)) | 411 | if (is_idle_task(prev)) |
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev) | |||
421 | vtime_flush(prev); | 416 | vtime_flush(prev); |
422 | arch_vtime_task_switch(prev); | 417 | arch_vtime_task_switch(prev); |
423 | } | 418 | } |
424 | #endif | 419 | # endif |
425 | |||
426 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ | 420 | #endif /* CONFIG_VIRT_CPU_ACCOUNTING */ |
427 | 421 | ||
428 | 422 | ||
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) | |||
469 | *ut = cputime.utime; | 463 | *ut = cputime.utime; |
470 | *st = cputime.stime; | 464 | *st = cputime.stime; |
471 | } | 465 | } |
472 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ | 466 | |
467 | #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */ | ||
468 | |||
473 | /* | 469 | /* |
474 | * Account a single tick of cpu time. | 470 | * Account a single tick of CPU time. |
475 | * @p: the process that the cpu time gets accounted to | 471 | * @p: the process that the CPU time gets accounted to |
476 | * @user_tick: indicates if the tick is a user or a system tick | 472 | * @user_tick: indicates if the tick is a user or a system tick |
477 | */ | 473 | */ |
478 | void account_process_tick(struct task_struct *p, int user_tick) | 474 | void account_process_tick(struct task_struct *p, int user_tick) |
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9df09782025c..8b7c2b35bec9 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c | |||
@@ -17,9 +17,6 @@ | |||
17 | */ | 17 | */ |
18 | #include "sched.h" | 18 | #include "sched.h" |
19 | 19 | ||
20 | #include <linux/slab.h> | ||
21 | #include <uapi/linux/sched/types.h> | ||
22 | |||
23 | struct dl_bandwidth def_dl_bandwidth; | 20 | struct dl_bandwidth def_dl_bandwidth; |
24 | 21 | ||
25 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) | 22 | static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) |
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head); | |||
514 | static void push_dl_tasks(struct rq *); | 511 | static void push_dl_tasks(struct rq *); |
515 | static void pull_dl_task(struct rq *); | 512 | static void pull_dl_task(struct rq *); |
516 | 513 | ||
517 | static inline void queue_push_tasks(struct rq *rq) | 514 | static inline void deadline_queue_push_tasks(struct rq *rq) |
518 | { | 515 | { |
519 | if (!has_pushable_dl_tasks(rq)) | 516 | if (!has_pushable_dl_tasks(rq)) |
520 | return; | 517 | return; |
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
522 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); | 519 | queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); |
523 | } | 520 | } |
524 | 521 | ||
525 | static inline void queue_pull_task(struct rq *rq) | 522 | static inline void deadline_queue_pull_task(struct rq *rq) |
526 | { | 523 | { |
527 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); | 524 | queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); |
528 | } | 525 | } |
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p | |||
539 | 536 | ||
540 | /* | 537 | /* |
541 | * If we cannot preempt any rq, fall back to pick any | 538 | * If we cannot preempt any rq, fall back to pick any |
542 | * online cpu. | 539 | * online CPU: |
543 | */ | 540 | */ |
544 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); | 541 | cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); |
545 | if (cpu >= nr_cpu_ids) { | 542 | if (cpu >= nr_cpu_ids) { |
546 | /* | 543 | /* |
547 | * Fail to find any suitable cpu. | 544 | * Failed to find any suitable CPU. |
548 | * The task will never come back! | 545 | * The task will never come back! |
549 | */ | 546 | */ |
550 | BUG_ON(dl_bandwidth_enabled()); | 547 | BUG_ON(dl_bandwidth_enabled()); |
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq) | |||
597 | { | 594 | { |
598 | } | 595 | } |
599 | 596 | ||
600 | static inline void queue_push_tasks(struct rq *rq) | 597 | static inline void deadline_queue_push_tasks(struct rq *rq) |
601 | { | 598 | { |
602 | } | 599 | } |
603 | 600 | ||
604 | static inline void queue_pull_task(struct rq *rq) | 601 | static inline void deadline_queue_pull_task(struct rq *rq) |
605 | { | 602 | { |
606 | } | 603 | } |
607 | #endif /* CONFIG_SMP */ | 604 | #endif /* CONFIG_SMP */ |
608 | 605 | ||
609 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 606 | static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
610 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); | 607 | static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); |
611 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, | 608 | static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); |
612 | int flags); | ||
613 | 609 | ||
614 | /* | 610 | /* |
615 | * We are being explicitly informed that a new instance is starting, | 611 | * We are being explicitly informed that a new instance is starting, |
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1763 | if (hrtick_enabled(rq)) | 1759 | if (hrtick_enabled(rq)) |
1764 | start_hrtick_dl(rq, p); | 1760 | start_hrtick_dl(rq, p); |
1765 | 1761 | ||
1766 | queue_push_tasks(rq); | 1762 | deadline_queue_push_tasks(rq); |
1767 | 1763 | ||
1768 | return p; | 1764 | return p; |
1769 | } | 1765 | } |
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p) | |||
1776 | enqueue_pushable_dl_task(rq, p); | 1772 | enqueue_pushable_dl_task(rq, p); |
1777 | } | 1773 | } |
1778 | 1774 | ||
1775 | /* | ||
1776 | * scheduler tick hitting a task of our scheduling class. | ||
1777 | * | ||
1778 | * NOTE: This function can be called remotely by the tick offload that | ||
1779 | * goes along full dynticks. Therefore no local assumption can be made | ||
1780 | * and everything must be accessed through the @rq and @curr passed in | ||
1781 | * parameters. | ||
1782 | */ | ||
1779 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) | 1783 | static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) |
1780 | { | 1784 | { |
1781 | update_curr_dl(rq); | 1785 | update_curr_dl(rq); |
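The NOTE added above states the contract that matters for the 1Hz tick offload: the handler may be invoked from a housekeeping CPU, so it must only touch state reachable through the @rq and @curr parameters, never implicit "this CPU" state. A toy userspace sketch of that shape (all names invented for illustration; this is not kernel code):

#include <stdio.h>

struct rq   { int cpu; unsigned long clock; };
struct task { const char *comm; unsigned long runtime; };

/* Safe to call remotely: everything is reached via @rq and @curr. */
static void task_tick(struct rq *rq, struct task *curr)
{
	curr->runtime += 1;     /* account one tick to @curr */
	rq->clock     += 1;     /* advance @rq's view of time */
}

int main(void)
{
	struct rq rq  = { .cpu = 3, .clock = 0 };
	struct task t = { .comm = "worker", .runtime = 0 };

	/* Imagine this loop running on housekeeping CPU 0 on behalf of CPU 3. */
	for (int i = 0; i < 4; i++)
		task_tick(&rq, &t);

	printf("%s on CPU%d: %lu ticks, rq clock %lu\n",
	       t.comm, rq.cpu, t.runtime, rq.clock);
	return 0;
}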
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task) | |||
1865 | 1869 | ||
1866 | /* | 1870 | /* |
1867 | * We have to consider system topology and task affinity | 1871 | * We have to consider system topology and task affinity |
1868 | * first, then we can look for a suitable cpu. | 1872 | * first, then we can look for a suitable CPU. |
1869 | */ | 1873 | */ |
1870 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) | 1874 | if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) |
1871 | return -1; | 1875 | return -1; |
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task) | |||
1879 | * Now we check how well this matches with task's | 1883 | * Now we check how well this matches with task's |
1880 | * affinity and system topology. | 1884 | * affinity and system topology. |
1881 | * | 1885 | * |
1882 | * The last cpu where the task run is our first | 1886 | * The last CPU where the task run is our first |
1883 | * guess, since it is most likely cache-hot there. | 1887 | * guess, since it is most likely cache-hot there. |
1884 | */ | 1888 | */ |
1885 | if (cpumask_test_cpu(cpu, later_mask)) | 1889 | if (cpumask_test_cpu(cpu, later_mask)) |
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task) | |||
1909 | best_cpu = cpumask_first_and(later_mask, | 1913 | best_cpu = cpumask_first_and(later_mask, |
1910 | sched_domain_span(sd)); | 1914 | sched_domain_span(sd)); |
1911 | /* | 1915 | /* |
1912 | * Last chance: if a cpu being in both later_mask | 1916 | * Last chance: if a CPU being in both later_mask |
1913 | * and current sd span is valid, that becomes our | 1917 | * and current sd span is valid, that becomes our |
1914 | * choice. Of course, the latest possible cpu is | 1918 | * choice. Of course, the latest possible CPU is |
1915 | * already under consideration through later_mask. | 1919 | * already under consideration through later_mask. |
1916 | */ | 1920 | */ |
1917 | if (best_cpu < nr_cpu_ids) { | 1921 | if (best_cpu < nr_cpu_ids) { |
@@ -2067,7 +2071,7 @@ retry: | |||
2067 | if (task == next_task) { | 2071 | if (task == next_task) { |
2068 | /* | 2072 | /* |
2069 | * The task is still there. We don't try | 2073 | * The task is still there. We don't try |
2070 | * again, some other cpu will pull it when ready. | 2074 | * again, some other CPU will pull it when ready. |
2071 | */ | 2075 | */ |
2072 | goto out; | 2076 | goto out; |
2073 | } | 2077 | } |
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) | |||
2300 | /* | 2304 | /* |
2301 | * Since this might be the only -deadline task on the rq, | 2305 | * Since this might be the only -deadline task on the rq, |
2302 | * this is the right place to try to pull some other one | 2306 | * this is the right place to try to pull some other one |
2303 | * from an overloaded cpu, if any. | 2307 | * from an overloaded CPU, if any. |
2304 | */ | 2308 | */ |
2305 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) | 2309 | if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) |
2306 | return; | 2310 | return; |
2307 | 2311 | ||
2308 | queue_pull_task(rq); | 2312 | deadline_queue_pull_task(rq); |
2309 | } | 2313 | } |
2310 | 2314 | ||
2311 | /* | 2315 | /* |
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) | |||
2327 | if (rq->curr != p) { | 2331 | if (rq->curr != p) { |
2328 | #ifdef CONFIG_SMP | 2332 | #ifdef CONFIG_SMP |
2329 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) | 2333 | if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) |
2330 | queue_push_tasks(rq); | 2334 | deadline_queue_push_tasks(rq); |
2331 | #endif | 2335 | #endif |
2332 | if (dl_task(rq->curr)) | 2336 | if (dl_task(rq->curr)) |
2333 | check_preempt_curr_dl(rq, p, 0); | 2337 | check_preempt_curr_dl(rq, p, 0); |
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, | |||
2352 | * or lowering its prio, so... | 2356 | * or lowering its prio, so... |
2353 | */ | 2357 | */ |
2354 | if (!rq->dl.overloaded) | 2358 | if (!rq->dl.overloaded) |
2355 | queue_pull_task(rq); | 2359 | deadline_queue_pull_task(rq); |
2356 | 2360 | ||
2357 | /* | 2361 | /* |
2358 | * If we now have a earlier deadline task than p, | 2362 | * If we now have a earlier deadline task than p, |
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p) | |||
2626 | { | 2630 | { |
2627 | struct sched_dl_entity *dl_se = &p->dl; | 2631 | struct sched_dl_entity *dl_se = &p->dl; |
2628 | 2632 | ||
2629 | dl_se->dl_runtime = 0; | 2633 | dl_se->dl_runtime = 0; |
2630 | dl_se->dl_deadline = 0; | 2634 | dl_se->dl_deadline = 0; |
2631 | dl_se->dl_period = 0; | 2635 | dl_se->dl_period = 0; |
2632 | dl_se->flags = 0; | 2636 | dl_se->flags = 0; |
2633 | dl_se->dl_bw = 0; | 2637 | dl_se->dl_bw = 0; |
2634 | dl_se->dl_density = 0; | 2638 | dl_se->dl_density = 0; |
2635 | 2639 | ||
2636 | dl_se->dl_throttled = 0; | 2640 | dl_se->dl_throttled = 0; |
2637 | dl_se->dl_yielded = 0; | 2641 | dl_se->dl_yielded = 0; |
2638 | dl_se->dl_non_contending = 0; | 2642 | dl_se->dl_non_contending = 0; |
2639 | dl_se->dl_overrun = 0; | 2643 | dl_se->dl_overrun = 0; |
2640 | } | 2644 | } |
2641 | 2645 | ||
2642 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | 2646 | bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) |
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) | |||
2655 | #ifdef CONFIG_SMP | 2659 | #ifdef CONFIG_SMP |
2656 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) | 2660 | int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) |
2657 | { | 2661 | { |
2658 | unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, | 2662 | unsigned int dest_cpu; |
2659 | cs_cpus_allowed); | ||
2660 | struct dl_bw *dl_b; | 2663 | struct dl_bw *dl_b; |
2661 | bool overflow; | 2664 | bool overflow; |
2662 | int cpus, ret; | 2665 | int cpus, ret; |
2663 | unsigned long flags; | 2666 | unsigned long flags; |
2664 | 2667 | ||
2668 | dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed); | ||
2669 | |||
2665 | rcu_read_lock_sched(); | 2670 | rcu_read_lock_sched(); |
2666 | dl_b = dl_bw_of(dest_cpu); | 2671 | dl_b = dl_bw_of(dest_cpu); |
2667 | raw_spin_lock_irqsave(&dl_b->lock, flags); | 2672 | raw_spin_lock_irqsave(&dl_b->lock, flags); |
2668 | cpus = dl_bw_cpus(dest_cpu); | 2673 | cpus = dl_bw_cpus(dest_cpu); |
2669 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); | 2674 | overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); |
2670 | if (overflow) | 2675 | if (overflow) { |
2671 | ret = -EBUSY; | 2676 | ret = -EBUSY; |
2672 | else { | 2677 | } else { |
2673 | /* | 2678 | /* |
2674 | * We reserve space for this task in the destination | 2679 | * We reserve space for this task in the destination |
2675 | * root_domain, as we can't fail after this point. | 2680 | * root_domain, as we can't fail after this point. |
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo | |||
2681 | } | 2686 | } |
2682 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2687 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2683 | rcu_read_unlock_sched(); | 2688 | rcu_read_unlock_sched(); |
2689 | |||
2684 | return ret; | 2690 | return ret; |
2685 | } | 2691 | } |
2686 | 2692 | ||
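The hunk above only reshuffles dl_task_can_attach(); the decision it wraps is still the bandwidth overflow test, which admits the task only if its runtime/period utilization fits under the per-CPU cap summed across the destination root domain. A rough standalone sketch of that test (the fixed-point shift, the to_ratio() form and the 95% cap are illustrative assumptions, not read from this diff):

#include <stdbool.h>
#include <stdio.h>

#define BW_SHIFT 20
#define BW_UNIT  (1ULL << BW_SHIFT)

static unsigned long long to_ratio(unsigned long long period, unsigned long long runtime)
{
	return (runtime << BW_SHIFT) / period;   /* utilization in fixed point */
}

static bool dl_overflow(unsigned long long cap_per_cpu, int cpus,
			unsigned long long total_bw, unsigned long long new_bw)
{
	/* Overflow (the -EBUSY case above) when the summed cap cannot absorb it. */
	return cap_per_cpu * cpus < total_bw + new_bw;
}

int main(void)
{
	unsigned long long cap    = to_ratio(100, 95); /* e.g. 95% cap per CPU   */
	unsigned long long total  = to_ratio(10, 25);  /* existing: 2.5 CPUs used */
	unsigned long long new_bw = to_ratio(100, 30); /* new task: 30% of a CPU */

	printf("admit on 4 CPUs: %s\n",
	       dl_overflow(cap, 4, total, new_bw) ? "no (-EBUSY)" : "yes");
	return 0;
}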
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | |||
2701 | ret = 0; | 2707 | ret = 0; |
2702 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); | 2708 | raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); |
2703 | rcu_read_unlock_sched(); | 2709 | rcu_read_unlock_sched(); |
2710 | |||
2704 | return ret; | 2711 | return ret; |
2705 | } | 2712 | } |
2706 | 2713 | ||
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu) | |||
2718 | overflow = __dl_overflow(dl_b, cpus, 0, 0); | 2725 | overflow = __dl_overflow(dl_b, cpus, 0, 0); |
2719 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); | 2726 | raw_spin_unlock_irqrestore(&dl_b->lock, flags); |
2720 | rcu_read_unlock_sched(); | 2727 | rcu_read_unlock_sched(); |
2728 | |||
2721 | return overflow; | 2729 | return overflow; |
2722 | } | 2730 | } |
2723 | #endif | 2731 | #endif |
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 72c401b3b15c..99e825b76633 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/sched/debug.c | 2 | * kernel/sched/debug.c |
3 | * | 3 | * |
4 | * Print the CFS rbtree | 4 | * Print the CFS rbtree and other debugging details |
5 | * | 5 | * |
6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar | 6 | * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar |
7 | * | 7 | * |
@@ -9,16 +9,6 @@ | |||
9 | * it under the terms of the GNU General Public License version 2 as | 9 | * it under the terms of the GNU General Public License version 2 as |
10 | * published by the Free Software Foundation. | 10 | * published by the Free Software Foundation. |
11 | */ | 11 | */ |
12 | |||
13 | #include <linux/proc_fs.h> | ||
14 | #include <linux/sched/mm.h> | ||
15 | #include <linux/sched/task.h> | ||
16 | #include <linux/seq_file.h> | ||
17 | #include <linux/kallsyms.h> | ||
18 | #include <linux/utsname.h> | ||
19 | #include <linux/mempolicy.h> | ||
20 | #include <linux/debugfs.h> | ||
21 | |||
22 | #include "sched.h" | 12 | #include "sched.h" |
23 | 13 | ||
24 | static DEFINE_SPINLOCK(sched_debug_lock); | 14 | static DEFINE_SPINLOCK(sched_debug_lock); |
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
274 | if (table == NULL) | 264 | if (table == NULL) |
275 | return NULL; | 265 | return NULL; |
276 | 266 | ||
277 | set_table_entry(&table[0], "min_interval", &sd->min_interval, | 267 | set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
278 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 268 | set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false); |
279 | set_table_entry(&table[1], "max_interval", &sd->max_interval, | 269 | set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
280 | sizeof(long), 0644, proc_doulongvec_minmax, false); | 270 | set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
281 | set_table_entry(&table[2], "busy_idx", &sd->busy_idx, | 271 | set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
282 | sizeof(int), 0644, proc_dointvec_minmax, true); | 272 | set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
283 | set_table_entry(&table[3], "idle_idx", &sd->idle_idx, | 273 | set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true ); |
284 | sizeof(int), 0644, proc_dointvec_minmax, true); | 274 | set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false); |
285 | set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, | 275 | set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false); |
286 | sizeof(int), 0644, proc_dointvec_minmax, true); | 276 | set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false); |
287 | set_table_entry(&table[5], "wake_idx", &sd->wake_idx, | 277 | set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false); |
288 | sizeof(int), 0644, proc_dointvec_minmax, true); | 278 | set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false); |
289 | set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, | 279 | set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false); |
290 | sizeof(int), 0644, proc_dointvec_minmax, true); | ||
291 | set_table_entry(&table[7], "busy_factor", &sd->busy_factor, | ||
292 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
293 | set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, | ||
294 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
295 | set_table_entry(&table[9], "cache_nice_tries", | ||
296 | &sd->cache_nice_tries, | ||
297 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
298 | set_table_entry(&table[10], "flags", &sd->flags, | ||
299 | sizeof(int), 0644, proc_dointvec_minmax, false); | ||
300 | set_table_entry(&table[11], "max_newidle_lb_cost", | ||
301 | &sd->max_newidle_lb_cost, | ||
302 | sizeof(long), 0644, proc_doulongvec_minmax, false); | ||
303 | set_table_entry(&table[12], "name", sd->name, | ||
304 | CORENAME_MAX_SIZE, 0444, proc_dostring, false); | ||
305 | /* &table[13] is terminator */ | 280 | /* &table[13] is terminator */ |
306 | 281 | ||
307 | return table; | 282 | return table; |
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) | |||
332 | return table; | 307 | return table; |
333 | } | 308 | } |
334 | 309 | ||
335 | static cpumask_var_t sd_sysctl_cpus; | 310 | static cpumask_var_t sd_sysctl_cpus; |
336 | static struct ctl_table_header *sd_sysctl_header; | 311 | static struct ctl_table_header *sd_sysctl_header; |
337 | 312 | ||
338 | void register_sched_domain_sysctl(void) | 313 | void register_sched_domain_sysctl(void) |
339 | { | 314 | { |
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
413 | { | 388 | { |
414 | struct sched_entity *se = tg->se[cpu]; | 389 | struct sched_entity *se = tg->se[cpu]; |
415 | 390 | ||
416 | #define P(F) \ | 391 | #define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) |
417 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) | 392 | #define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) |
418 | #define P_SCHEDSTAT(F) \ | 393 | #define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) |
419 | SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) | 394 | #define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) |
420 | #define PN(F) \ | ||
421 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
422 | #define PN_SCHEDSTAT(F) \ | ||
423 | SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F))) | ||
424 | 395 | ||
425 | if (!se) | 396 | if (!se) |
426 | return; | 397 | return; |
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
428 | PN(se->exec_start); | 399 | PN(se->exec_start); |
429 | PN(se->vruntime); | 400 | PN(se->vruntime); |
430 | PN(se->sum_exec_runtime); | 401 | PN(se->sum_exec_runtime); |
402 | |||
431 | if (schedstat_enabled()) { | 403 | if (schedstat_enabled()) { |
432 | PN_SCHEDSTAT(se->statistics.wait_start); | 404 | PN_SCHEDSTAT(se->statistics.wait_start); |
433 | PN_SCHEDSTAT(se->statistics.sleep_start); | 405 | PN_SCHEDSTAT(se->statistics.sleep_start); |
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group | |||
440 | PN_SCHEDSTAT(se->statistics.wait_sum); | 412 | PN_SCHEDSTAT(se->statistics.wait_sum); |
441 | P_SCHEDSTAT(se->statistics.wait_count); | 413 | P_SCHEDSTAT(se->statistics.wait_count); |
442 | } | 414 | } |
415 | |||
443 | P(se->load.weight); | 416 | P(se->load.weight); |
444 | P(se->runnable_weight); | 417 | P(se->runnable_weight); |
445 | #ifdef CONFIG_SMP | 418 | #ifdef CONFIG_SMP |
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg) | |||
464 | return group_path; | 437 | return group_path; |
465 | 438 | ||
466 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); | 439 | cgroup_path(tg->css.cgroup, group_path, PATH_MAX); |
440 | |||
467 | return group_path; | 441 | return group_path; |
468 | } | 442 | } |
469 | #endif | 443 | #endif |
@@ -804,9 +778,9 @@ void sysrq_sched_debug_show(void) | |||
804 | /* | 778 | /* |
805 | * This itererator needs some explanation. | 779 | * This itererator needs some explanation. |
806 | * It returns 1 for the header position. | 780 | * It returns 1 for the header position. |
807 | * This means 2 is cpu 0. | 781 | * This means 2 is CPU 0. |
808 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 782 | * In a hotplugged system some CPUs, including CPU 0, may be missing so we have |
809 | * to use cpumask_* to iterate over the cpus. | 783 | * to use cpumask_* to iterate over the CPUs. |
810 | */ | 784 | */ |
811 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) | 785 | static void *sched_debug_start(struct seq_file *file, loff_t *offset) |
812 | { | 786 | { |
@@ -826,6 +800,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset) | |||
826 | 800 | ||
827 | if (n < nr_cpu_ids) | 801 | if (n < nr_cpu_ids) |
828 | return (void *)(unsigned long)(n + 2); | 802 | return (void *)(unsigned long)(n + 2); |
803 | |||
829 | return NULL; | 804 | return NULL; |
830 | } | 805 | } |
831 | 806 | ||
@@ -840,10 +815,10 @@ static void sched_debug_stop(struct seq_file *file, void *data) | |||
840 | } | 815 | } |
841 | 816 | ||
842 | static const struct seq_operations sched_debug_sops = { | 817 | static const struct seq_operations sched_debug_sops = { |
843 | .start = sched_debug_start, | 818 | .start = sched_debug_start, |
844 | .next = sched_debug_next, | 819 | .next = sched_debug_next, |
845 | .stop = sched_debug_stop, | 820 | .stop = sched_debug_stop, |
846 | .show = sched_debug_show, | 821 | .show = sched_debug_show, |
847 | }; | 822 | }; |
848 | 823 | ||
849 | static int sched_debug_release(struct inode *inode, struct file *file) | 824 | static int sched_debug_release(struct inode *inode, struct file *file) |
@@ -881,14 +856,10 @@ static int __init init_sched_debug_procfs(void) | |||
881 | 856 | ||
882 | __initcall(init_sched_debug_procfs); | 857 | __initcall(init_sched_debug_procfs); |
883 | 858 | ||
884 | #define __P(F) \ | 859 | #define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) |
885 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) | 860 | #define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) |
886 | #define P(F) \ | 861 | #define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) |
887 | SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) | 862 | #define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) |
888 | #define __PN(F) \ | ||
889 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F)) | ||
890 | #define PN(F) \ | ||
891 | SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F)) | ||
892 | 863 | ||
893 | 864 | ||
894 | #ifdef CONFIG_NUMA_BALANCING | 865 | #ifdef CONFIG_NUMA_BALANCING |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5eb3ffc9be84..f5591071ae98 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -20,25 +20,10 @@ | |||
20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra | 20 | * Adaptive scheduling granularity, math enhancements by Peter Zijlstra |
21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra | 21 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra |
22 | */ | 22 | */ |
23 | 23 | #include "sched.h" | |
24 | #include <linux/sched/mm.h> | ||
25 | #include <linux/sched/topology.h> | ||
26 | |||
27 | #include <linux/latencytop.h> | ||
28 | #include <linux/cpumask.h> | ||
29 | #include <linux/cpuidle.h> | ||
30 | #include <linux/slab.h> | ||
31 | #include <linux/profile.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/mempolicy.h> | ||
34 | #include <linux/migrate.h> | ||
35 | #include <linux/task_work.h> | ||
36 | #include <linux/sched/isolation.h> | ||
37 | 24 | ||
38 | #include <trace/events/sched.h> | 25 | #include <trace/events/sched.h> |
39 | 26 | ||
40 | #include "sched.h" | ||
41 | |||
42 | /* | 27 | /* |
43 | * Targeted preemption latency for CPU-bound tasks: | 28 | * Targeted preemption latency for CPU-bound tasks: |
44 | * | 29 | * |
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | |||
103 | 88 | ||
104 | #ifdef CONFIG_SMP | 89 | #ifdef CONFIG_SMP |
105 | /* | 90 | /* |
106 | * For asym packing, by default the lower numbered cpu has higher priority. | 91 | * For asym packing, by default the lower numbered CPU has higher priority. |
107 | */ | 92 | */ |
108 | int __weak arch_asym_cpu_priority(int cpu) | 93 | int __weak arch_asym_cpu_priority(int cpu) |
109 | { | 94 | { |
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p) | |||
1181 | } | 1166 | } |
1182 | 1167 | ||
1183 | /* | 1168 | /* |
1184 | * The averaged statistics, shared & private, memory & cpu, | 1169 | * The averaged statistics, shared & private, memory & CPU, |
1185 | * occupy the first half of the array. The second half of the | 1170 | * occupy the first half of the array. The second half of the |
1186 | * array is for current counters, which are averaged into the | 1171 | * array is for current counters, which are averaged into the |
1187 | * first set by task_numa_placement. | 1172 | * first set by task_numa_placement. |
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1587 | * be incurred if the tasks were swapped. | 1572 | * be incurred if the tasks were swapped. |
1588 | */ | 1573 | */ |
1589 | if (cur) { | 1574 | if (cur) { |
1590 | /* Skip this swap candidate if cannot move to the source cpu */ | 1575 | /* Skip this swap candidate if cannot move to the source CPU: */ |
1591 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) | 1576 | if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) |
1592 | goto unlock; | 1577 | goto unlock; |
1593 | 1578 | ||
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1631 | goto balance; | 1616 | goto balance; |
1632 | } | 1617 | } |
1633 | 1618 | ||
1634 | /* Balance doesn't matter much if we're running a task per cpu */ | 1619 | /* Balance doesn't matter much if we're running a task per CPU: */ |
1635 | if (imp > env->best_imp && src_rq->nr_running == 1 && | 1620 | if (imp > env->best_imp && src_rq->nr_running == 1 && |
1636 | dst_rq->nr_running == 1) | 1621 | dst_rq->nr_running == 1) |
1637 | goto assign; | 1622 | goto assign; |
@@ -1676,7 +1661,7 @@ balance: | |||
1676 | */ | 1661 | */ |
1677 | if (!cur) { | 1662 | if (!cur) { |
1678 | /* | 1663 | /* |
1679 | * select_idle_siblings() uses an per-cpu cpumask that | 1664 | * select_idle_siblings() uses a per-CPU cpumask that |
1680 | * can be used from IRQ context. | 1665 | * can be used from IRQ context. |
1681 | */ | 1666 | */ |
1682 | local_irq_disable(); | 1667 | local_irq_disable(); |
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p) | |||
1869 | static void numa_migrate_preferred(struct task_struct *p) | 1854 | static void numa_migrate_preferred(struct task_struct *p) |
1870 | { | 1855 | { |
1871 | unsigned long interval = HZ; | 1856 | unsigned long interval = HZ; |
1857 | unsigned long numa_migrate_retry; | ||
1872 | 1858 | ||
1873 | /* This task has no NUMA fault statistics yet */ | 1859 | /* This task has no NUMA fault statistics yet */ |
1874 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) | 1860 | if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) |
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p) | |||
1876 | 1862 | ||
1877 | /* Periodically retry migrating the task to the preferred node */ | 1863 | /* Periodically retry migrating the task to the preferred node */ |
1878 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); | 1864 | interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); |
1879 | p->numa_migrate_retry = jiffies + interval; | 1865 | numa_migrate_retry = jiffies + interval; |
1866 | |||
1867 | /* | ||
1868 | * Check that the new retry threshold is after the current one. If | ||
1869 | * the retry is in the future, it implies that wake_affine has | ||
1870 | * temporarily asked NUMA balancing to backoff from placement. | ||
1871 | */ | ||
1872 | if (numa_migrate_retry > p->numa_migrate_retry) | ||
1873 | return; | ||
1874 | |||
1875 | /* Safe to try placing the task on the preferred node */ | ||
1876 | p->numa_migrate_retry = numa_migrate_retry; | ||
1880 | 1877 | ||
1881 | /* Success if task is already running on preferred CPU */ | 1878 | /* Success if task is already running on preferred CPU */ |
1882 | if (task_node(p) == p->numa_preferred_nid) | 1879 | if (task_node(p) == p->numa_preferred_nid) |
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio) | |||
2823 | } | 2820 | } |
2824 | 2821 | ||
2825 | #ifdef CONFIG_FAIR_GROUP_SCHED | 2822 | #ifdef CONFIG_FAIR_GROUP_SCHED |
2826 | # ifdef CONFIG_SMP | 2823 | #ifdef CONFIG_SMP |
2827 | /* | 2824 | /* |
2828 | * All this does is approximate the hierarchical proportion which includes that | 2825 | * All this does is approximate the hierarchical proportion which includes that |
2829 | * global sum we all love to hate. | 2826 | * global sum we all love to hate. |
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares) | |||
2974 | 2971 | ||
2975 | return clamp_t(long, runnable, MIN_SHARES, shares); | 2972 | return clamp_t(long, runnable, MIN_SHARES, shares); |
2976 | } | 2973 | } |
2977 | # endif /* CONFIG_SMP */ | 2974 | #endif /* CONFIG_SMP */ |
2978 | 2975 | ||
2979 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); | 2976 | static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); |
2980 | 2977 | ||
@@ -3350,7 +3347,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) | |||
3350 | } | 3347 | } |
3351 | 3348 | ||
3352 | /* | 3349 | /* |
3353 | * Called within set_task_rq() right before setting a task's cpu. The | 3350 | * Called within set_task_rq() right before setting a task's CPU. The |
3354 | * caller only guarantees p->pi_lock is held; no other assumptions, | 3351 | * caller only guarantees p->pi_lock is held; no other assumptions, |
3355 | * including the state of rq->lock, should be made. | 3352 | * including the state of rq->lock, should be made. |
3356 | */ | 3353 | */ |
@@ -3529,7 +3526,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf | |||
3529 | 3526 | ||
3530 | /* | 3527 | /* |
3531 | * runnable_sum can't be lower than running_sum | 3528 | * runnable_sum can't be lower than running_sum |
3532 | * As running sum is scale with cpu capacity wehreas the runnable sum | 3529 | * As running sum is scaled with CPU capacity whereas the runnable sum |
3533 | * is not we rescale running_sum 1st | 3530 | * is not we rescale running_sum 1st |
3534 | */ | 3531 | */ |
3535 | running_sum = se->avg.util_sum / | 3532 | running_sum = se->avg.util_sum / |
@@ -4676,7 +4673,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) | |||
4676 | if (!se) | 4673 | if (!se) |
4677 | add_nr_running(rq, task_delta); | 4674 | add_nr_running(rq, task_delta); |
4678 | 4675 | ||
4679 | /* determine whether we need to wake up potentially idle cpu */ | 4676 | /* Determine whether we need to wake up potentially idle CPU: */ |
4680 | if (rq->curr == rq->idle && rq->cfs.nr_running) | 4677 | if (rq->curr == rq->idle && rq->cfs.nr_running) |
4681 | resched_curr(rq); | 4678 | resched_curr(rq); |
4682 | } | 4679 | } |
@@ -5041,7 +5038,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | |||
5041 | } | 5038 | } |
5042 | 5039 | ||
5043 | /* | 5040 | /* |
5044 | * Both these cpu hotplug callbacks race against unregister_fair_sched_group() | 5041 | * Both these CPU hotplug callbacks race against unregister_fair_sched_group() |
5045 | * | 5042 | * |
5046 | * The race is harmless, since modifying bandwidth settings of unhooked group | 5043 | * The race is harmless, since modifying bandwidth settings of unhooked group |
5047 | * bits doesn't do much. | 5044 | * bits doesn't do much. |
@@ -5086,7 +5083,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) | |||
5086 | */ | 5083 | */ |
5087 | cfs_rq->runtime_remaining = 1; | 5084 | cfs_rq->runtime_remaining = 1; |
5088 | /* | 5085 | /* |
5089 | * Offline rq is schedulable till cpu is completely disabled | 5086 | * Offline rq is schedulable till CPU is completely disabled |
5090 | * in take_cpu_down(), so we prevent new cfs throttling here. | 5087 | * in take_cpu_down(), so we prevent new cfs throttling here. |
5091 | */ | 5088 | */ |
5092 | cfs_rq->runtime_enabled = 0; | 5089 | cfs_rq->runtime_enabled = 0; |
@@ -5323,8 +5320,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask); | |||
5323 | * | 5320 | * |
5324 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load | 5321 | * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load |
5325 | * | 5322 | * |
5326 | * If a cpu misses updates for n ticks (as it was idle) and update gets | 5323 | * If a CPU misses updates for n ticks (as it was idle) and update gets |
5327 | * called on the n+1-th tick when cpu may be busy, then we have: | 5324 | * called on the n+1-th tick when CPU may be busy, then we have: |
5328 | * | 5325 | * |
5329 | * load_n = (1 - 1/2^i)^n * load_0 | 5326 | * load_n = (1 - 1/2^i)^n * load_0 |
5330 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load | 5327 | * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load |
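The decay formulas quoted in this comment can be checked numerically. A small sketch in floating point for readability (the kernel does the same thing in fixed point; here idx plays the role of i, and the closed form covers the n ticks missed while idle):

#include <math.h>
#include <stdio.h>

static double decay_step(double load, double cur, int idx)
{
	double f = 1.0 / (double)(1 << idx);   /* 1/2^i */
	return (1.0 - f) * load + f * cur;     /* load' */
}

int main(void)
{
	int idx = 2, missed = 8;
	double load0 = 1024.0, cur = 512.0;

	/* Iterate the per-tick recurrence with cur_load == 0 while idle ... */
	double iter = load0;
	for (int n = 0; n < missed; n++)
		iter = decay_step(iter, 0.0, idx);

	/* ... which matches the closed form load_n = (1 - 1/2^i)^n * load_0. */
	double closed = pow(1.0 - 1.0 / (1 << idx), missed) * load0;

	printf("after %d idle ticks: iterated=%.3f closed=%.3f\n",
	       missed, iter, closed);

	/* The n+1-th (busy) tick then folds the current load in once. */
	printf("n+1-th tick: %.3f\n", decay_step(closed, cur, idx));
	return 0;
}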
@@ -5468,7 +5465,7 @@ static unsigned long weighted_cpuload(struct rq *rq) | |||
5468 | #ifdef CONFIG_NO_HZ_COMMON | 5465 | #ifdef CONFIG_NO_HZ_COMMON |
5469 | /* | 5466 | /* |
5470 | * There is no sane way to deal with nohz on smp when using jiffies because the | 5467 | * There is no sane way to deal with nohz on smp when using jiffies because the |
5471 | * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading | 5468 | * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading |
5472 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. | 5469 | * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. |
5473 | * | 5470 | * |
5474 | * Therefore we need to avoid the delta approach from the regular tick when | 5471 | * Therefore we need to avoid the delta approach from the regular tick when |
@@ -5579,7 +5576,7 @@ void cpu_load_update_active(struct rq *this_rq) | |||
5579 | } | 5576 | } |
5580 | 5577 | ||
5581 | /* | 5578 | /* |
5582 | * Return a low guess at the load of a migration-source cpu weighted | 5579 | * Return a low guess at the load of a migration-source CPU weighted |
5583 | * according to the scheduling class and "nice" value. | 5580 | * according to the scheduling class and "nice" value. |
5584 | * | 5581 | * |
5585 | * We want to under-estimate the load of migration sources, to | 5582 | * We want to under-estimate the load of migration sources, to |
@@ -5597,7 +5594,7 @@ static unsigned long source_load(int cpu, int type) | |||
5597 | } | 5594 | } |
5598 | 5595 | ||
5599 | /* | 5596 | /* |
5600 | * Return a high guess at the load of a migration-target cpu weighted | 5597 | * Return a high guess at the load of a migration-target CPU weighted |
5601 | * according to the scheduling class and "nice" value. | 5598 | * according to the scheduling class and "nice" value. |
5602 | */ | 5599 | */ |
5603 | static unsigned long target_load(int cpu, int type) | 5600 | static unsigned long target_load(int cpu, int type) |
@@ -5724,7 +5721,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5724 | unsigned long task_load; | 5721 | unsigned long task_load; |
5725 | 5722 | ||
5726 | this_eff_load = target_load(this_cpu, sd->wake_idx); | 5723 | this_eff_load = target_load(this_cpu, sd->wake_idx); |
5727 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5728 | 5724 | ||
5729 | if (sync) { | 5725 | if (sync) { |
5730 | unsigned long current_load = task_h_load(current); | 5726 | unsigned long current_load = task_h_load(current); |
@@ -5742,18 +5738,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p, | |||
5742 | this_eff_load *= 100; | 5738 | this_eff_load *= 100; |
5743 | this_eff_load *= capacity_of(prev_cpu); | 5739 | this_eff_load *= capacity_of(prev_cpu); |
5744 | 5740 | ||
5741 | prev_eff_load = source_load(prev_cpu, sd->wake_idx); | ||
5745 | prev_eff_load -= task_load; | 5742 | prev_eff_load -= task_load; |
5746 | if (sched_feat(WA_BIAS)) | 5743 | if (sched_feat(WA_BIAS)) |
5747 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; | 5744 | prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; |
5748 | prev_eff_load *= capacity_of(this_cpu); | 5745 | prev_eff_load *= capacity_of(this_cpu); |
5749 | 5746 | ||
5750 | return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; | 5747 | /* |
5748 | * If sync, adjust the weight of prev_eff_load such that if | ||
5749 | * prev_eff == this_eff that select_idle_sibling() will consider | ||
5750 | * stacking the wakee on top of the waker if no other CPU is | ||
5751 | * idle. | ||
5752 | */ | ||
5753 | if (sync) | ||
5754 | prev_eff_load += 1; | ||
5755 | |||
5756 | return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits; | ||
5757 | } | ||
5758 | |||
5759 | #ifdef CONFIG_NUMA_BALANCING | ||
5760 | static void | ||
5761 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5762 | { | ||
5763 | unsigned long interval; | ||
5764 | |||
5765 | if (!static_branch_likely(&sched_numa_balancing)) | ||
5766 | return; | ||
5767 | |||
5768 | /* If balancing has no preference then continue gathering data */ | ||
5769 | if (p->numa_preferred_nid == -1) | ||
5770 | return; | ||
5771 | |||
5772 | /* | ||
5773 | * If the wakeup is not affecting locality then it is neutral from | ||
5774 | * the perspective of NUMA balancing so continue gathering data. | ||
5775 | */ | ||
5776 | if (cpu_to_node(prev_cpu) == cpu_to_node(target)) | ||
5777 | return; | ||
5778 | |||
5779 | /* | ||
5780 | * Temporarily prevent NUMA balancing trying to place waker/wakee after | ||
5781 | * wakee has been moved by wake_affine. This will potentially allow | ||
5782 | * related tasks to converge and update their data placement. The | ||
5783 | * 4 * numa_scan_period is to allow the two-pass filter to migrate | ||
5784 | * hot data to the waker's node. | ||
5785 | */ | ||
5786 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5787 | p->numa_scan_period << 2); | ||
5788 | p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5789 | |||
5790 | interval = max(sysctl_numa_balancing_scan_delay, | ||
5791 | current->numa_scan_period << 2); | ||
5792 | current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval); | ||
5793 | } | ||
5794 | #else | ||
5795 | static void | ||
5796 | update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target) | ||
5797 | { | ||
5751 | } | 5798 | } |
5799 | #endif | ||
5752 | 5800 | ||
5753 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, | 5801 | static int wake_affine(struct sched_domain *sd, struct task_struct *p, |
5754 | int prev_cpu, int sync) | 5802 | int this_cpu, int prev_cpu, int sync) |
5755 | { | 5803 | { |
5756 | int this_cpu = smp_processor_id(); | ||
5757 | int target = nr_cpumask_bits; | 5804 | int target = nr_cpumask_bits; |
5758 | 5805 | ||
5759 | if (sched_feat(WA_IDLE)) | 5806 | if (sched_feat(WA_IDLE)) |
@@ -5766,6 +5813,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, | |||
5766 | if (target == nr_cpumask_bits) | 5813 | if (target == nr_cpumask_bits) |
5767 | return prev_cpu; | 5814 | return prev_cpu; |
5768 | 5815 | ||
5816 | update_wa_numa_placement(p, prev_cpu, target); | ||
5769 | schedstat_inc(sd->ttwu_move_affine); | 5817 | schedstat_inc(sd->ttwu_move_affine); |
5770 | schedstat_inc(p->se.statistics.nr_wakeups_affine); | 5818 | schedstat_inc(p->se.statistics.nr_wakeups_affine); |
5771 | return target; | 5819 | return target; |
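After this hunk, wake_affine_weight() reads prev_eff_load only after the sync adjustment, biases it by imbalance_pct, and gives sync wakeups a one-unit tie-break so the wakee may stack on the waker when nothing else is idle. A simplified userspace rendering of that comparison (all inputs are made-up numbers and the helper name is invented; a sketch, not the kernel code):

#include <stdio.h>

static int waker_cpu_wins(unsigned long this_load, unsigned long prev_load,
			  unsigned long task_load, unsigned long cap_this,
			  unsigned long cap_prev, unsigned int imbalance_pct,
			  int sync, unsigned long waker_load)
{
	unsigned long this_eff, prev_eff;

	this_eff = this_load;
	if (sync && this_eff >= waker_load)
		this_eff -= waker_load;          /* the waker is about to sleep */
	this_eff += task_load;                   /* the wakee would land here   */
	this_eff *= 100;
	this_eff *= cap_prev;

	prev_eff  = prev_load - task_load;       /* the wakee's load leaves prev */
	prev_eff *= 100 + (imbalance_pct - 100) / 2;
	prev_eff *= cap_this;
	if (sync)
		prev_eff += 1;                   /* allow stacking on the waker on a tie */

	return this_eff < prev_eff;
}

int main(void)
{
	/* Equal loads and capacities: only the sync wakeup picks the waker's CPU. */
	printf("sync=0 -> %s\n",
	       waker_cpu_wins(400, 400, 100, 1024, 1024, 117, 0, 300) ? "this_cpu" : "prev_cpu");
	printf("sync=1 -> %s\n",
	       waker_cpu_wins(400, 400, 100, 1024, 1024, 117, 1, 300) ? "this_cpu" : "prev_cpu");
	return 0;
}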
@@ -5826,7 +5874,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5826 | max_spare_cap = 0; | 5874 | max_spare_cap = 0; |
5827 | 5875 | ||
5828 | for_each_cpu(i, sched_group_span(group)) { | 5876 | for_each_cpu(i, sched_group_span(group)) { |
5829 | /* Bias balancing toward cpus of our domain */ | 5877 | /* Bias balancing toward CPUs of our domain */ |
5830 | if (local_group) | 5878 | if (local_group) |
5831 | load = source_load(i, load_idx); | 5879 | load = source_load(i, load_idx); |
5832 | else | 5880 | else |
@@ -5856,7 +5904,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5856 | if (min_runnable_load > (runnable_load + imbalance)) { | 5904 | if (min_runnable_load > (runnable_load + imbalance)) { |
5857 | /* | 5905 | /* |
5858 | * The runnable load is significantly smaller | 5906 | * The runnable load is significantly smaller |
5859 | * so we can pick this new cpu | 5907 | * so we can pick this new CPU: |
5860 | */ | 5908 | */ |
5861 | min_runnable_load = runnable_load; | 5909 | min_runnable_load = runnable_load; |
5862 | min_avg_load = avg_load; | 5910 | min_avg_load = avg_load; |
@@ -5865,7 +5913,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, | |||
5865 | (100*min_avg_load > imbalance_scale*avg_load)) { | 5913 | (100*min_avg_load > imbalance_scale*avg_load)) { |
5866 | /* | 5914 | /* |
5867 | * The runnable loads are close so take the | 5915 | * The runnable loads are close so take the |
5868 | * blocked load into account through avg_load. | 5916 | * blocked load into account through avg_load: |
5869 | */ | 5917 | */ |
5870 | min_avg_load = avg_load; | 5918 | min_avg_load = avg_load; |
5871 | idlest = group; | 5919 | idlest = group; |
@@ -5903,6 +5951,18 @@ skip_spare: | |||
5903 | if (!idlest) | 5951 | if (!idlest) |
5904 | return NULL; | 5952 | return NULL; |
5905 | 5953 | ||
5954 | /* | ||
5955 | * When comparing groups across NUMA domains, it's possible for the | ||
5956 | * local domain to be very lightly loaded relative to the remote | ||
5957 | * domains but "imbalance" skews the comparison making remote CPUs | ||
5958 | * look much more favourable. When considering cross-domain, add | ||
5959 | * imbalance to the runnable load on the remote node and consider | ||
5960 | * staying local. | ||
5961 | */ | ||
5962 | if ((sd->flags & SD_NUMA) && | ||
5963 | min_runnable_load + imbalance >= this_runnable_load) | ||
5964 | return NULL; | ||
5965 | |||
5906 | if (min_runnable_load > (this_runnable_load + imbalance)) | 5966 | if (min_runnable_load > (this_runnable_load + imbalance)) |
5907 | return NULL; | 5967 | return NULL; |
5908 | 5968 | ||
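The new SD_NUMA check above, combined with the pre-existing min_runnable_load threshold, amounts to a "stay local unless the remote group is clearly better" rule: returning NULL makes the caller keep the local group. In plain form (illustrative numbers only):

#include <stdbool.h>
#include <stdio.h>

static bool stay_local(unsigned long min_runnable_load,
		       unsigned long this_runnable_load,
		       unsigned long imbalance, bool numa_domain)
{
	/* New: cross-node placement must beat the local group by a clear margin. */
	if (numa_domain && min_runnable_load + imbalance >= this_runnable_load)
		return true;

	/* Pre-existing threshold, applies within a node as well. */
	if (min_runnable_load > this_runnable_load + imbalance)
		return true;

	return false;
}

int main(void)
{
	/* Remote group only slightly less loaded: stay on the local node. */
	printf("%d\n", stay_local(900, 1000, 200, true));   /* prints 1 */
	/* Remote group much less loaded: worth spilling across nodes.    */
	printf("%d\n", stay_local(300, 1000, 200, true));   /* prints 0 */
	return 0;
}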
@@ -5914,7 +5974,7 @@ skip_spare: | |||
5914 | } | 5974 | } |
5915 | 5975 | ||
5916 | /* | 5976 | /* |
5917 | * find_idlest_group_cpu - find the idlest cpu among the cpus in group. | 5977 | * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group. |
5918 | */ | 5978 | */ |
5919 | static int | 5979 | static int |
5920 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | 5980 | find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) |
@@ -5992,12 +6052,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
5992 | 6052 | ||
5993 | new_cpu = find_idlest_group_cpu(group, p, cpu); | 6053 | new_cpu = find_idlest_group_cpu(group, p, cpu); |
5994 | if (new_cpu == cpu) { | 6054 | if (new_cpu == cpu) { |
5995 | /* Now try balancing at a lower domain level of cpu */ | 6055 | /* Now try balancing at a lower domain level of 'cpu': */ |
5996 | sd = sd->child; | 6056 | sd = sd->child; |
5997 | continue; | 6057 | continue; |
5998 | } | 6058 | } |
5999 | 6059 | ||
6000 | /* Now try balancing at a lower domain level of new_cpu */ | 6060 | /* Now try balancing at a lower domain level of 'new_cpu': */ |
6001 | cpu = new_cpu; | 6061 | cpu = new_cpu; |
6002 | weight = sd->span_weight; | 6062 | weight = sd->span_weight; |
6003 | sd = NULL; | 6063 | sd = NULL; |
@@ -6007,7 +6067,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p | |||
6007 | if (tmp->flags & sd_flag) | 6067 | if (tmp->flags & sd_flag) |
6008 | sd = tmp; | 6068 | sd = tmp; |
6009 | } | 6069 | } |
6010 | /* while loop will break here if sd == NULL */ | ||
6011 | } | 6070 | } |
6012 | 6071 | ||
6013 | return new_cpu; | 6072 | return new_cpu; |
@@ -6203,12 +6262,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6203 | return target; | 6262 | return target; |
6204 | 6263 | ||
6205 | /* | 6264 | /* |
6206 | * If the previous cpu is cache affine and idle, don't be stupid. | 6265 | * If the previous CPU is cache affine and idle, don't be stupid: |
6207 | */ | 6266 | */ |
6208 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) | 6267 | if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) |
6209 | return prev; | 6268 | return prev; |
6210 | 6269 | ||
6211 | /* Check a recently used CPU as a potential idle candidate */ | 6270 | /* Check a recently used CPU as a potential idle candidate: */ |
6212 | recent_used_cpu = p->recent_used_cpu; | 6271 | recent_used_cpu = p->recent_used_cpu; |
6213 | if (recent_used_cpu != prev && | 6272 | if (recent_used_cpu != prev && |
6214 | recent_used_cpu != target && | 6273 | recent_used_cpu != target && |
@@ -6217,7 +6276,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) | |||
6217 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { | 6276 | cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { |
6218 | /* | 6277 | /* |
6219 | * Replace recent_used_cpu with prev as it is a potential | 6278 | * Replace recent_used_cpu with prev as it is a potential |
6220 | * candidate for the next wake. | 6279 | * candidate for the next wake: |
6221 | */ | 6280 | */ |
6222 | p->recent_used_cpu = prev; | 6281 | p->recent_used_cpu = prev; |
6223 | return recent_used_cpu; | 6282 | return recent_used_cpu; |
@@ -6282,7 +6341,7 @@ static inline unsigned long task_util(struct task_struct *p) | |||
6282 | } | 6341 | } |
6283 | 6342 | ||
6284 | /* | 6343 | /* |
6285 | * cpu_util_wake: Compute cpu utilization with any contributions from | 6344 | * cpu_util_wake: Compute CPU utilization with any contributions from |
6286 | * the waking task p removed. | 6345 | * the waking task p removed. |
6287 | */ | 6346 | */ |
6288 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) | 6347 | static unsigned long cpu_util_wake(int cpu, struct task_struct *p) |
@@ -6328,10 +6387,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) | |||
6328 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, | 6387 | * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, |
6329 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. | 6388 | * SD_BALANCE_FORK, or SD_BALANCE_EXEC. |
6330 | * | 6389 | * |
6331 | * Balances load by selecting the idlest cpu in the idlest group, or under | 6390 | * Balances load by selecting the idlest CPU in the idlest group, or under |
6332 | * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. | 6391 | * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set. |
6333 | * | 6392 | * |
6334 | * Returns the target cpu number. | 6393 | * Returns the target CPU number. |
6335 | * | 6394 | * |
6336 | * preempt must be disabled. | 6395 | * preempt must be disabled. |
6337 | */ | 6396 | */ |
@@ -6342,7 +6401,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6342 | int cpu = smp_processor_id(); | 6401 | int cpu = smp_processor_id(); |
6343 | int new_cpu = prev_cpu; | 6402 | int new_cpu = prev_cpu; |
6344 | int want_affine = 0; | 6403 | int want_affine = 0; |
6345 | int sync = wake_flags & WF_SYNC; | 6404 | int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING); |
6346 | 6405 | ||
6347 | if (sd_flag & SD_BALANCE_WAKE) { | 6406 | if (sd_flag & SD_BALANCE_WAKE) { |
6348 | record_wakee(p); | 6407 | record_wakee(p); |
@@ -6356,7 +6415,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6356 | break; | 6415 | break; |
6357 | 6416 | ||
6358 | /* | 6417 | /* |
6359 | * If both cpu and prev_cpu are part of this domain, | 6418 | * If both 'cpu' and 'prev_cpu' are part of this domain, |
6360 | * cpu is a valid SD_WAKE_AFFINE target. | 6419 | * cpu is a valid SD_WAKE_AFFINE target. |
6361 | */ | 6420 | */ |
6362 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && | 6421 | if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && |
@@ -6376,7 +6435,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f | |||
6376 | if (cpu == prev_cpu) | 6435 | if (cpu == prev_cpu) |
6377 | goto pick_cpu; | 6436 | goto pick_cpu; |
6378 | 6437 | ||
6379 | new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); | 6438 | new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); |
6380 | } | 6439 | } |
6381 | 6440 | ||
6382 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { | 6441 | if (sd && !(sd_flag & SD_BALANCE_FORK)) { |
@@ -6407,9 +6466,9 @@ pick_cpu: | |||
6407 | static void detach_entity_cfs_rq(struct sched_entity *se); | 6466 | static void detach_entity_cfs_rq(struct sched_entity *se); |
6408 | 6467 | ||
6409 | /* | 6468 | /* |
6410 | * Called immediately before a task is migrated to a new cpu; task_cpu(p) and | 6469 | * Called immediately before a task is migrated to a new CPU; task_cpu(p) and |
6411 | * cfs_rq_of(p) references at time of call are still valid and identify the | 6470 | * cfs_rq_of(p) references at time of call are still valid and identify the |
6412 | * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. | 6471 | * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held. |
6413 | */ | 6472 | */ |
6414 | static void migrate_task_rq_fair(struct task_struct *p) | 6473 | static void migrate_task_rq_fair(struct task_struct *p) |
6415 | { | 6474 | { |
@@ -6843,17 +6902,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6843 | * BASICS | 6902 | * BASICS |
6844 | * | 6903 | * |
6845 | * The purpose of load-balancing is to achieve the same basic fairness the | 6904 | * The purpose of load-balancing is to achieve the same basic fairness the |
6846 | * per-cpu scheduler provides, namely provide a proportional amount of compute | 6905 | * per-CPU scheduler provides, namely provide a proportional amount of compute |
6847 | * time to each task. This is expressed in the following equation: | 6906 | * time to each task. This is expressed in the following equation: |
6848 | * | 6907 | * |
6849 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) | 6908 | * W_i,n/P_i == W_j,n/P_j for all i,j (1) |
6850 | * | 6909 | * |
6851 | * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight | 6910 | * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight |
6852 | * W_i,0 is defined as: | 6911 | * W_i,0 is defined as: |
6853 | * | 6912 | * |
6854 | * W_i,0 = \Sum_j w_i,j (2) | 6913 | * W_i,0 = \Sum_j w_i,j (2) |
6855 | * | 6914 | * |
6856 | * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight | 6915 | * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight |
6857 | * is derived from the nice value as per sched_prio_to_weight[]. | 6916 | * is derived from the nice value as per sched_prio_to_weight[]. |
6858 | * | 6917 | * |
6859 | * The weight average is an exponential decay average of the instantaneous | 6918 | * The weight average is an exponential decay average of the instantaneous |
@@ -6861,7 +6920,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6861 | * | 6920 | * |
6862 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) | 6921 | * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) |
6863 | * | 6922 | * |
6864 | * C_i is the compute capacity of cpu i, typically it is the | 6923 | * C_i is the compute capacity of CPU i, typically it is the |
6865 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it | 6924 | * fraction of 'recent' time available for SCHED_OTHER task execution. But it |
6866 | * can also include other factors [XXX]. | 6925 | * can also include other factors [XXX]. |
6867 | * | 6926 | * |
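As a quick worked instance of (1)-(3): a CPU running two nice-0 tasks (weight 1024 each per sched_prio_to_weight[]) has W_i,0 = 2048, and the average W_i,n then decays toward whatever the instantaneous weight does. A minimal sketch of the decay step in (3) using shift arithmetic; the helper name and the decay-period parameter n are illustrative, not taken from this file:

/* W'_i,n = (2^n - 1)/2^n * W_i,n  +  1/2^n * W_i,0 */
static unsigned long decay_weight_avg(unsigned long w_avg, unsigned long w_inst,
                                      unsigned int n)
{
        return (((1UL << n) - 1) * w_avg + w_inst) >> n;
}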
@@ -6882,11 +6941,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6882 | * SCHED DOMAINS | 6941 | * SCHED DOMAINS |
6883 | * | 6942 | * |
6884 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) | 6943 | * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) |
6885 | * for all i,j solution, we create a tree of cpus that follows the hardware | 6944 | * for all i,j solution, we create a tree of CPUs that follows the hardware |
6886 | * topology where each level pairs two lower groups (or better). This results | 6945 | * topology where each level pairs two lower groups (or better). This results |
6887 | * in O(log n) layers. Furthermore we reduce the number of cpus going up the | 6946 | * in O(log n) layers. Furthermore we reduce the number of CPUs going up the |
6888 | * tree to only the first of the previous level and we decrease the frequency | 6947 | * tree to only the first of the previous level and we decrease the frequency |
6889 | * of load-balance at each level inv. proportional to the number of cpus in | 6948 | * of load-balance at each level inv. proportional to the number of CPUs in |
6890 | * the groups. | 6949 | * the groups. |
6891 | * | 6950 | * |
6892 | * This yields: | 6951 | * This yields: |
@@ -6895,7 +6954,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6895 | * \Sum { --- * --- * 2^i } = O(n) (5) | 6954 | * \Sum { --- * --- * 2^i } = O(n) (5) |
6896 | * i = 0 2^i 2^i | 6955 | * i = 0 2^i 2^i |
6897 | * `- size of each group | 6956 | * `- size of each group |
6898 | * | | `- number of cpus doing load-balance | 6957 | * | | `- number of CPUs doing load-balance |
6899 | * | `- freq | 6958 | * | `- freq |
6900 | * `- sum over all levels | 6959 | * `- sum over all levels |
6901 | * | 6960 | * |
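Spelling out (5) in conventional notation, as a restatement of the ASCII sum above (the numerators are recalled from the full comment rather than visible in this hunk: frequency 1/2^i, roughly n/2^i CPUs doing the balancing, groups of about 2^i CPUs):

  \sum_{i=0}^{\log_2 n} \frac{1}{2^i} \cdot \frac{n}{2^i} \cdot 2^i
      \;=\; \sum_{i=0}^{\log_2 n} \frac{n}{2^i} \;\le\; 2n \;=\; O(n)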
@@ -6903,7 +6962,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6903 | * this makes (5) the runtime complexity of the balancer. | 6962 | * this makes (5) the runtime complexity of the balancer. |
6904 | * | 6963 | * |
6905 | * An important property here is that each CPU is still (indirectly) connected | 6964 | * An important property here is that each CPU is still (indirectly) connected |
6906 | * to every other cpu in at most O(log n) steps: | 6965 | * to every other CPU in at most O(log n) steps: |
6907 | * | 6966 | * |
6908 | * The adjacency matrix of the resulting graph is given by: | 6967 | * The adjacency matrix of the resulting graph is given by: |
6909 | * | 6968 | * |
@@ -6915,7 +6974,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6915 | * | 6974 | * |
6916 | * A^(log_2 n)_i,j != 0 for all i,j (7) | 6975 | * A^(log_2 n)_i,j != 0 for all i,j (7) |
6917 | * | 6976 | * |
6918 | * Showing there's indeed a path between every cpu in at most O(log n) steps. | 6977 | * Showing there's indeed a path between every CPU in at most O(log n) steps. |
6919 | * The task movement gives a factor of O(m), giving a convergence complexity | 6978 | * The task movement gives a factor of O(m), giving a convergence complexity |
6920 | * of: | 6979 | * of: |
6921 | * | 6980 | * |
@@ -6925,7 +6984,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6925 | * WORK CONSERVING | 6984 | * WORK CONSERVING |
6926 | * | 6985 | * |
6927 | * In order to avoid CPUs going idle while there's still work to do, new idle | 6986 | * In order to avoid CPUs going idle while there's still work to do, new idle |
6928 | * balancing is more aggressive and has the newly idle cpu iterate up the domain | 6987 | * balancing is more aggressive and has the newly idle CPU iterate up the domain |
6929 | * tree itself instead of relying on other CPUs to bring it work. | 6988 | * tree itself instead of relying on other CPUs to bring it work. |
6930 | * | 6989 | * |
6931 | * This adds some complexity to both (5) and (8) but it reduces the total idle | 6990 | * This adds some complexity to both (5) and (8) but it reduces the total idle |
@@ -6946,7 +7005,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp | |||
6946 | * | 7005 | * |
6947 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) | 7006 | * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) |
6948 | * | 7007 | * |
6949 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. | 7008 | * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i. |
6950 | * | 7009 | * |
6951 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) | 7010 | * The big problem is S_k, it's a global sum needed to compute a local (W_i) |
6952 | * property. | 7011 | * property. |
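In conventional notation (10) is s_{k,i} = \sum_j w_{i,j,k} and S_k = \sum_i s_{k,i}. How that global sum gets used — each CPU receiving a slice of the cgroup's shares proportional to its local contribution — is recalled here only as a hedged sketch, not quoted from this hunk:

  W_{i,k} \;\approx\; \mathrm{shares}_k \cdot \frac{s_{k,i}}{S_k}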
@@ -7110,7 +7169,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7110 | env->flags |= LBF_SOME_PINNED; | 7169 | env->flags |= LBF_SOME_PINNED; |
7111 | 7170 | ||
7112 | /* | 7171 | /* |
7113 | * Remember if this task can be migrated to any other cpu in | 7172 | * Remember if this task can be migrated to any other CPU in |
7114 | * our sched_group. We may want to revisit it if we couldn't | 7173 | * our sched_group. We may want to revisit it if we couldn't |
7115 | * meet load balance goals by pulling other tasks on src_cpu. | 7174 | * meet load balance goals by pulling other tasks on src_cpu. |
7116 | * | 7175 | * |
@@ -7120,7 +7179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) | |||
7120 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) | 7179 | if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) |
7121 | return 0; | 7180 | return 0; |
7122 | 7181 | ||
7123 | /* Prevent to re-select dst_cpu via env's cpus */ | 7182 | /* Prevent re-selecting dst_cpu via env's CPUs: */ |
7124 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { | 7183 | for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { |
7125 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { | 7184 | if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { |
7126 | env->flags |= LBF_DST_PINNED; | 7185 | env->flags |= LBF_DST_PINNED; |
@@ -7694,8 +7753,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7694 | * Group imbalance indicates (and tries to solve) the problem where balancing | 7753 | * Group imbalance indicates (and tries to solve) the problem where balancing |
7695 | * groups is inadequate due to ->cpus_allowed constraints. | 7754 | * groups is inadequate due to ->cpus_allowed constraints. |
7696 | * | 7755 | * |
7697 | * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a | 7756 | * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a |
7698 | * cpumask covering 1 cpu of the first group and 3 cpus of the second group. | 7757 | * cpumask covering 1 CPU of the first group and 3 CPUs of the second group. |
7699 | * Something like: | 7758 | * Something like: |
7700 | * | 7759 | * |
7701 | * { 0 1 2 3 } { 4 5 6 7 } | 7760 | * { 0 1 2 3 } { 4 5 6 7 } |
@@ -7703,7 +7762,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) | |||
7703 | * | 7762 | * |
7704 | * If we were to balance group-wise we'd place two tasks in the first group and | 7763 | * If we were to balance group-wise we'd place two tasks in the first group and |
7705 | * two tasks in the second group. Clearly this is undesired as it will overload | 7764 | * two tasks in the second group. Clearly this is undesired as it will overload |
7706 | * cpu 3 and leave one of the cpus in the second group unused. | 7765 | * cpu 3 and leave one of the CPUs in the second group unused. |
7707 | * | 7766 | * |
7708 | * The current solution to this issue is detecting the skew in the first group | 7767 | * The current solution to this issue is detecting the skew in the first group |
7709 | * by noticing the lower domain failed to reach balance and had difficulty | 7768 | * by noticing the lower domain failed to reach balance and had difficulty |
@@ -7816,7 +7875,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, | |||
7816 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { | 7875 | for_each_cpu_and(i, sched_group_span(group), env->cpus) { |
7817 | struct rq *rq = cpu_rq(i); | 7876 | struct rq *rq = cpu_rq(i); |
7818 | 7877 | ||
7819 | /* Bias balancing toward cpus of our domain */ | 7878 | /* Bias balancing toward CPUs of our domain: */ |
7820 | if (local_group) | 7879 | if (local_group) |
7821 | load = target_load(i, load_idx); | 7880 | load = target_load(i, load_idx); |
7822 | else | 7881 | else |
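A minimal sketch of the bias applied in the hunk above, deliberately written without the real helpers so nothing is misquoted: CPUs of our own domain are judged by the higher of their historic and instantaneous load, remote CPUs by the lower, which keeps the balancer from pulling work it does not really need.

static unsigned long biased_load(unsigned long hist, unsigned long inst,
                                 bool local_group)
{
        if (local_group)
                return hist > inst ? hist : inst;   /* pessimistic about ourselves */
        return hist < inst ? hist : inst;           /* optimistic about others */
}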
@@ -7902,7 +7961,7 @@ asym_packing: | |||
7902 | if (!(env->sd->flags & SD_ASYM_PACKING)) | 7961 | if (!(env->sd->flags & SD_ASYM_PACKING)) |
7903 | return true; | 7962 | return true; |
7904 | 7963 | ||
7905 | /* No ASYM_PACKING if target cpu is already busy */ | 7964 | /* No ASYM_PACKING if target CPU is already busy */ |
7906 | if (env->idle == CPU_NOT_IDLE) | 7965 | if (env->idle == CPU_NOT_IDLE) |
7907 | return true; | 7966 | return true; |
7908 | /* | 7967 | /* |
@@ -7915,7 +7974,7 @@ asym_packing: | |||
7915 | if (!sds->busiest) | 7974 | if (!sds->busiest) |
7916 | return true; | 7975 | return true; |
7917 | 7976 | ||
7918 | /* Prefer to move from lowest priority cpu's work */ | 7977 | /* Prefer to move work away from the lowest priority CPU */ |
7919 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, | 7978 | if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, |
7920 | sg->asym_prefer_cpu)) | 7979 | sg->asym_prefer_cpu)) |
7921 | return true; | 7980 | return true; |
@@ -8168,7 +8227,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8168 | if (busiest->group_type == group_imbalanced) { | 8227 | if (busiest->group_type == group_imbalanced) { |
8169 | /* | 8228 | /* |
8170 | * In the group_imb case we cannot rely on group-wide averages | 8229 | * In the group_imb case we cannot rely on group-wide averages |
8171 | * to ensure cpu-load equilibrium, look at wider averages. XXX | 8230 | * to ensure CPU-load equilibrium, look at wider averages. XXX |
8172 | */ | 8231 | */ |
8173 | busiest->load_per_task = | 8232 | busiest->load_per_task = |
8174 | min(busiest->load_per_task, sds->avg_load); | 8233 | min(busiest->load_per_task, sds->avg_load); |
@@ -8187,7 +8246,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8187 | } | 8246 | } |
8188 | 8247 | ||
8189 | /* | 8248 | /* |
8190 | * If there aren't any idle cpus, avoid creating some. | 8249 | * If there aren't any idle CPUs, avoid creating some. |
8191 | */ | 8250 | */ |
8192 | if (busiest->group_type == group_overloaded && | 8251 | if (busiest->group_type == group_overloaded && |
8193 | local->group_type == group_overloaded) { | 8252 | local->group_type == group_overloaded) { |
@@ -8201,9 +8260,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s | |||
8201 | } | 8260 | } |
8202 | 8261 | ||
8203 | /* | 8262 | /* |
8204 | * We're trying to get all the cpus to the average_load, so we don't | 8263 | * We're trying to get all the CPUs to the average_load, so we don't |
8205 | * want to push ourselves above the average load, nor do we wish to | 8264 | * want to push ourselves above the average load, nor do we wish to |
8206 | * reduce the max loaded cpu below the average load. At the same time, | 8265 | * reduce the max loaded CPU below the average load. At the same time, |
8207 | * we also don't want to reduce the group load below the group | 8266 | * we also don't want to reduce the group load below the group |
8208 | * capacity. Thus we look for the minimum possible imbalance. | 8267 | * capacity. Thus we look for the minimum possible imbalance. |
8209 | */ | 8268 | */ |
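Restating the intent of the comment above as a bound (a paraphrase that ignores capacity scaling and the special cases handled nearby): the amount to move is capped both by how far the busiest group sits above the average and by how far the local group sits below it,

  \text{imbalance} \;\lesssim\; \min\bigl( \overline{L}_{\text{busiest}} - \overline{L},\; \overline{L} - \overline{L}_{\text{local}} \bigr)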
@@ -8297,9 +8356,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env) | |||
8297 | 8356 | ||
8298 | if (env->idle == CPU_IDLE) { | 8357 | if (env->idle == CPU_IDLE) { |
8299 | /* | 8358 | /* |
8300 | * This cpu is idle. If the busiest group is not overloaded | 8359 | * This CPU is idle. If the busiest group is not overloaded |
8301 | * and there is no imbalance between this and busiest group | 8360 | * and there is no imbalance between this and busiest group |
8302 | * wrt idle cpus, it is balanced. The imbalance becomes | 8361 | * wrt idle CPUs, it is balanced. The imbalance becomes |
8303 | * significant if the diff is greater than 1 otherwise we | 8362 | * significant if the diff is greater than 1 otherwise we |
8304 | * might end up to just move the imbalance on another group | 8363 | * might end up to just move the imbalance on another group |
8305 | */ | 8364 | */ |
@@ -8327,7 +8386,7 @@ out_balanced: | |||
8327 | } | 8386 | } |
8328 | 8387 | ||
8329 | /* | 8388 | /* |
8330 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 8389 | * find_busiest_queue - find the busiest runqueue among the CPUs in the group. |
8331 | */ | 8390 | */ |
8332 | static struct rq *find_busiest_queue(struct lb_env *env, | 8391 | static struct rq *find_busiest_queue(struct lb_env *env, |
8333 | struct sched_group *group) | 8392 | struct sched_group *group) |
@@ -8371,7 +8430,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8371 | 8430 | ||
8372 | /* | 8431 | /* |
8373 | * When comparing with imbalance, use weighted_cpuload() | 8432 | * When comparing with imbalance, use weighted_cpuload() |
8374 | * which is not scaled with the cpu capacity. | 8433 | * which is not scaled with the CPU capacity. |
8375 | */ | 8434 | */ |
8376 | 8435 | ||
8377 | if (rq->nr_running == 1 && wl > env->imbalance && | 8436 | if (rq->nr_running == 1 && wl > env->imbalance && |
@@ -8379,9 +8438,9 @@ static struct rq *find_busiest_queue(struct lb_env *env, | |||
8379 | continue; | 8438 | continue; |
8380 | 8439 | ||
8381 | /* | 8440 | /* |
8382 | * For the load comparisons with the other cpu's, consider | 8441 | * For the load comparisons with the other CPUs, consider |
8383 | * the weighted_cpuload() scaled with the cpu capacity, so | 8442 | * the weighted_cpuload() scaled with the CPU capacity, so |
8384 | * that the load can be moved away from the cpu that is | 8443 | * that the load can be moved away from the CPU that is |
8385 | * potentially running at a lower capacity. | 8444 | * potentially running at a lower capacity. |
8386 | * | 8445 | * |
8387 | * Thus we're looking for max(wl_i / capacity_i), crosswise | 8446 | * Thus we're looking for max(wl_i / capacity_i), crosswise |
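The 'crosswise' comparison the truncated sentence above refers to can be sketched as follows: comparing wl_i/capacity_i against wl_j/capacity_j by cross-multiplying avoids the division entirely (the helper below is illustrative, not this function's code):

/* true if CPU i is more loaded relative to its capacity than CPU j */
static bool heavier_relative_load(unsigned long wl_i, unsigned long cap_i,
                                  unsigned long wl_j, unsigned long cap_j)
{
        return wl_i * cap_j > wl_j * cap_i;
}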
@@ -8452,13 +8511,13 @@ static int should_we_balance(struct lb_env *env) | |||
8452 | return 0; | 8511 | return 0; |
8453 | 8512 | ||
8454 | /* | 8513 | /* |
8455 | * In the newly idle case, we will allow all the cpu's | 8514 | * In the newly idle case, we will allow all the CPUs |
8456 | * to do the newly idle load balance. | 8515 | * to do the newly idle load balance. |
8457 | */ | 8516 | */ |
8458 | if (env->idle == CPU_NEWLY_IDLE) | 8517 | if (env->idle == CPU_NEWLY_IDLE) |
8459 | return 1; | 8518 | return 1; |
8460 | 8519 | ||
8461 | /* Try to find first idle cpu */ | 8520 | /* Try to find first idle CPU */ |
8462 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { | 8521 | for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { |
8463 | if (!idle_cpu(cpu)) | 8522 | if (!idle_cpu(cpu)) |
8464 | continue; | 8523 | continue; |
@@ -8471,7 +8530,7 @@ static int should_we_balance(struct lb_env *env) | |||
8471 | balance_cpu = group_balance_cpu(sg); | 8530 | balance_cpu = group_balance_cpu(sg); |
8472 | 8531 | ||
8473 | /* | 8532 | /* |
8474 | * First idle cpu or the first cpu(busiest) in this sched group | 8533 | * First idle CPU or the first CPU(busiest) in this sched group |
8475 | * is eligible for doing load balancing at this and above domains. | 8534 | * is eligible for doing load balancing at this and above domains. |
8476 | */ | 8535 | */ |
8477 | return balance_cpu == env->dst_cpu; | 8536 | return balance_cpu == env->dst_cpu; |
@@ -8580,7 +8639,7 @@ more_balance: | |||
8580 | * Revisit (affine) tasks on src_cpu that couldn't be moved to | 8639 | * Revisit (affine) tasks on src_cpu that couldn't be moved to |
8581 | * us and move them to an alternate dst_cpu in our sched_group | 8640 | * us and move them to an alternate dst_cpu in our sched_group |
8582 | * where they can run. The upper limit on how many times we | 8641 | * where they can run. The upper limit on how many times we |
8583 | * iterate on same src_cpu is dependent on number of cpus in our | 8642 | * iterate on same src_cpu is dependent on number of CPUs in our |
8584 | * sched_group. | 8643 | * sched_group. |
8585 | * | 8644 | * |
8586 | * This changes load balance semantics a bit on who can move | 8645 | * This changes load balance semantics a bit on who can move |
@@ -8597,7 +8656,7 @@ more_balance: | |||
8597 | */ | 8656 | */ |
8598 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { | 8657 | if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { |
8599 | 8658 | ||
8600 | /* Prevent to re-select dst_cpu via env's cpus */ | 8659 | /* Prevent re-selecting dst_cpu via env's CPUs */ |
8601 | cpumask_clear_cpu(env.dst_cpu, env.cpus); | 8660 | cpumask_clear_cpu(env.dst_cpu, env.cpus); |
8602 | 8661 | ||
8603 | env.dst_rq = cpu_rq(env.new_dst_cpu); | 8662 | env.dst_rq = cpu_rq(env.new_dst_cpu); |
@@ -8659,9 +8718,10 @@ more_balance: | |||
8659 | 8718 | ||
8660 | raw_spin_lock_irqsave(&busiest->lock, flags); | 8719 | raw_spin_lock_irqsave(&busiest->lock, flags); |
8661 | 8720 | ||
8662 | /* don't kick the active_load_balance_cpu_stop, | 8721 | /* |
8663 | * if the curr task on busiest cpu can't be | 8722 | * Don't kick the active_load_balance_cpu_stop, |
8664 | * moved to this_cpu | 8723 | * if the curr task on busiest CPU can't be |
8724 | * moved to this_cpu: | ||
8665 | */ | 8725 | */ |
8666 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { | 8726 | if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { |
8667 | raw_spin_unlock_irqrestore(&busiest->lock, | 8727 | raw_spin_unlock_irqrestore(&busiest->lock, |
@@ -8887,7 +8947,7 @@ out: | |||
8887 | } | 8947 | } |
8888 | 8948 | ||
8889 | /* | 8949 | /* |
8890 | * active_load_balance_cpu_stop is run by cpu stopper. It pushes | 8950 | * active_load_balance_cpu_stop is run by the CPU stopper. It pushes |
8891 | * running tasks off the busiest CPU onto idle CPUs. It requires at | 8951 | * running tasks off the busiest CPU onto idle CPUs. It requires at |
8892 | * least 1 task to be running on each physical CPU where possible, and | 8952 | * least 1 task to be running on each physical CPU where possible, and |
8893 | * avoids physical / logical imbalances. | 8953 | * avoids physical / logical imbalances. |
@@ -8911,7 +8971,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8911 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) | 8971 | if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) |
8912 | goto out_unlock; | 8972 | goto out_unlock; |
8913 | 8973 | ||
8914 | /* make sure the requested cpu hasn't gone down in the meantime */ | 8974 | /* Make sure the requested CPU hasn't gone down in the meantime: */ |
8915 | if (unlikely(busiest_cpu != smp_processor_id() || | 8975 | if (unlikely(busiest_cpu != smp_processor_id() || |
8916 | !busiest_rq->active_balance)) | 8976 | !busiest_rq->active_balance)) |
8917 | goto out_unlock; | 8977 | goto out_unlock; |
@@ -8923,7 +8983,7 @@ static int active_load_balance_cpu_stop(void *data) | |||
8923 | /* | 8983 | /* |
8924 | * This condition is "impossible", if it occurs | 8984 | * This condition is "impossible", if it occurs |
8925 | * we need to fix it. Originally reported by | 8985 | * we need to fix it. Originally reported by |
8926 | * Bjorn Helgaas on a 128-cpu setup. | 8986 | * Bjorn Helgaas on a 128-CPU setup. |
8927 | */ | 8987 | */ |
8928 | BUG_ON(busiest_rq == target_rq); | 8988 | BUG_ON(busiest_rq == target_rq); |
8929 | 8989 | ||
@@ -9025,7 +9085,7 @@ static void nohz_balancer_kick(void) | |||
9025 | return; | 9085 | return; |
9026 | /* | 9086 | /* |
9027 | * Use smp_send_reschedule() instead of resched_cpu(). | 9087 | * Use smp_send_reschedule() instead of resched_cpu(). |
9028 | * This way we generate a sched IPI on the target cpu which | 9088 | * This way we generate a sched IPI on the target CPU which |
9029 | * is idle. And the softirq performing nohz idle load balance | 9089 | * is idle. And the softirq performing nohz idle load balance |
9030 | * will be run before returning from the IPI. | 9090 | * will be run before returning from the IPI. |
9031 | */ | 9091 | */ |
@@ -9082,14 +9142,12 @@ unlock: | |||
9082 | } | 9142 | } |
9083 | 9143 | ||
9084 | /* | 9144 | /* |
9085 | * This routine will record that the cpu is going idle with tick stopped. | 9145 | * This routine will record that the CPU is going idle with tick stopped. |
9086 | * This info will be used in performing idle load balancing in the future. | 9146 | * This info will be used in performing idle load balancing in the future. |
9087 | */ | 9147 | */ |
9088 | void nohz_balance_enter_idle(int cpu) | 9148 | void nohz_balance_enter_idle(int cpu) |
9089 | { | 9149 | { |
9090 | /* | 9150 | /* If this CPU is going down, then nothing needs to be done: */ |
9091 | * If this cpu is going down, then nothing needs to be done. | ||
9092 | */ | ||
9093 | if (!cpu_active(cpu)) | 9151 | if (!cpu_active(cpu)) |
9094 | return; | 9152 | return; |
9095 | 9153 | ||
@@ -9100,9 +9158,7 @@ void nohz_balance_enter_idle(int cpu) | |||
9100 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) | 9158 | if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) |
9101 | return; | 9159 | return; |
9102 | 9160 | ||
9103 | /* | 9161 | /* If we're a completely isolated CPU, we don't play: */ |
9104 | * If we're a completely isolated CPU, we don't play. | ||
9105 | */ | ||
9106 | if (on_null_domain(cpu_rq(cpu))) | 9162 | if (on_null_domain(cpu_rq(cpu))) |
9107 | return; | 9163 | return; |
9108 | 9164 | ||
@@ -9211,7 +9267,7 @@ out: | |||
9211 | 9267 | ||
9212 | /* | 9268 | /* |
9213 | * next_balance will be updated only when there is a need. | 9269 | * next_balance will be updated only when there is a need. |
9214 | * When the cpu is attached to null domain for ex, it will not be | 9270 | * When the CPU is attached to null domain for ex, it will not be |
9215 | * updated. | 9271 | * updated. |
9216 | */ | 9272 | */ |
9217 | if (likely(update_next_balance)) { | 9273 | if (likely(update_next_balance)) { |
@@ -9235,7 +9291,7 @@ out: | |||
9235 | #ifdef CONFIG_NO_HZ_COMMON | 9291 | #ifdef CONFIG_NO_HZ_COMMON |
9236 | /* | 9292 | /* |
9237 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the | 9293 | * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the |
9238 | * rebalancing for all the cpus for whom scheduler ticks are stopped. | 9294 | * rebalancing for all the CPUs for whom scheduler ticks are stopped. |
9239 | */ | 9295 | */ |
9240 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | 9296 | static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) |
9241 | { | 9297 | { |
@@ -9255,8 +9311,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) | |||
9255 | continue; | 9311 | continue; |
9256 | 9312 | ||
9257 | /* | 9313 | /* |
9258 | * If this cpu gets work to do, stop the load balancing | 9314 | * If this CPU gets work to do, stop the load balancing |
9259 | * work being done for other cpus. Next load | 9315 | * work being done for other CPUs. Next load |
9260 | * balancing owner will pick it up. | 9316 | * balancing owner will pick it up. |
9261 | */ | 9317 | */ |
9262 | if (need_resched()) | 9318 | if (need_resched()) |
@@ -9298,13 +9354,13 @@ end: | |||
9298 | 9354 | ||
9299 | /* | 9355 | /* |
9300 | * Current heuristic for kicking the idle load balancer in the presence | 9356 | * Current heuristic for kicking the idle load balancer in the presence |
9301 | * of an idle cpu in the system. | 9357 | * of an idle CPU in the system. |
9302 | * - This rq has more than one task. | 9358 | * - This rq has more than one task. |
9303 | * - This rq has at least one CFS task and the capacity of the CPU is | 9359 | * - This rq has at least one CFS task and the capacity of the CPU is |
9304 | * significantly reduced because of RT tasks or IRQs. | 9360 | * significantly reduced because of RT tasks or IRQs. |
9305 | * - At parent of LLC scheduler domain level, this cpu's scheduler group has | 9361 | * - At parent of LLC scheduler domain level, this CPU's scheduler group has |
9306 | * multiple busy cpu. | 9362 | * multiple busy CPUs. |
9307 | * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler | 9363 | * - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler |
9308 | * domain span are idle. | 9364 | * domain span are idle. |
9309 | */ | 9365 | */ |
9310 | static inline bool nohz_kick_needed(struct rq *rq) | 9366 | static inline bool nohz_kick_needed(struct rq *rq) |
@@ -9394,10 +9450,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) | |||
9394 | CPU_IDLE : CPU_NOT_IDLE; | 9450 | CPU_IDLE : CPU_NOT_IDLE; |
9395 | 9451 | ||
9396 | /* | 9452 | /* |
9397 | * If this cpu has a pending nohz_balance_kick, then do the | 9453 | * If this CPU has a pending nohz_balance_kick, then do the |
9398 | * balancing on behalf of the other idle cpus whose ticks are | 9454 | * balancing on behalf of the other idle CPUs whose ticks are |
9399 | * stopped. Do nohz_idle_balance *before* rebalance_domains to | 9455 | * stopped. Do nohz_idle_balance *before* rebalance_domains to |
9400 | * give the idle cpus a chance to load balance. Else we may | 9456 | * give the idle CPUs a chance to load balance. Else we may |
9401 | * load balance only within the local sched_domain hierarchy | 9457 | * load balance only within the local sched_domain hierarchy |
9402 | * and abort nohz_idle_balance altogether if we pull some load. | 9458 | * and abort nohz_idle_balance altogether if we pull some load. |
9403 | */ | 9459 | */ |
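A hedged sketch of the ordering the comment above insists on; the wrapper is illustrative rather than the actual softirq handler, though the two callees are the ones named in the comment:

static void rebalance_softirq_sketch(struct rq *this_rq, enum cpu_idle_type idle)
{
        /* first balance on behalf of the idle, tick-stopped CPUs ... */
        nohz_idle_balance(this_rq, idle);
        /* ... then walk our own sched_domain hierarchy */
        rebalance_domains(this_rq, idle);
}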
@@ -9440,7 +9496,12 @@ static void rq_offline_fair(struct rq *rq) | |||
9440 | #endif /* CONFIG_SMP */ | 9496 | #endif /* CONFIG_SMP */ |
9441 | 9497 | ||
9442 | /* | 9498 | /* |
9443 | * scheduler tick hitting a task of our scheduling class: | 9499 | * scheduler tick hitting a task of our scheduling class. |
9500 | * | ||
9501 | * NOTE: This function can be called remotely by the tick offload that | ||
9502 | * goes along full dynticks. Therefore no local assumption can be made | ||
9503 | * and everything must be accessed through the @rq and @curr passed in | ||
9504 | * parameters. | ||
9444 | */ | 9505 | */ |
9445 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) | 9506 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
9446 | { | 9507 | { |
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 7dae9eb8c042..2975f195e1c4 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c | |||
@@ -1,23 +1,14 @@ | |||
1 | /* | 1 | /* |
2 | * Generic entry point for the idle threads | 2 | * Generic entry points for the idle threads and |
3 | * implementation of the idle task scheduling class. | ||
4 | * | ||
5 | * (NOTE: these are not related to SCHED_IDLE batch scheduled | ||
6 | * tasks which are handled in sched/fair.c ) | ||
3 | */ | 7 | */ |
4 | #include <linux/sched.h> | 8 | #include "sched.h" |
5 | #include <linux/sched/idle.h> | ||
6 | #include <linux/cpu.h> | ||
7 | #include <linux/cpuidle.h> | ||
8 | #include <linux/cpuhotplug.h> | ||
9 | #include <linux/tick.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/stackprotector.h> | ||
12 | #include <linux/suspend.h> | ||
13 | #include <linux/livepatch.h> | ||
14 | |||
15 | #include <asm/tlb.h> | ||
16 | 9 | ||
17 | #include <trace/events/power.h> | 10 | #include <trace/events/power.h> |
18 | 11 | ||
19 | #include "sched.h" | ||
20 | |||
21 | /* Linker adds these: start and end of __cpuidle functions */ | 12 | /* Linker adds these: start and end of __cpuidle functions */ |
22 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; | 13 | extern char __cpuidle_text_start[], __cpuidle_text_end[]; |
23 | 14 | ||
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable) | |||
46 | static int __init cpu_idle_poll_setup(char *__unused) | 37 | static int __init cpu_idle_poll_setup(char *__unused) |
47 | { | 38 | { |
48 | cpu_idle_force_poll = 1; | 39 | cpu_idle_force_poll = 1; |
40 | |||
49 | return 1; | 41 | return 1; |
50 | } | 42 | } |
51 | __setup("nohlt", cpu_idle_poll_setup); | 43 | __setup("nohlt", cpu_idle_poll_setup); |
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup); | |||
53 | static int __init cpu_idle_nopoll_setup(char *__unused) | 45 | static int __init cpu_idle_nopoll_setup(char *__unused) |
54 | { | 46 | { |
55 | cpu_idle_force_poll = 0; | 47 | cpu_idle_force_poll = 0; |
48 | |||
56 | return 1; | 49 | return 1; |
57 | } | 50 | } |
58 | __setup("hlt", cpu_idle_nopoll_setup); | 51 | __setup("hlt", cpu_idle_nopoll_setup); |
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void) | |||
64 | trace_cpu_idle_rcuidle(0, smp_processor_id()); | 57 | trace_cpu_idle_rcuidle(0, smp_processor_id()); |
65 | local_irq_enable(); | 58 | local_irq_enable(); |
66 | stop_critical_timings(); | 59 | stop_critical_timings(); |
60 | |||
67 | while (!tif_need_resched() && | 61 | while (!tif_need_resched() && |
68 | (cpu_idle_force_poll || tick_check_broadcast_expired())) | 62 | (cpu_idle_force_poll || tick_check_broadcast_expired())) |
69 | cpu_relax(); | 63 | cpu_relax(); |
70 | start_critical_timings(); | 64 | start_critical_timings(); |
71 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); | 65 | trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); |
72 | rcu_idle_exit(); | 66 | rcu_idle_exit(); |
67 | |||
73 | return 1; | 68 | return 1; |
74 | } | 69 | } |
75 | 70 | ||
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
332 | { | 327 | { |
333 | /* | 328 | /* |
334 | * This #ifdef needs to die, but it's too late in the cycle to | 329 | * This #ifdef needs to die, but it's too late in the cycle to |
335 | * make this generic (arm and sh have never invoked the canary | 330 | * make this generic (ARM and SH have never invoked the canary |
336 | * init for the non boot cpus!). Will be fixed in 3.11 | 331 | * init for the non boot CPUs!). Will be fixed in 3.11 |
337 | */ | 332 | */ |
338 | #ifdef CONFIG_X86 | 333 | #ifdef CONFIG_X86 |
339 | /* | 334 | /* |
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state) | |||
350 | while (1) | 345 | while (1) |
351 | do_idle(); | 346 | do_idle(); |
352 | } | 347 | } |
348 | |||
349 | /* | ||
350 | * idle-task scheduling class. | ||
351 | */ | ||
352 | |||
353 | #ifdef CONFIG_SMP | ||
354 | static int | ||
355 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
356 | { | ||
357 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
358 | } | ||
359 | #endif | ||
360 | |||
361 | /* | ||
362 | * Idle tasks are unconditionally rescheduled: | ||
363 | */ | ||
364 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
365 | { | ||
366 | resched_curr(rq); | ||
367 | } | ||
368 | |||
369 | static struct task_struct * | ||
370 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
371 | { | ||
372 | put_prev_task(rq, prev); | ||
373 | update_idle_core(rq); | ||
374 | schedstat_inc(rq->sched_goidle); | ||
375 | |||
376 | return rq->idle; | ||
377 | } | ||
378 | |||
379 | /* | ||
380 | * It is not legal to sleep in the idle task - print a warning | ||
381 | * message if some code attempts to do it: | ||
382 | */ | ||
383 | static void | ||
384 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
385 | { | ||
386 | raw_spin_unlock_irq(&rq->lock); | ||
387 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
388 | dump_stack(); | ||
389 | raw_spin_lock_irq(&rq->lock); | ||
390 | } | ||
391 | |||
392 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
393 | { | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * scheduler tick hitting a task of our scheduling class. | ||
398 | * | ||
399 | * NOTE: This function can be called remotely by the tick offload that | ||
400 | * goes along full dynticks. Therefore no local assumption can be made | ||
401 | * and everything must be accessed through the @rq and @curr passed in | ||
402 | * parameters. | ||
403 | */ | ||
404 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
405 | { | ||
406 | } | ||
407 | |||
408 | static void set_curr_task_idle(struct rq *rq) | ||
409 | { | ||
410 | } | ||
411 | |||
412 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
413 | { | ||
414 | BUG(); | ||
415 | } | ||
416 | |||
417 | static void | ||
418 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
419 | { | ||
420 | BUG(); | ||
421 | } | ||
422 | |||
423 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
424 | { | ||
425 | return 0; | ||
426 | } | ||
427 | |||
428 | static void update_curr_idle(struct rq *rq) | ||
429 | { | ||
430 | } | ||
431 | |||
432 | /* | ||
433 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
434 | */ | ||
435 | const struct sched_class idle_sched_class = { | ||
436 | /* .next is NULL */ | ||
437 | /* no enqueue/yield_task for idle tasks */ | ||
438 | |||
439 | /* dequeue is not valid, we print a debug message there: */ | ||
440 | .dequeue_task = dequeue_task_idle, | ||
441 | |||
442 | .check_preempt_curr = check_preempt_curr_idle, | ||
443 | |||
444 | .pick_next_task = pick_next_task_idle, | ||
445 | .put_prev_task = put_prev_task_idle, | ||
446 | |||
447 | #ifdef CONFIG_SMP | ||
448 | .select_task_rq = select_task_rq_idle, | ||
449 | .set_cpus_allowed = set_cpus_allowed_common, | ||
450 | #endif | ||
451 | |||
452 | .set_curr_task = set_curr_task_idle, | ||
453 | .task_tick = task_tick_idle, | ||
454 | |||
455 | .get_rr_interval = get_rr_interval_idle, | ||
456 | |||
457 | .prio_changed = prio_changed_idle, | ||
458 | .switched_to = switched_to_idle, | ||
459 | .update_curr = update_curr_idle, | ||
460 | }; | ||
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c deleted file mode 100644 index d518664cce4f..000000000000 --- a/kernel/sched/idle_task.c +++ /dev/null | |||
@@ -1,110 +0,0 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #include "sched.h" | ||
3 | |||
4 | /* | ||
5 | * idle-task scheduling class. | ||
6 | * | ||
7 | * (NOTE: these are not related to SCHED_IDLE tasks which are | ||
8 | * handled in sched/fair.c) | ||
9 | */ | ||
10 | |||
11 | #ifdef CONFIG_SMP | ||
12 | static int | ||
13 | select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags) | ||
14 | { | ||
15 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
16 | } | ||
17 | #endif /* CONFIG_SMP */ | ||
18 | |||
19 | /* | ||
20 | * Idle tasks are unconditionally rescheduled: | ||
21 | */ | ||
22 | static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) | ||
23 | { | ||
24 | resched_curr(rq); | ||
25 | } | ||
26 | |||
27 | static struct task_struct * | ||
28 | pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | ||
29 | { | ||
30 | put_prev_task(rq, prev); | ||
31 | update_idle_core(rq); | ||
32 | schedstat_inc(rq->sched_goidle); | ||
33 | return rq->idle; | ||
34 | } | ||
35 | |||
36 | /* | ||
37 | * It is not legal to sleep in the idle task - print a warning | ||
38 | * message if some code attempts to do it: | ||
39 | */ | ||
40 | static void | ||
41 | dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) | ||
42 | { | ||
43 | raw_spin_unlock_irq(&rq->lock); | ||
44 | printk(KERN_ERR "bad: scheduling from the idle thread!\n"); | ||
45 | dump_stack(); | ||
46 | raw_spin_lock_irq(&rq->lock); | ||
47 | } | ||
48 | |||
49 | static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) | ||
50 | { | ||
51 | rq_last_tick_reset(rq); | ||
52 | } | ||
53 | |||
54 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) | ||
55 | { | ||
56 | } | ||
57 | |||
58 | static void set_curr_task_idle(struct rq *rq) | ||
59 | { | ||
60 | } | ||
61 | |||
62 | static void switched_to_idle(struct rq *rq, struct task_struct *p) | ||
63 | { | ||
64 | BUG(); | ||
65 | } | ||
66 | |||
67 | static void | ||
68 | prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio) | ||
69 | { | ||
70 | BUG(); | ||
71 | } | ||
72 | |||
73 | static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task) | ||
74 | { | ||
75 | return 0; | ||
76 | } | ||
77 | |||
78 | static void update_curr_idle(struct rq *rq) | ||
79 | { | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Simple, special scheduling class for the per-CPU idle tasks: | ||
84 | */ | ||
85 | const struct sched_class idle_sched_class = { | ||
86 | /* .next is NULL */ | ||
87 | /* no enqueue/yield_task for idle tasks */ | ||
88 | |||
89 | /* dequeue is not valid, we print a debug message there: */ | ||
90 | .dequeue_task = dequeue_task_idle, | ||
91 | |||
92 | .check_preempt_curr = check_preempt_curr_idle, | ||
93 | |||
94 | .pick_next_task = pick_next_task_idle, | ||
95 | .put_prev_task = put_prev_task_idle, | ||
96 | |||
97 | #ifdef CONFIG_SMP | ||
98 | .select_task_rq = select_task_rq_idle, | ||
99 | .set_cpus_allowed = set_cpus_allowed_common, | ||
100 | #endif | ||
101 | |||
102 | .set_curr_task = set_curr_task_idle, | ||
103 | .task_tick = task_tick_idle, | ||
104 | |||
105 | .get_rr_interval = get_rr_interval_idle, | ||
106 | |||
107 | .prio_changed = prio_changed_idle, | ||
108 | .switched_to = switched_to_idle, | ||
109 | .update_curr = update_curr_idle, | ||
110 | }; | ||
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c index b71b436f59f2..e6802181900f 100644 --- a/kernel/sched/isolation.c +++ b/kernel/sched/isolation.c | |||
@@ -3,15 +3,10 @@ | |||
3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. | 3 | * any CPU: unbound workqueues, timers, kthreads and any offloadable work. |
4 | * | 4 | * |
5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker | 5 | * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker |
6 | * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker | ||
6 | * | 7 | * |
7 | */ | 8 | */ |
8 | 9 | #include "sched.h" | |
9 | #include <linux/sched/isolation.h> | ||
10 | #include <linux/tick.h> | ||
11 | #include <linux/init.h> | ||
12 | #include <linux/kernel.h> | ||
13 | #include <linux/static_key.h> | ||
14 | #include <linux/ctype.h> | ||
15 | 10 | ||
16 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); | 11 | DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); |
17 | EXPORT_SYMBOL_GPL(housekeeping_overriden); | 12 | EXPORT_SYMBOL_GPL(housekeeping_overriden); |
@@ -60,6 +55,9 @@ void __init housekeeping_init(void) | |||
60 | 55 | ||
61 | static_branch_enable(&housekeeping_overriden); | 56 | static_branch_enable(&housekeeping_overriden); |
62 | 57 | ||
58 | if (housekeeping_flags & HK_FLAG_TICK) | ||
59 | sched_tick_offload_init(); | ||
60 | |||
63 | /* We need at least one CPU to handle housekeeping work */ | 61 | /* We need at least one CPU to handle housekeeping work */ |
64 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); | 62 | WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); |
65 | } | 63 | } |
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str) | |||
119 | { | 117 | { |
120 | unsigned int flags; | 118 | unsigned int flags; |
121 | 119 | ||
122 | flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; | 120 | flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; |
123 | 121 | ||
124 | return housekeeping_setup(str, flags); | 122 | return housekeeping_setup(str, flags); |
125 | } | 123 | } |
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c index 89a989e4d758..a171c1258109 100644 --- a/kernel/sched/loadavg.c +++ b/kernel/sched/loadavg.c | |||
@@ -6,10 +6,6 @@ | |||
6 | * figure. It's a silly number but people think it's important. We go through | 6 | * figure. It's a silly number but people think it's important. We go through |
7 | * great pains to make it work on big machines and tickless kernels. | 7 | * great pains to make it work on big machines and tickless kernels. |
8 | */ | 8 | */ |
9 | |||
10 | #include <linux/export.h> | ||
11 | #include <linux/sched/loadavg.h> | ||
12 | |||
13 | #include "sched.h" | 9 | #include "sched.h" |
14 | 10 | ||
15 | /* | 11 | /* |
@@ -32,29 +28,29 @@ | |||
32 | * Due to a number of reasons the above turns into the mess below: | 28 | * Due to a number of reasons the above turns into the mess below: |
33 | * | 29 | * |
34 | * - for_each_possible_cpu() is prohibitively expensive on machines with | 30 | * - for_each_possible_cpu() is prohibitively expensive on machines with |
35 | * serious number of cpus, therefore we need to take a distributed approach | 31 | * serious number of CPUs, therefore we need to take a distributed approach |
36 | * to calculating nr_active. | 32 | * to calculating nr_active. |
37 | * | 33 | * |
38 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 | 34 | * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 |
39 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } | 35 | * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } |
40 | * | 36 | * |
41 | * So assuming nr_active := 0 when we start out -- true per definition, we | 37 | * So assuming nr_active := 0 when we start out -- true per definition, we |
42 | * can simply take per-cpu deltas and fold those into a global accumulate | 38 | * can simply take per-CPU deltas and fold those into a global accumulate |
43 | * to obtain the same result. See calc_load_fold_active(). | 39 | * to obtain the same result. See calc_load_fold_active(). |
44 | * | 40 | * |
45 | * Furthermore, in order to avoid synchronizing all per-cpu delta folding | 41 | * Furthermore, in order to avoid synchronizing all per-CPU delta folding |
46 | * across the machine, we assume 10 ticks is sufficient time for every | 42 | * across the machine, we assume 10 ticks is sufficient time for every |
47 | * cpu to have completed this task. | 43 | * CPU to have completed this task. |
48 | * | 44 | * |
49 | * This places an upper-bound on the IRQ-off latency of the machine. Then | 45 | * This places an upper-bound on the IRQ-off latency of the machine. Then |
50 | * again, being late doesn't lose the delta, just wrecks the sample. | 46 | * again, being late doesn't lose the delta, just wrecks the sample. |
51 | * | 47 | * |
52 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because | 48 | * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because |
53 | * this would add another cross-cpu cacheline miss and atomic operation | 49 | * this would add another cross-CPU cacheline miss and atomic operation |
54 | * to the wakeup path. Instead we increment on whatever cpu the task ran | 50 | * to the wakeup path. Instead we increment on whatever CPU the task ran |
55 | * when it went into uninterruptible state and decrement on whatever cpu | 51 | * when it went into uninterruptible state and decrement on whatever CPU |
56 | * did the wakeup. This means that only the sum of nr_uninterruptible over | 52 | * did the wakeup. This means that only the sum of nr_uninterruptible over |
57 | * all cpus yields the correct result. | 53 | * all CPUs yields the correct result. |
58 | * | 54 | * |
59 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. | 55 | * This covers the NO_HZ=n code, for extra head-aches, see the comment below. |
60 | */ | 56 | */ |
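The two-line identity above, written out in conventional notation (same content, with x_i(t_0) := 0):

  \sum_i x_i(t) \;=\; \sum_i \bigl( x_i(t) - x_i(t_0) \bigr)
               \;=\; \sum_i \sum_{j \ge 1} \bigl( x_i(t_j) - x_i(t_{j-1}) \bigr)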
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
115 | * Handle NO_HZ for the global load-average. | 111 | * Handle NO_HZ for the global load-average. |
116 | * | 112 | * |
117 | * Since the above described distributed algorithm to compute the global | 113 | * Since the above described distributed algorithm to compute the global |
118 | * load-average relies on per-cpu sampling from the tick, it is affected by | 114 | * load-average relies on per-CPU sampling from the tick, it is affected by |
119 | * NO_HZ. | 115 | * NO_HZ. |
120 | * | 116 | * |
121 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon | 117 | * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon |
122 | * entering NO_HZ state such that we can include this as an 'extra' cpu delta | 118 | * entering NO_HZ state such that we can include this as an 'extra' CPU delta |
123 | * when we read the global state. | 119 | * when we read the global state. |
124 | * | 120 | * |
125 | * Obviously reality has to ruin such a delightfully simple scheme: | 121 | * Obviously reality has to ruin such a delightfully simple scheme: |
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active) | |||
146 | * busy state. | 142 | * busy state. |
147 | * | 143 | * |
148 | * This is solved by pushing the window forward, and thus skipping the | 144 | * This is solved by pushing the window forward, and thus skipping the |
149 | * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which | 145 | * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which |
150 | * was in effect at the time the window opened). This also solves the issue | 146 | * was in effect at the time the window opened). This also solves the issue |
151 | * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ | 147 | * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ |
152 | * intervals. | 148 | * intervals. |
153 | * | 149 | * |
154 | * When making the ILB scale, we should try to pull this in as well. | 150 | * When making the ILB scale, we should try to pull this in as well. |
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp, | |||
299 | } | 295 | } |
300 | 296 | ||
301 | /* | 297 | /* |
302 | * NO_HZ can leave us missing all per-cpu ticks calling | 298 | * NO_HZ can leave us missing all per-CPU ticks calling |
303 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into | 299 | * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into |
304 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold | 300 | * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold |
305 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. | 301 | * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. |
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks) | |||
363 | return; | 359 | return; |
364 | 360 | ||
365 | /* | 361 | /* |
366 | * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. | 362 | * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs. |
367 | */ | 363 | */ |
368 | delta = calc_load_nohz_fold(); | 364 | delta = calc_load_nohz_fold(); |
369 | if (delta) | 365 | if (delta) |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index 5d0762633639..76e0eaf4654e 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c | |||
@@ -13,32 +13,25 @@ | |||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
14 | * GNU General Public License for more details. | 14 | * GNU General Public License for more details. |
15 | */ | 15 | */ |
16 | 16 | #include "sched.h" | |
17 | #include <linux/syscalls.h> | ||
18 | #include <linux/membarrier.h> | ||
19 | #include <linux/tick.h> | ||
20 | #include <linux/cpumask.h> | ||
21 | #include <linux/atomic.h> | ||
22 | |||
23 | #include "sched.h" /* for cpu_rq(). */ | ||
24 | 17 | ||
25 | /* | 18 | /* |
26 | * Bitmask made from a "or" of all commands within enum membarrier_cmd, | 19 | * Bitmask made from a "or" of all commands within enum membarrier_cmd, |
27 | * except MEMBARRIER_CMD_QUERY. | 20 | * except MEMBARRIER_CMD_QUERY. |
28 | */ | 21 | */ |
29 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE | 22 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE |
30 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ | 23 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ |
31 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ | 24 | (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ |
32 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) | 25 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) |
33 | #else | 26 | #else |
34 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 | 27 | #define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 |
35 | #endif | 28 | #endif |
36 | 29 | ||
37 | #define MEMBARRIER_CMD_BITMASK \ | 30 | #define MEMBARRIER_CMD_BITMASK \ |
38 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ | 31 | (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ |
39 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ | 32 | | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ |
40 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ | 33 | | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ |
41 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ | 34 | | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ |
42 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) | 35 | | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) |
43 | 36 | ||
44 | static void ipi_mb(void *info) | 37 | static void ipi_mb(void *info) |
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void) | |||
85 | */ | 78 | */ |
86 | if (cpu == raw_smp_processor_id()) | 79 | if (cpu == raw_smp_processor_id()) |
87 | continue; | 80 | continue; |
81 | |||
88 | rcu_read_lock(); | 82 | rcu_read_lock(); |
89 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 83 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); |
90 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | 84 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & |
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags) | |||
188 | * rq->curr modification in scheduler. | 182 | * rq->curr modification in scheduler. |
189 | */ | 183 | */ |
190 | smp_mb(); /* exit from system call is not a mb */ | 184 | smp_mb(); /* exit from system call is not a mb */ |
185 | |||
191 | return 0; | 186 | return 0; |
192 | } | 187 | } |
193 | 188 | ||
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void) | |||
219 | } | 214 | } |
220 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | 215 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, |
221 | &mm->membarrier_state); | 216 | &mm->membarrier_state); |
217 | |||
222 | return 0; | 218 | return 0; |
223 | } | 219 | } |
224 | 220 | ||
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags) | |||
253 | synchronize_sched(); | 249 | synchronize_sched(); |
254 | } | 250 | } |
255 | atomic_or(state, &mm->membarrier_state); | 251 | atomic_or(state, &mm->membarrier_state); |
252 | |||
256 | return 0; | 253 | return 0; |
257 | } | 254 | } |
258 | 255 | ||
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index aad49451584e..4f4fd3b157f1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c | |||
@@ -3,12 +3,8 @@ | |||
3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR | 3 | * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR |
4 | * policies) | 4 | * policies) |
5 | */ | 5 | */ |
6 | |||
7 | #include "sched.h" | 6 | #include "sched.h" |
8 | 7 | ||
9 | #include <linux/slab.h> | ||
10 | #include <linux/irq_work.h> | ||
11 | |||
12 | int sched_rr_timeslice = RR_TIMESLICE; | 8 | int sched_rr_timeslice = RR_TIMESLICE; |
13 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; | 9 | int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; |
14 | 10 | ||
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head); | |||
359 | static void push_rt_tasks(struct rq *); | 355 | static void push_rt_tasks(struct rq *); |
360 | static void pull_rt_task(struct rq *); | 356 | static void pull_rt_task(struct rq *); |
361 | 357 | ||
362 | static inline void queue_push_tasks(struct rq *rq) | 358 | static inline void rt_queue_push_tasks(struct rq *rq) |
363 | { | 359 | { |
364 | if (!has_pushable_tasks(rq)) | 360 | if (!has_pushable_tasks(rq)) |
365 | return; | 361 | return; |
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq) | |||
367 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); | 363 | queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); |
368 | } | 364 | } |
369 | 365 | ||
370 | static inline void queue_pull_task(struct rq *rq) | 366 | static inline void rt_queue_pull_task(struct rq *rq) |
371 | { | 367 | { |
372 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); | 368 | queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); |
373 | } | 369 | } |
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq) | |||
425 | { | 421 | { |
426 | } | 422 | } |
427 | 423 | ||
428 | static inline void queue_push_tasks(struct rq *rq) | 424 | static inline void rt_queue_push_tasks(struct rq *rq) |
429 | { | 425 | { |
430 | } | 426 | } |
431 | #endif /* CONFIG_SMP */ | 427 | #endif /* CONFIG_SMP */ |
@@ -1453,9 +1449,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) | |||
1453 | return; | 1449 | return; |
1454 | 1450 | ||
1455 | /* | 1451 | /* |
1456 | * There appears to be other cpus that can accept | 1452 | * There appear to be other CPUs that can accept |
1457 | * current and none to run 'p', so lets reschedule | 1453 | * the current task but none can run 'p', so lets reschedule |
1458 | * to try and push current away: | 1454 | * to try and push the current task away: |
1459 | */ | 1455 | */ |
1460 | requeue_task_rt(rq, p, 1); | 1456 | requeue_task_rt(rq, p, 1); |
1461 | resched_curr(rq); | 1457 | resched_curr(rq); |
@@ -1569,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) | |||
1569 | /* The running task is never eligible for pushing */ | 1565 | /* The running task is never eligible for pushing */ |
1570 | dequeue_pushable_task(rq, p); | 1566 | dequeue_pushable_task(rq, p); |
1571 | 1567 | ||
1572 | queue_push_tasks(rq); | 1568 | rt_queue_push_tasks(rq); |
1573 | 1569 | ||
1574 | return p; | 1570 | return p; |
1575 | } | 1571 | } |
@@ -1596,12 +1592,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) | |||
1596 | if (!task_running(rq, p) && | 1592 | if (!task_running(rq, p) && |
1597 | cpumask_test_cpu(cpu, &p->cpus_allowed)) | 1593 | cpumask_test_cpu(cpu, &p->cpus_allowed)) |
1598 | return 1; | 1594 | return 1; |
1595 | |||
1599 | return 0; | 1596 | return 0; |
1600 | } | 1597 | } |
1601 | 1598 | ||
1602 | /* | 1599 | /* |
1603 | * Return the highest pushable rq's task, which is suitable to be executed | 1600 | * Return the highest pushable rq's task, which is suitable to be executed |
1604 | * on the cpu, NULL otherwise | 1601 | * on the CPU, NULL otherwise |
1605 | */ | 1602 | */ |
1606 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) | 1603 | static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) |
1607 | { | 1604 | { |
@@ -1639,11 +1636,11 @@ static int find_lowest_rq(struct task_struct *task) | |||
1639 | return -1; /* No targets found */ | 1636 | return -1; /* No targets found */ |
1640 | 1637 | ||
1641 | /* | 1638 | /* |
1642 | * At this point we have built a mask of cpus representing the | 1639 | * At this point we have built a mask of CPUs representing the |
1643 | * lowest priority tasks in the system. Now we want to elect | 1640 | * lowest priority tasks in the system. Now we want to elect |
1644 | * the best one based on our affinity and topology. | 1641 | * the best one based on our affinity and topology. |
1645 | * | 1642 | * |
1646 | * We prioritize the last cpu that the task executed on since | 1643 | * We prioritize the last CPU that the task executed on since |
1647 | * it is most likely cache-hot in that location. | 1644 | * it is most likely cache-hot in that location. |
1648 | */ | 1645 | */ |
1649 | if (cpumask_test_cpu(cpu, lowest_mask)) | 1646 | if (cpumask_test_cpu(cpu, lowest_mask)) |
@@ -1651,7 +1648,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1651 | 1648 | ||
1652 | /* | 1649 | /* |
1653 | * Otherwise, we consult the sched_domains span maps to figure | 1650 | * Otherwise, we consult the sched_domains span maps to figure |
1654 | * out which cpu is logically closest to our hot cache data. | 1651 | * out which CPU is logically closest to our hot cache data. |
1655 | */ | 1652 | */ |
1656 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) | 1653 | if (!cpumask_test_cpu(this_cpu, lowest_mask)) |
1657 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ | 1654 | this_cpu = -1; /* Skip this_cpu opt if not among lowest */ |
@@ -1692,6 +1689,7 @@ static int find_lowest_rq(struct task_struct *task) | |||
1692 | cpu = cpumask_any(lowest_mask); | 1689 | cpu = cpumask_any(lowest_mask); |
1693 | if (cpu < nr_cpu_ids) | 1690 | if (cpu < nr_cpu_ids) |
1694 | return cpu; | 1691 | return cpu; |
1692 | |||
1695 | return -1; | 1693 | return -1; |
1696 | } | 1694 | } |
1697 | 1695 | ||
@@ -1827,7 +1825,7 @@ retry: | |||
1827 | * The task hasn't migrated, and is still the next | 1825 | * The task hasn't migrated, and is still the next |
1828 | * eligible task, but we failed to find a run-queue | 1826 | * eligible task, but we failed to find a run-queue |
1829 | * to push it to. Do not retry in this case, since | 1827 | * to push it to. Do not retry in this case, since |
1830 | * other cpus will pull from us when ready. | 1828 | * other CPUs will pull from us when ready. |
1831 | */ | 1829 | */ |
1832 | goto out; | 1830 | goto out; |
1833 | } | 1831 | } |
@@ -1919,7 +1917,7 @@ static int rto_next_cpu(struct root_domain *rd) | |||
1919 | * rto_next_cpu() will simply return the first CPU found in | 1917 | * rto_next_cpu() will simply return the first CPU found in |
1920 | * the rto_mask. | 1918 | * the rto_mask. |
1921 | * | 1919 | * |
1922 | * If rto_next_cpu() is called with rto_cpu set to a valid cpu, it | 1920 | * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it |
1923 | * will return the next CPU found in the rto_mask. | 1921 | * will return the next CPU found in the rto_mask. |
1924 | * | 1922 | * |
1925 | * If there are no more CPUs left in the rto_mask, then a check is made | 1923 | * If there are no more CPUs left in the rto_mask, then a check is made |
@@ -1980,7 +1978,7 @@ static void tell_cpu_to_push(struct rq *rq) | |||
1980 | raw_spin_lock(&rq->rd->rto_lock); | 1978 | raw_spin_lock(&rq->rd->rto_lock); |
1981 | 1979 | ||
1982 | /* | 1980 | /* |
1983 | * The rto_cpu is updated under the lock, if it has a valid cpu | 1981 | * The rto_cpu is updated under the lock, if it has a valid CPU |
1984 | * then the IPI is still running and will continue due to the | 1982 | * then the IPI is still running and will continue due to the |
1985 | * update to loop_next, and nothing needs to be done here. | 1983 | * update to loop_next, and nothing needs to be done here. |
1986 | * Otherwise it is finishing up and an ipi needs to be sent. | 1984 | * Otherwise it is finishing up and an ipi needs to be sent. |
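The rto_next_cpu()/tell_cpu_to_push() comments above describe a cursor that advances through rto_mask one CPU per IPI pass, returning a "no more" value once the pass is finished. A minimal userspace model of that cursor (plain bitmask instead of a cpumask; next_overloaded_cpu() is an invented name, not kernel API):

#include <stdio.h>
#include <stdint.h>

/* Cursor over a bitmask of overloaded CPUs: start from the beginning when
 * cur is -1, otherwise continue after the previously returned CPU. */
static int next_overloaded_cpu(uint64_t rto_mask, int cur)
{
        for (int cpu = cur + 1; cpu < 64; cpu++)
                if (rto_mask & (1ULL << cpu))
                        return cpu;
        return -1;      /* no more CPUs: the IPI pass is finished */
}

int main(void)
{
        uint64_t rto_mask = (1ULL << 1) | (1ULL << 4) | (1ULL << 7);
        int cpu = -1;

        /* Each iteration models one IPI hop to the next overloaded CPU. */
        while ((cpu = next_overloaded_cpu(rto_mask, cpu)) >= 0)
                printf("send push IPI to CPU %d\n", cpu);
        return 0;
}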
@@ -2105,7 +2103,7 @@ static void pull_rt_task(struct rq *this_rq) | |||
2105 | 2103 | ||
2106 | /* | 2104 | /* |
2107 | * There's a chance that p is higher in priority | 2105 | * There's a chance that p is higher in priority |
2108 | * than what's currently running on its cpu. | 2106 | * than what's currently running on its CPU. |
2109 | * This is just that p is waking up and hasn't | 2107 | * This is just that p is waking up and hasn't |
2110 | * had a chance to schedule. We only pull | 2108 | * had a chance to schedule. We only pull |
2111 | * p if it is lower in priority than the | 2109 | * p if it is lower in priority than the |
@@ -2187,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p) | |||
2187 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) | 2185 | if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) |
2188 | return; | 2186 | return; |
2189 | 2187 | ||
2190 | queue_pull_task(rq); | 2188 | rt_queue_pull_task(rq); |
2191 | } | 2189 | } |
2192 | 2190 | ||
2193 | void __init init_sched_rt_class(void) | 2191 | void __init init_sched_rt_class(void) |
@@ -2218,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p) | |||
2218 | if (task_on_rq_queued(p) && rq->curr != p) { | 2216 | if (task_on_rq_queued(p) && rq->curr != p) { |
2219 | #ifdef CONFIG_SMP | 2217 | #ifdef CONFIG_SMP |
2220 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) | 2218 | if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) |
2221 | queue_push_tasks(rq); | 2219 | rt_queue_push_tasks(rq); |
2222 | #endif /* CONFIG_SMP */ | 2220 | #endif /* CONFIG_SMP */ |
2223 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) | 2221 | if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) |
2224 | resched_curr(rq); | 2222 | resched_curr(rq); |
@@ -2242,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio) | |||
2242 | * may need to pull tasks to this runqueue. | 2240 | * may need to pull tasks to this runqueue. |
2243 | */ | 2241 | */ |
2244 | if (oldprio < p->prio) | 2242 | if (oldprio < p->prio) |
2245 | queue_pull_task(rq); | 2243 | rt_queue_pull_task(rq); |
2246 | 2244 | ||
2247 | /* | 2245 | /* |
2248 | * If there's a higher priority task waiting to run | 2246 | * If there's a higher priority task waiting to run |
@@ -2292,6 +2290,14 @@ static void watchdog(struct rq *rq, struct task_struct *p) | |||
2292 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } | 2290 | static inline void watchdog(struct rq *rq, struct task_struct *p) { } |
2293 | #endif | 2291 | #endif |
2294 | 2292 | ||
2293 | /* | ||
2294 | * scheduler tick hitting a task of our scheduling class. | ||
2295 | * | ||
2296 | * NOTE: This function can be called remotely by the tick offload that | ||
2297 | * goes along with full dynticks. Therefore no local assumption can be made | ||
2298 | * and everything must be accessed through the @rq and @curr passed in | ||
2299 | * parameters. | ||
2300 | */ | ||
2295 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | 2301 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) |
2296 | { | 2302 | { |
2297 | struct sched_rt_entity *rt_se = &p->rt; | 2303 | struct sched_rt_entity *rt_se = &p->rt; |
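The NOTE added above task_tick_rt() boils down to one rule: with the residual 1Hz tick offloaded, the handler may execute on a housekeeping CPU on behalf of another CPU, so it must only touch the rq and task it was handed. A toy userspace illustration of that calling shape (the struct names and the scenario are invented):

#include <stdio.h>

struct task     { const char *name; int timeslice; };
struct runqueue { int cpu; struct task *curr; };

/* Correct shape per the comment above: operate only on the rq/curr that
 * were passed in, never on whatever happens to run on the calling CPU. */
static void task_tick(struct runqueue *rq, struct task *curr)
{
        if (--curr->timeslice <= 0)
                printf("CPU %d: %s used up its timeslice\n", rq->cpu, curr->name);
}

int main(void)
{
        /* A housekeeping CPU ticks on behalf of isolated CPU 3. */
        struct task     rt_worker = { "rt-worker", 1 };
        struct runqueue remote_rq = { 3, &rt_worker };

        task_tick(&remote_rq, remote_rq.curr);  /* ticks CPU 3's task, not the caller's */
        return 0;
}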
@@ -2685,6 +2691,7 @@ int sched_rr_handler(struct ctl_table *table, int write, | |||
2685 | msecs_to_jiffies(sysctl_sched_rr_timeslice); | 2691 | msecs_to_jiffies(sysctl_sched_rr_timeslice); |
2686 | } | 2692 | } |
2687 | mutex_unlock(&mutex); | 2693 | mutex_unlock(&mutex); |
2694 | |||
2688 | return ret; | 2695 | return ret; |
2689 | } | 2696 | } |
2690 | 2697 | ||
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index fb5fc458547f..23ba4dd76ac4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h | |||
@@ -1,39 +1,73 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0 */ | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | 2 | /* | |
3 | * Scheduler internal types and methods: | ||
4 | */ | ||
3 | #include <linux/sched.h> | 5 | #include <linux/sched.h> |
6 | |||
4 | #include <linux/sched/autogroup.h> | 7 | #include <linux/sched/autogroup.h> |
5 | #include <linux/sched/sysctl.h> | ||
6 | #include <linux/sched/topology.h> | ||
7 | #include <linux/sched/rt.h> | ||
8 | #include <linux/sched/deadline.h> | ||
9 | #include <linux/sched/clock.h> | 8 | #include <linux/sched/clock.h> |
10 | #include <linux/sched/wake_q.h> | 9 | #include <linux/sched/coredump.h> |
11 | #include <linux/sched/signal.h> | ||
12 | #include <linux/sched/numa_balancing.h> | ||
13 | #include <linux/sched/mm.h> | ||
14 | #include <linux/sched/cpufreq.h> | 10 | #include <linux/sched/cpufreq.h> |
15 | #include <linux/sched/stat.h> | 11 | #include <linux/sched/cputime.h> |
16 | #include <linux/sched/nohz.h> | 12 | #include <linux/sched/deadline.h> |
17 | #include <linux/sched/debug.h> | 13 | #include <linux/sched/debug.h> |
18 | #include <linux/sched/hotplug.h> | 14 | #include <linux/sched/hotplug.h> |
15 | #include <linux/sched/idle.h> | ||
16 | #include <linux/sched/init.h> | ||
17 | #include <linux/sched/isolation.h> | ||
18 | #include <linux/sched/jobctl.h> | ||
19 | #include <linux/sched/loadavg.h> | ||
20 | #include <linux/sched/mm.h> | ||
21 | #include <linux/sched/nohz.h> | ||
22 | #include <linux/sched/numa_balancing.h> | ||
23 | #include <linux/sched/prio.h> | ||
24 | #include <linux/sched/rt.h> | ||
25 | #include <linux/sched/signal.h> | ||
26 | #include <linux/sched/stat.h> | ||
27 | #include <linux/sched/sysctl.h> | ||
19 | #include <linux/sched/task.h> | 28 | #include <linux/sched/task.h> |
20 | #include <linux/sched/task_stack.h> | 29 | #include <linux/sched/task_stack.h> |
21 | #include <linux/sched/cputime.h> | 30 | #include <linux/sched/topology.h> |
22 | #include <linux/sched/init.h> | 31 | #include <linux/sched/user.h> |
32 | #include <linux/sched/wake_q.h> | ||
33 | #include <linux/sched/xacct.h> | ||
34 | |||
35 | #include <uapi/linux/sched/types.h> | ||
23 | 36 | ||
24 | #include <linux/u64_stats_sync.h> | ||
25 | #include <linux/kernel_stat.h> | ||
26 | #include <linux/binfmts.h> | 37 | #include <linux/binfmts.h> |
27 | #include <linux/mutex.h> | 38 | #include <linux/blkdev.h> |
28 | #include <linux/spinlock.h> | 39 | #include <linux/compat.h> |
40 | #include <linux/context_tracking.h> | ||
41 | #include <linux/cpufreq.h> | ||
42 | #include <linux/cpuidle.h> | ||
43 | #include <linux/cpuset.h> | ||
44 | #include <linux/ctype.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | #include <linux/delayacct.h> | ||
47 | #include <linux/init_task.h> | ||
48 | #include <linux/kprobes.h> | ||
49 | #include <linux/kthread.h> | ||
50 | #include <linux/membarrier.h> | ||
51 | #include <linux/migrate.h> | ||
52 | #include <linux/mmu_context.h> | ||
53 | #include <linux/nmi.h> | ||
54 | #include <linux/proc_fs.h> | ||
55 | #include <linux/prefetch.h> | ||
56 | #include <linux/profile.h> | ||
57 | #include <linux/rcupdate_wait.h> | ||
58 | #include <linux/security.h> | ||
59 | #include <linux/stackprotector.h> | ||
29 | #include <linux/stop_machine.h> | 60 | #include <linux/stop_machine.h> |
30 | #include <linux/irq_work.h> | 61 | #include <linux/suspend.h> |
31 | #include <linux/tick.h> | 62 | #include <linux/swait.h> |
32 | #include <linux/slab.h> | 63 | #include <linux/syscalls.h> |
33 | #include <linux/cgroup.h> | 64 | #include <linux/task_work.h> |
65 | #include <linux/tsacct_kern.h> | ||
66 | |||
67 | #include <asm/tlb.h> | ||
34 | 68 | ||
35 | #ifdef CONFIG_PARAVIRT | 69 | #ifdef CONFIG_PARAVIRT |
36 | #include <asm/paravirt.h> | 70 | # include <asm/paravirt.h> |
37 | #endif | 71 | #endif |
38 | 72 | ||
39 | #include "cpupri.h" | 73 | #include "cpupri.h" |
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
79 | * and does not change the user-interface for setting shares/weights. | 113 | * and does not change the user-interface for setting shares/weights. |
80 | * | 114 | * |
81 | * We increase resolution only if we have enough bits to allow this increased | 115 | * We increase resolution only if we have enough bits to allow this increased |
82 | * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are | 116 | * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit |
83 | * pretty high and the returns do not justify the increased costs. | 117 | * are pretty high and the returns do not justify the increased costs. |
84 | * | 118 | * |
85 | * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to | 119 | * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to |
86 | * increase coverage and consistency always enable it on 64bit platforms. | 120 | * increase coverage and consistency always enable it on 64-bit platforms. |
87 | */ | 121 | */ |
88 | #ifdef CONFIG_64BIT | 122 | #ifdef CONFIG_64BIT |
89 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) | 123 | # define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) |
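For a concrete sense of the resolution comment above: with the kernel's SCHED_FIXEDPOINT_SHIFT of 10 (quoted from memory, treat it as an assumption), doubling the shift puts the nice-0 load at 1 << 20 on 64-bit versus 1 << 10 on 32-bit:

#include <stdio.h>

#define SCHED_FIXEDPOINT_SHIFT 10      /* assumed kernel value */

int main(void)
{
        long shift64 = SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT;
        long shift32 = SCHED_FIXEDPOINT_SHIFT;

        printf("NICE_0_LOAD: 64-bit %ld, 32-bit %ld\n",
               1L << shift64, 1L << shift32);   /* 1048576 vs 1024 */
        return 0;
}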
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { } | |||
111 | * 10 -> just above 1us | 145 | * 10 -> just above 1us |
112 | * 9 -> just above 0.5us | 146 | * 9 -> just above 0.5us |
113 | */ | 147 | */ |
114 | #define DL_SCALE (10) | 148 | #define DL_SCALE 10 |
115 | |||
116 | /* | ||
117 | * These are the 'tuning knobs' of the scheduler: | ||
118 | */ | ||
119 | 149 | ||
120 | /* | 150 | /* |
121 | * single value that denotes runtime == period, ie unlimited time. | 151 | * Single value that denotes runtime == period, ie unlimited time. |
122 | */ | 152 | */ |
123 | #define RUNTIME_INF ((u64)~0ULL) | 153 | #define RUNTIME_INF ((u64)~0ULL) |
124 | 154 | ||
125 | static inline int idle_policy(int policy) | 155 | static inline int idle_policy(int policy) |
126 | { | 156 | { |
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p); | |||
235 | * control. | 265 | * control. |
236 | */ | 266 | */ |
237 | struct dl_bandwidth { | 267 | struct dl_bandwidth { |
238 | raw_spinlock_t dl_runtime_lock; | 268 | raw_spinlock_t dl_runtime_lock; |
239 | u64 dl_runtime; | 269 | u64 dl_runtime; |
240 | u64 dl_period; | 270 | u64 dl_period; |
241 | }; | 271 | }; |
242 | 272 | ||
243 | static inline int dl_bandwidth_enabled(void) | 273 | static inline int dl_bandwidth_enabled(void) |
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void) | |||
246 | } | 276 | } |
247 | 277 | ||
248 | struct dl_bw { | 278 | struct dl_bw { |
249 | raw_spinlock_t lock; | 279 | raw_spinlock_t lock; |
250 | u64 bw, total_bw; | 280 | u64 bw; |
281 | u64 total_bw; | ||
251 | }; | 282 | }; |
252 | 283 | ||
253 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); | 284 | static inline void __dl_update(struct dl_bw *dl_b, s64 bw); |
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw) | |||
273 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; | 304 | dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; |
274 | } | 305 | } |
275 | 306 | ||
276 | void dl_change_utilization(struct task_struct *p, u64 new_bw); | 307 | extern void dl_change_utilization(struct task_struct *p, u64 new_bw); |
277 | extern void init_dl_bw(struct dl_bw *dl_b); | 308 | extern void init_dl_bw(struct dl_bw *dl_b); |
278 | extern int sched_dl_global_validate(void); | 309 | extern int sched_dl_global_validate(void); |
279 | extern void sched_dl_do_global(void); | 310 | extern void sched_dl_do_global(void); |
280 | extern int sched_dl_overflow(struct task_struct *p, int policy, | 311 | extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr); |
281 | const struct sched_attr *attr); | ||
282 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); | 312 | extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); |
283 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); | 313 | extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); |
284 | extern bool __checkparam_dl(const struct sched_attr *attr); | 314 | extern bool __checkparam_dl(const struct sched_attr *attr); |
285 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); | 315 | extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); |
286 | extern int dl_task_can_attach(struct task_struct *p, | 316 | extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed); |
287 | const struct cpumask *cs_cpus_allowed); | 317 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); |
288 | extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, | ||
289 | const struct cpumask *trial); | ||
290 | extern bool dl_cpu_busy(unsigned int cpu); | 318 | extern bool dl_cpu_busy(unsigned int cpu); |
291 | 319 | ||
292 | #ifdef CONFIG_CGROUP_SCHED | 320 | #ifdef CONFIG_CGROUP_SCHED |
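The expression visible in __dl_overflow() above is the SCHED_DEADLINE admission test: the would-be total utilization (total_bw - old_bw + new_bw) must stay within the allowed per-CPU bandwidth times the CPU count. A worked userspace example in the BW_UNIT fixed point defined later in this header; the 95% limit and the utilization numbers are assumptions chosen for illustration:

#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT 20
#define BW_UNIT  (1 << BW_SHIFT)

/* Admission test shape from __dl_overflow(): does replacing old_bw with
 * new_bw push total utilization past the allowed bandwidth * nr_cpus? */
static int dl_overflow(uint64_t allowed_bw, int cpus,
                       uint64_t total_bw, uint64_t old_bw, uint64_t new_bw)
{
        return allowed_bw * cpus < total_bw - old_bw + new_bw;
}

int main(void)
{
        uint64_t allowed = (uint64_t)BW_UNIT * 95 / 100;  /* assume 95% per CPU */
        uint64_t total   = (uint64_t)BW_UNIT * 150 / 100; /* 1.5 CPUs already admitted */
        uint64_t ask     = (uint64_t)BW_UNIT * 60 / 100;  /* new task wants 0.6 of a CPU */

        printf("2 CPUs: %s\n", dl_overflow(allowed, 2, total, 0, ask) ?
               "rejected" : "admitted");   /* 1.5 + 0.6 = 2.1 > 1.9 -> rejected */
        printf("4 CPUs: %s\n", dl_overflow(allowed, 4, total, 0, ask) ?
               "rejected" : "admitted");   /* 2.1 < 3.8 -> admitted */
        return 0;
}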
@@ -300,32 +328,36 @@ extern struct list_head task_groups; | |||
300 | 328 | ||
301 | struct cfs_bandwidth { | 329 | struct cfs_bandwidth { |
302 | #ifdef CONFIG_CFS_BANDWIDTH | 330 | #ifdef CONFIG_CFS_BANDWIDTH |
303 | raw_spinlock_t lock; | 331 | raw_spinlock_t lock; |
304 | ktime_t period; | 332 | ktime_t period; |
305 | u64 quota, runtime; | 333 | u64 quota; |
306 | s64 hierarchical_quota; | 334 | u64 runtime; |
307 | u64 runtime_expires; | 335 | s64 hierarchical_quota; |
308 | 336 | u64 runtime_expires; | |
309 | int idle, period_active; | 337 | |
310 | struct hrtimer period_timer, slack_timer; | 338 | int idle; |
311 | struct list_head throttled_cfs_rq; | 339 | int period_active; |
312 | 340 | struct hrtimer period_timer; | |
313 | /* statistics */ | 341 | struct hrtimer slack_timer; |
314 | int nr_periods, nr_throttled; | 342 | struct list_head throttled_cfs_rq; |
315 | u64 throttled_time; | 343 | |
344 | /* Statistics: */ | ||
345 | int nr_periods; | ||
346 | int nr_throttled; | ||
347 | u64 throttled_time; | ||
316 | #endif | 348 | #endif |
317 | }; | 349 | }; |
318 | 350 | ||
319 | /* task group related information */ | 351 | /* Task group related information */ |
320 | struct task_group { | 352 | struct task_group { |
321 | struct cgroup_subsys_state css; | 353 | struct cgroup_subsys_state css; |
322 | 354 | ||
323 | #ifdef CONFIG_FAIR_GROUP_SCHED | 355 | #ifdef CONFIG_FAIR_GROUP_SCHED |
324 | /* schedulable entities of this group on each cpu */ | 356 | /* schedulable entities of this group on each CPU */ |
325 | struct sched_entity **se; | 357 | struct sched_entity **se; |
326 | /* runqueue "owned" by this group on each cpu */ | 358 | /* runqueue "owned" by this group on each CPU */ |
327 | struct cfs_rq **cfs_rq; | 359 | struct cfs_rq **cfs_rq; |
328 | unsigned long shares; | 360 | unsigned long shares; |
329 | 361 | ||
330 | #ifdef CONFIG_SMP | 362 | #ifdef CONFIG_SMP |
331 | /* | 363 | /* |
@@ -333,29 +365,29 @@ struct task_group { | |||
333 | * it in its own cacheline separated from the fields above which | 365 | * it in its own cacheline separated from the fields above which |
334 | * will also be accessed at each tick. | 366 | * will also be accessed at each tick. |
335 | */ | 367 | */ |
336 | atomic_long_t load_avg ____cacheline_aligned; | 368 | atomic_long_t load_avg ____cacheline_aligned; |
337 | #endif | 369 | #endif |
338 | #endif | 370 | #endif |
339 | 371 | ||
340 | #ifdef CONFIG_RT_GROUP_SCHED | 372 | #ifdef CONFIG_RT_GROUP_SCHED |
341 | struct sched_rt_entity **rt_se; | 373 | struct sched_rt_entity **rt_se; |
342 | struct rt_rq **rt_rq; | 374 | struct rt_rq **rt_rq; |
343 | 375 | ||
344 | struct rt_bandwidth rt_bandwidth; | 376 | struct rt_bandwidth rt_bandwidth; |
345 | #endif | 377 | #endif |
346 | 378 | ||
347 | struct rcu_head rcu; | 379 | struct rcu_head rcu; |
348 | struct list_head list; | 380 | struct list_head list; |
349 | 381 | ||
350 | struct task_group *parent; | 382 | struct task_group *parent; |
351 | struct list_head siblings; | 383 | struct list_head siblings; |
352 | struct list_head children; | 384 | struct list_head children; |
353 | 385 | ||
354 | #ifdef CONFIG_SCHED_AUTOGROUP | 386 | #ifdef CONFIG_SCHED_AUTOGROUP |
355 | struct autogroup *autogroup; | 387 | struct autogroup *autogroup; |
356 | #endif | 388 | #endif |
357 | 389 | ||
358 | struct cfs_bandwidth cfs_bandwidth; | 390 | struct cfs_bandwidth cfs_bandwidth; |
359 | }; | 391 | }; |
360 | 392 | ||
361 | #ifdef CONFIG_FAIR_GROUP_SCHED | 393 | #ifdef CONFIG_FAIR_GROUP_SCHED |
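The task_group fields above keep one scheduling entity and one group-owned runqueue per CPU, indexed by CPU number, plus a single shares value for the whole group. A bare-bones sketch of that shape in userspace C (the struct names and the seeding of per-CPU weights from shares are illustrative only):

#include <stdio.h>

#define NR_CPUS 4

struct entity { unsigned long weight; };
struct grp_rq { unsigned int nr_running; };

/* One entity and one group-owned runqueue per CPU, mirroring the
 * tg->se[cpu] / tg->cfs_rq[cpu] pointers in the structure above. */
struct group {
        struct entity se[NR_CPUS];
        struct grp_rq rq[NR_CPUS];
        unsigned long shares;
};

int main(void)
{
        struct group g = { .shares = 1024 };

        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                g.se[cpu].weight = g.shares;    /* seed each per-CPU entity (illustrative) */

        printf("per-CPU group entity weight: %lu\n", g.se[0].weight);
        return 0;
}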
@@ -369,8 +401,8 @@ struct task_group { | |||
369 | * (The default weight is 1024 - so there's no practical | 401 | * (The default weight is 1024 - so there's no practical |
370 | * limitation from this.) | 402 | * limitation from this.) |
371 | */ | 403 | */ |
372 | #define MIN_SHARES (1UL << 1) | 404 | #define MIN_SHARES (1UL << 1) |
373 | #define MAX_SHARES (1UL << 18) | 405 | #define MAX_SHARES (1UL << 18) |
374 | #endif | 406 | #endif |
375 | 407 | ||
376 | typedef int (*tg_visitor)(struct task_group *, void *); | 408 | typedef int (*tg_visitor)(struct task_group *, void *); |
@@ -443,35 +475,39 @@ struct cfs_bandwidth { }; | |||
443 | 475 | ||
444 | /* CFS-related fields in a runqueue */ | 476 | /* CFS-related fields in a runqueue */ |
445 | struct cfs_rq { | 477 | struct cfs_rq { |
446 | struct load_weight load; | 478 | struct load_weight load; |
447 | unsigned long runnable_weight; | 479 | unsigned long runnable_weight; |
448 | unsigned int nr_running, h_nr_running; | 480 | unsigned int nr_running; |
481 | unsigned int h_nr_running; | ||
449 | 482 | ||
450 | u64 exec_clock; | 483 | u64 exec_clock; |
451 | u64 min_vruntime; | 484 | u64 min_vruntime; |
452 | #ifndef CONFIG_64BIT | 485 | #ifndef CONFIG_64BIT |
453 | u64 min_vruntime_copy; | 486 | u64 min_vruntime_copy; |
454 | #endif | 487 | #endif |
455 | 488 | ||
456 | struct rb_root_cached tasks_timeline; | 489 | struct rb_root_cached tasks_timeline; |
457 | 490 | ||
458 | /* | 491 | /* |
459 | * 'curr' points to currently running entity on this cfs_rq. | 492 | * 'curr' points to currently running entity on this cfs_rq. |
460 | * It is set to NULL otherwise (i.e when none are currently running). | 493 | * It is set to NULL otherwise (i.e when none are currently running). |
461 | */ | 494 | */ |
462 | struct sched_entity *curr, *next, *last, *skip; | 495 | struct sched_entity *curr; |
496 | struct sched_entity *next; | ||
497 | struct sched_entity *last; | ||
498 | struct sched_entity *skip; | ||
463 | 499 | ||
464 | #ifdef CONFIG_SCHED_DEBUG | 500 | #ifdef CONFIG_SCHED_DEBUG |
465 | unsigned int nr_spread_over; | 501 | unsigned int nr_spread_over; |
466 | #endif | 502 | #endif |
467 | 503 | ||
468 | #ifdef CONFIG_SMP | 504 | #ifdef CONFIG_SMP |
469 | /* | 505 | /* |
470 | * CFS load tracking | 506 | * CFS load tracking |
471 | */ | 507 | */ |
472 | struct sched_avg avg; | 508 | struct sched_avg avg; |
473 | #ifndef CONFIG_64BIT | 509 | #ifndef CONFIG_64BIT |
474 | u64 load_last_update_time_copy; | 510 | u64 load_last_update_time_copy; |
475 | #endif | 511 | #endif |
476 | struct { | 512 | struct { |
477 | raw_spinlock_t lock ____cacheline_aligned; | 513 | raw_spinlock_t lock ____cacheline_aligned; |
@@ -482,9 +518,9 @@ struct cfs_rq { | |||
482 | } removed; | 518 | } removed; |
483 | 519 | ||
484 | #ifdef CONFIG_FAIR_GROUP_SCHED | 520 | #ifdef CONFIG_FAIR_GROUP_SCHED |
485 | unsigned long tg_load_avg_contrib; | 521 | unsigned long tg_load_avg_contrib; |
486 | long propagate; | 522 | long propagate; |
487 | long prop_runnable_sum; | 523 | long prop_runnable_sum; |
488 | 524 | ||
489 | /* | 525 | /* |
490 | * h_load = weight * f(tg) | 526 | * h_load = weight * f(tg) |
@@ -492,36 +528,38 @@ struct cfs_rq { | |||
492 | * Where f(tg) is the recursive weight fraction assigned to | 528 | * Where f(tg) is the recursive weight fraction assigned to |
493 | * this group. | 529 | * this group. |
494 | */ | 530 | */ |
495 | unsigned long h_load; | 531 | unsigned long h_load; |
496 | u64 last_h_load_update; | 532 | u64 last_h_load_update; |
497 | struct sched_entity *h_load_next; | 533 | struct sched_entity *h_load_next; |
498 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 534 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
499 | #endif /* CONFIG_SMP */ | 535 | #endif /* CONFIG_SMP */ |
500 | 536 | ||
501 | #ifdef CONFIG_FAIR_GROUP_SCHED | 537 | #ifdef CONFIG_FAIR_GROUP_SCHED |
502 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 538 | struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */ |
503 | 539 | ||
504 | /* | 540 | /* |
505 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 541 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in |
506 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 542 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
507 | * (like users, containers etc.) | 543 | * (like users, containers etc.) |
508 | * | 544 | * |
509 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 545 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU. |
510 | * list is used during load balance. | 546 | * This list is used during load balance. |
511 | */ | 547 | */ |
512 | int on_list; | 548 | int on_list; |
513 | struct list_head leaf_cfs_rq_list; | 549 | struct list_head leaf_cfs_rq_list; |
514 | struct task_group *tg; /* group that "owns" this runqueue */ | 550 | struct task_group *tg; /* group that "owns" this runqueue */ |
515 | 551 | ||
516 | #ifdef CONFIG_CFS_BANDWIDTH | 552 | #ifdef CONFIG_CFS_BANDWIDTH |
517 | int runtime_enabled; | 553 | int runtime_enabled; |
518 | u64 runtime_expires; | 554 | u64 runtime_expires; |
519 | s64 runtime_remaining; | 555 | s64 runtime_remaining; |
520 | 556 | ||
521 | u64 throttled_clock, throttled_clock_task; | 557 | u64 throttled_clock; |
522 | u64 throttled_clock_task_time; | 558 | u64 throttled_clock_task; |
523 | int throttled, throttle_count; | 559 | u64 throttled_clock_task_time; |
524 | struct list_head throttled_list; | 560 | int throttled; |
561 | int throttle_count; | ||
562 | struct list_head throttled_list; | ||
525 | #endif /* CONFIG_CFS_BANDWIDTH */ | 563 | #endif /* CONFIG_CFS_BANDWIDTH */ |
526 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 564 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
527 | }; | 565 | }; |
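The h_load comment above hides a recursion: a task's hierarchical load is its own weight multiplied by each ancestor group's fraction of its parent runqueue's load. A small worked example under that reading, with an invented two-level hierarchy and made-up fractions:

#include <stdio.h>

int main(void)
{
        /* Task weight 1024 inside group B, which holds half of group A's
         * load; group A in turn holds a quarter of the root runqueue load. */
        double task_weight = 1024.0;
        double f_groupB    = 0.50;      /* B's fraction of its parent */
        double f_groupA    = 0.25;      /* A's fraction of the root */

        double h_load = task_weight * f_groupB * f_groupA;
        printf("hierarchical load: %.0f\n", h_load);    /* 128 */
        return 0;
}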
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void) | |||
538 | 576 | ||
539 | /* Real-Time classes' related field in a runqueue: */ | 577 | /* Real-Time classes' related field in a runqueue: */ |
540 | struct rt_rq { | 578 | struct rt_rq { |
541 | struct rt_prio_array active; | 579 | struct rt_prio_array active; |
542 | unsigned int rt_nr_running; | 580 | unsigned int rt_nr_running; |
543 | unsigned int rr_nr_running; | 581 | unsigned int rr_nr_running; |
544 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED | 582 | #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED |
545 | struct { | 583 | struct { |
546 | int curr; /* highest queued rt task prio */ | 584 | int curr; /* highest queued rt task prio */ |
547 | #ifdef CONFIG_SMP | 585 | #ifdef CONFIG_SMP |
548 | int next; /* next highest */ | 586 | int next; /* next highest */ |
549 | #endif | 587 | #endif |
550 | } highest_prio; | 588 | } highest_prio; |
551 | #endif | 589 | #endif |
552 | #ifdef CONFIG_SMP | 590 | #ifdef CONFIG_SMP |
553 | unsigned long rt_nr_migratory; | 591 | unsigned long rt_nr_migratory; |
554 | unsigned long rt_nr_total; | 592 | unsigned long rt_nr_total; |
555 | int overloaded; | 593 | int overloaded; |
556 | struct plist_head pushable_tasks; | 594 | struct plist_head pushable_tasks; |
557 | #endif /* CONFIG_SMP */ | 595 | #endif /* CONFIG_SMP */ |
558 | int rt_queued; | 596 | int rt_queued; |
559 | 597 | ||
560 | int rt_throttled; | 598 | int rt_throttled; |
561 | u64 rt_time; | 599 | u64 rt_time; |
562 | u64 rt_runtime; | 600 | u64 rt_runtime; |
563 | /* Nests inside the rq lock: */ | 601 | /* Nests inside the rq lock: */ |
564 | raw_spinlock_t rt_runtime_lock; | 602 | raw_spinlock_t rt_runtime_lock; |
565 | 603 | ||
566 | #ifdef CONFIG_RT_GROUP_SCHED | 604 | #ifdef CONFIG_RT_GROUP_SCHED |
567 | unsigned long rt_nr_boosted; | 605 | unsigned long rt_nr_boosted; |
568 | 606 | ||
569 | struct rq *rq; | 607 | struct rq *rq; |
570 | struct task_group *tg; | 608 | struct task_group *tg; |
571 | #endif | 609 | #endif |
572 | }; | 610 | }; |
573 | 611 | ||
574 | /* Deadline class' related fields in a runqueue */ | 612 | /* Deadline class' related fields in a runqueue */ |
575 | struct dl_rq { | 613 | struct dl_rq { |
576 | /* runqueue is an rbtree, ordered by deadline */ | 614 | /* runqueue is an rbtree, ordered by deadline */ |
577 | struct rb_root_cached root; | 615 | struct rb_root_cached root; |
578 | 616 | ||
579 | unsigned long dl_nr_running; | 617 | unsigned long dl_nr_running; |
580 | 618 | ||
581 | #ifdef CONFIG_SMP | 619 | #ifdef CONFIG_SMP |
582 | /* | 620 | /* |
@@ -586,28 +624,28 @@ struct dl_rq { | |||
586 | * should migrate somewhere else. | 624 | * should migrate somewhere else. |
587 | */ | 625 | */ |
588 | struct { | 626 | struct { |
589 | u64 curr; | 627 | u64 curr; |
590 | u64 next; | 628 | u64 next; |
591 | } earliest_dl; | 629 | } earliest_dl; |
592 | 630 | ||
593 | unsigned long dl_nr_migratory; | 631 | unsigned long dl_nr_migratory; |
594 | int overloaded; | 632 | int overloaded; |
595 | 633 | ||
596 | /* | 634 | /* |
597 | * Tasks on this rq that can be pushed away. They are kept in | 635 | * Tasks on this rq that can be pushed away. They are kept in |
598 | * an rb-tree, ordered by tasks' deadlines, with caching | 636 | * an rb-tree, ordered by tasks' deadlines, with caching |
599 | * of the leftmost (earliest deadline) element. | 637 | * of the leftmost (earliest deadline) element. |
600 | */ | 638 | */ |
601 | struct rb_root_cached pushable_dl_tasks_root; | 639 | struct rb_root_cached pushable_dl_tasks_root; |
602 | #else | 640 | #else |
603 | struct dl_bw dl_bw; | 641 | struct dl_bw dl_bw; |
604 | #endif | 642 | #endif |
605 | /* | 643 | /* |
606 | * "Active utilization" for this runqueue: increased when a | 644 | * "Active utilization" for this runqueue: increased when a |
607 | * task wakes up (becomes TASK_RUNNING) and decreased when a | 645 | * task wakes up (becomes TASK_RUNNING) and decreased when a |
608 | * task blocks | 646 | * task blocks |
609 | */ | 647 | */ |
610 | u64 running_bw; | 648 | u64 running_bw; |
611 | 649 | ||
612 | /* | 650 | /* |
613 | * Utilization of the tasks "assigned" to this runqueue (including | 651 | * Utilization of the tasks "assigned" to this runqueue (including |
@@ -618,14 +656,14 @@ struct dl_rq { | |||
618 | * This is needed to compute the "inactive utilization" for the | 656 | * This is needed to compute the "inactive utilization" for the |
619 | * runqueue (inactive utilization = this_bw - running_bw). | 657 | * runqueue (inactive utilization = this_bw - running_bw). |
620 | */ | 658 | */ |
621 | u64 this_bw; | 659 | u64 this_bw; |
622 | u64 extra_bw; | 660 | u64 extra_bw; |
623 | 661 | ||
624 | /* | 662 | /* |
625 | * Inverse of the fraction of CPU utilization that can be reclaimed | 663 | * Inverse of the fraction of CPU utilization that can be reclaimed |
626 | * by the GRUB algorithm. | 664 | * by the GRUB algorithm. |
627 | */ | 665 | */ |
628 | u64 bw_ratio; | 666 | u64 bw_ratio; |
629 | }; | 667 | }; |
630 | 668 | ||
631 | #ifdef CONFIG_SMP | 669 | #ifdef CONFIG_SMP |
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b) | |||
638 | /* | 676 | /* |
639 | * We add the notion of a root-domain which will be used to define per-domain | 677 | * We add the notion of a root-domain which will be used to define per-domain |
640 | * variables. Each exclusive cpuset essentially defines an island domain by | 678 | * variables. Each exclusive cpuset essentially defines an island domain by |
641 | * fully partitioning the member cpus from any other cpuset. Whenever a new | 679 | * fully partitioning the member CPUs from any other cpuset. Whenever a new |
642 | * exclusive cpuset is created, we also create and attach a new root-domain | 680 | * exclusive cpuset is created, we also create and attach a new root-domain |
643 | * object. | 681 | * object. |
644 | * | 682 | * |
645 | */ | 683 | */ |
646 | struct root_domain { | 684 | struct root_domain { |
647 | atomic_t refcount; | 685 | atomic_t refcount; |
648 | atomic_t rto_count; | 686 | atomic_t rto_count; |
649 | struct rcu_head rcu; | 687 | struct rcu_head rcu; |
650 | cpumask_var_t span; | 688 | cpumask_var_t span; |
651 | cpumask_var_t online; | 689 | cpumask_var_t online; |
652 | 690 | ||
653 | /* Indicate more than one runnable task for any CPU */ | 691 | /* Indicate more than one runnable task for any CPU */ |
654 | bool overload; | 692 | bool overload; |
655 | 693 | ||
656 | /* | 694 | /* |
657 | * The bit corresponding to a CPU gets set here if such CPU has more | 695 | * The bit corresponding to a CPU gets set here if such CPU has more |
658 | * than one runnable -deadline task (as it is below for RT tasks). | 696 | * than one runnable -deadline task (as it is below for RT tasks). |
659 | */ | 697 | */ |
660 | cpumask_var_t dlo_mask; | 698 | cpumask_var_t dlo_mask; |
661 | atomic_t dlo_count; | 699 | atomic_t dlo_count; |
662 | struct dl_bw dl_bw; | 700 | struct dl_bw dl_bw; |
663 | struct cpudl cpudl; | 701 | struct cpudl cpudl; |
664 | 702 | ||
665 | #ifdef HAVE_RT_PUSH_IPI | 703 | #ifdef HAVE_RT_PUSH_IPI |
666 | /* | 704 | /* |
667 | * For IPI pull requests, loop across the rto_mask. | 705 | * For IPI pull requests, loop across the rto_mask. |
668 | */ | 706 | */ |
669 | struct irq_work rto_push_work; | 707 | struct irq_work rto_push_work; |
670 | raw_spinlock_t rto_lock; | 708 | raw_spinlock_t rto_lock; |
671 | /* These are only updated and read within rto_lock */ | 709 | /* These are only updated and read within rto_lock */ |
672 | int rto_loop; | 710 | int rto_loop; |
673 | int rto_cpu; | 711 | int rto_cpu; |
674 | /* These atomics are updated outside of a lock */ | 712 | /* These atomics are updated outside of a lock */ |
675 | atomic_t rto_loop_next; | 713 | atomic_t rto_loop_next; |
676 | atomic_t rto_loop_start; | 714 | atomic_t rto_loop_start; |
677 | #endif | 715 | #endif |
678 | /* | 716 | /* |
679 | * The "RT overload" flag: it gets set if a CPU has more than | 717 | * The "RT overload" flag: it gets set if a CPU has more than |
680 | * one runnable RT task. | 718 | * one runnable RT task. |
681 | */ | 719 | */ |
682 | cpumask_var_t rto_mask; | 720 | cpumask_var_t rto_mask; |
683 | struct cpupri cpupri; | 721 | struct cpupri cpupri; |
684 | 722 | ||
685 | unsigned long max_cpu_capacity; | 723 | unsigned long max_cpu_capacity; |
686 | }; | 724 | }; |
687 | 725 | ||
688 | extern struct root_domain def_root_domain; | 726 | extern struct root_domain def_root_domain; |
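As the root_domain comments describe, rto_mask carries a bit per CPU that has more than one runnable RT task, with rto_count tracking how many such CPUs exist. A toy model of that bookkeeping, using a plain bitmask and int in place of cpumask_var_t and atomic_t (update_rt_overload() is an invented name):

#include <stdio.h>
#include <stdint.h>

static uint64_t rto_mask;
static int      rto_count;

/* Update the overload bookkeeping when a CPU's runnable-RT count changes. */
static void update_rt_overload(int cpu, int nr_rt_running)
{
        uint64_t bit = 1ULL << cpu;

        if (nr_rt_running > 1 && !(rto_mask & bit)) {
                rto_mask |= bit;
                rto_count++;
        } else if (nr_rt_running <= 1 && (rto_mask & bit)) {
                rto_mask &= ~bit;
                rto_count--;
        }
}

int main(void)
{
        update_rt_overload(2, 3);   /* CPU 2 now has 3 runnable RT tasks */
        update_rt_overload(5, 1);   /* CPU 5 has only one: not overloaded */
        printf("overloaded CPUs: %d (mask 0x%llx)\n",
               rto_count, (unsigned long long)rto_mask);    /* 1 (mask 0x4) */
        return 0;
}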
@@ -708,41 +746,39 @@ extern void rto_push_irq_work_func(struct irq_work *work); | |||
708 | */ | 746 | */ |
709 | struct rq { | 747 | struct rq { |
710 | /* runqueue lock: */ | 748 | /* runqueue lock: */ |
711 | raw_spinlock_t lock; | 749 | raw_spinlock_t lock; |
712 | 750 | ||
713 | /* | 751 | /* |
714 | * nr_running and cpu_load should be in the same cacheline because | 752 | * nr_running and cpu_load should be in the same cacheline because |
715 | * remote CPUs use both these fields when doing load calculation. | 753 | * remote CPUs use both these fields when doing load calculation. |
716 | */ | 754 | */ |
717 | unsigned int nr_running; | 755 | unsigned int nr_running; |
718 | #ifdef CONFIG_NUMA_BALANCING | 756 | #ifdef CONFIG_NUMA_BALANCING |
719 | unsigned int nr_numa_running; | 757 | unsigned int nr_numa_running; |
720 | unsigned int nr_preferred_running; | 758 | unsigned int nr_preferred_running; |
721 | #endif | 759 | #endif |
722 | #define CPU_LOAD_IDX_MAX 5 | 760 | #define CPU_LOAD_IDX_MAX 5 |
723 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; | 761 | unsigned long cpu_load[CPU_LOAD_IDX_MAX]; |
724 | #ifdef CONFIG_NO_HZ_COMMON | 762 | #ifdef CONFIG_NO_HZ_COMMON |
725 | #ifdef CONFIG_SMP | 763 | #ifdef CONFIG_SMP |
726 | unsigned long last_load_update_tick; | 764 | unsigned long last_load_update_tick; |
727 | #endif /* CONFIG_SMP */ | 765 | #endif /* CONFIG_SMP */ |
728 | unsigned long nohz_flags; | 766 | unsigned long nohz_flags; |
729 | #endif /* CONFIG_NO_HZ_COMMON */ | 767 | #endif /* CONFIG_NO_HZ_COMMON */ |
730 | #ifdef CONFIG_NO_HZ_FULL | ||
731 | unsigned long last_sched_tick; | ||
732 | #endif | ||
733 | /* capture load from *all* tasks on this cpu: */ | ||
734 | struct load_weight load; | ||
735 | unsigned long nr_load_updates; | ||
736 | u64 nr_switches; | ||
737 | 768 | ||
738 | struct cfs_rq cfs; | 769 | /* capture load from *all* tasks on this CPU: */ |
739 | struct rt_rq rt; | 770 | struct load_weight load; |
740 | struct dl_rq dl; | 771 | unsigned long nr_load_updates; |
772 | u64 nr_switches; | ||
773 | |||
774 | struct cfs_rq cfs; | ||
775 | struct rt_rq rt; | ||
776 | struct dl_rq dl; | ||
741 | 777 | ||
742 | #ifdef CONFIG_FAIR_GROUP_SCHED | 778 | #ifdef CONFIG_FAIR_GROUP_SCHED |
743 | /* list of leaf cfs_rq on this cpu: */ | 779 | /* list of leaf cfs_rq on this CPU: */ |
744 | struct list_head leaf_cfs_rq_list; | 780 | struct list_head leaf_cfs_rq_list; |
745 | struct list_head *tmp_alone_branch; | 781 | struct list_head *tmp_alone_branch; |
746 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 782 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
747 | 783 | ||
748 | /* | 784 | /* |
@@ -751,94 +787,98 @@ struct rq { | |||
751 | * one CPU and if it got migrated afterwards it may decrease | 787 | * one CPU and if it got migrated afterwards it may decrease |
752 | * it on another CPU. Always updated under the runqueue lock: | 788 | * it on another CPU. Always updated under the runqueue lock: |
753 | */ | 789 | */ |
754 | unsigned long nr_uninterruptible; | 790 | unsigned long nr_uninterruptible; |
755 | 791 | ||
756 | struct task_struct *curr, *idle, *stop; | 792 | struct task_struct *curr; |
757 | unsigned long next_balance; | 793 | struct task_struct *idle; |
758 | struct mm_struct *prev_mm; | 794 | struct task_struct *stop; |
795 | unsigned long next_balance; | ||
796 | struct mm_struct *prev_mm; | ||
759 | 797 | ||
760 | unsigned int clock_update_flags; | 798 | unsigned int clock_update_flags; |
761 | u64 clock; | 799 | u64 clock; |
762 | u64 clock_task; | 800 | u64 clock_task; |
763 | 801 | ||
764 | atomic_t nr_iowait; | 802 | atomic_t nr_iowait; |
765 | 803 | ||
766 | #ifdef CONFIG_SMP | 804 | #ifdef CONFIG_SMP |
767 | struct root_domain *rd; | 805 | struct root_domain *rd; |
768 | struct sched_domain *sd; | 806 | struct sched_domain *sd; |
807 | |||
808 | unsigned long cpu_capacity; | ||
809 | unsigned long cpu_capacity_orig; | ||
769 | 810 | ||
770 | unsigned long cpu_capacity; | 811 | struct callback_head *balance_callback; |
771 | unsigned long cpu_capacity_orig; | ||
772 | 812 | ||
773 | struct callback_head *balance_callback; | 813 | unsigned char idle_balance; |
774 | 814 | ||
775 | unsigned char idle_balance; | ||
776 | /* For active balancing */ | 815 | /* For active balancing */ |
777 | int active_balance; | 816 | int active_balance; |
778 | int push_cpu; | 817 | int push_cpu; |
779 | struct cpu_stop_work active_balance_work; | 818 | struct cpu_stop_work active_balance_work; |
780 | /* cpu of this runqueue: */ | 819 | |
781 | int cpu; | 820 | /* CPU of this runqueue: */ |
782 | int online; | 821 | int cpu; |
822 | int online; | ||
783 | 823 | ||
784 | struct list_head cfs_tasks; | 824 | struct list_head cfs_tasks; |
785 | 825 | ||
786 | u64 rt_avg; | 826 | u64 rt_avg; |
787 | u64 age_stamp; | 827 | u64 age_stamp; |
788 | u64 idle_stamp; | 828 | u64 idle_stamp; |
789 | u64 avg_idle; | 829 | u64 avg_idle; |
790 | 830 | ||
791 | /* This is used to determine avg_idle's max value */ | 831 | /* This is used to determine avg_idle's max value */ |
792 | u64 max_idle_balance_cost; | 832 | u64 max_idle_balance_cost; |
793 | #endif | 833 | #endif |
794 | 834 | ||
795 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING | 835 | #ifdef CONFIG_IRQ_TIME_ACCOUNTING |
796 | u64 prev_irq_time; | 836 | u64 prev_irq_time; |
797 | #endif | 837 | #endif |
798 | #ifdef CONFIG_PARAVIRT | 838 | #ifdef CONFIG_PARAVIRT |
799 | u64 prev_steal_time; | 839 | u64 prev_steal_time; |
800 | #endif | 840 | #endif |
801 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING | 841 | #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING |
802 | u64 prev_steal_time_rq; | 842 | u64 prev_steal_time_rq; |
803 | #endif | 843 | #endif |
804 | 844 | ||
805 | /* calc_load related fields */ | 845 | /* calc_load related fields */ |
806 | unsigned long calc_load_update; | 846 | unsigned long calc_load_update; |
807 | long calc_load_active; | 847 | long calc_load_active; |
808 | 848 | ||
809 | #ifdef CONFIG_SCHED_HRTICK | 849 | #ifdef CONFIG_SCHED_HRTICK |
810 | #ifdef CONFIG_SMP | 850 | #ifdef CONFIG_SMP |
811 | int hrtick_csd_pending; | 851 | int hrtick_csd_pending; |
812 | call_single_data_t hrtick_csd; | 852 | call_single_data_t hrtick_csd; |
813 | #endif | 853 | #endif |
814 | struct hrtimer hrtick_timer; | 854 | struct hrtimer hrtick_timer; |
815 | #endif | 855 | #endif |
816 | 856 | ||
817 | #ifdef CONFIG_SCHEDSTATS | 857 | #ifdef CONFIG_SCHEDSTATS |
818 | /* latency stats */ | 858 | /* latency stats */ |
819 | struct sched_info rq_sched_info; | 859 | struct sched_info rq_sched_info; |
820 | unsigned long long rq_cpu_time; | 860 | unsigned long long rq_cpu_time; |
821 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ | 861 | /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ |
822 | 862 | ||
823 | /* sys_sched_yield() stats */ | 863 | /* sys_sched_yield() stats */ |
824 | unsigned int yld_count; | 864 | unsigned int yld_count; |
825 | 865 | ||
826 | /* schedule() stats */ | 866 | /* schedule() stats */ |
827 | unsigned int sched_count; | 867 | unsigned int sched_count; |
828 | unsigned int sched_goidle; | 868 | unsigned int sched_goidle; |
829 | 869 | ||
830 | /* try_to_wake_up() stats */ | 870 | /* try_to_wake_up() stats */ |
831 | unsigned int ttwu_count; | 871 | unsigned int ttwu_count; |
832 | unsigned int ttwu_local; | 872 | unsigned int ttwu_local; |
833 | #endif | 873 | #endif |
834 | 874 | ||
835 | #ifdef CONFIG_SMP | 875 | #ifdef CONFIG_SMP |
836 | struct llist_head wake_list; | 876 | struct llist_head wake_list; |
837 | #endif | 877 | #endif |
838 | 878 | ||
839 | #ifdef CONFIG_CPU_IDLE | 879 | #ifdef CONFIG_CPU_IDLE |
840 | /* Must be inspected within a rcu lock section */ | 880 | /* Must be inspected within a rcu lock section */ |
841 | struct cpuidle_state *idle_state; | 881 | struct cpuidle_state *idle_state; |
842 | #endif | 882 | #endif |
843 | }; | 883 | }; |
844 | 884 | ||
@@ -904,9 +944,9 @@ static inline u64 __rq_clock_broken(struct rq *rq) | |||
904 | * one position though, because the next rq_unpin_lock() will shift it | 944 | * one position though, because the next rq_unpin_lock() will shift it |
905 | * back. | 945 | * back. |
906 | */ | 946 | */ |
907 | #define RQCF_REQ_SKIP 0x01 | 947 | #define RQCF_REQ_SKIP 0x01 |
908 | #define RQCF_ACT_SKIP 0x02 | 948 | #define RQCF_ACT_SKIP 0x02 |
909 | #define RQCF_UPDATED 0x04 | 949 | #define RQCF_UPDATED 0x04 |
910 | 950 | ||
911 | static inline void assert_clock_updated(struct rq *rq) | 951 | static inline void assert_clock_updated(struct rq *rq) |
912 | { | 952 | { |
@@ -1059,12 +1099,12 @@ extern void sched_ttwu_pending(void); | |||
1059 | 1099 | ||
1060 | /** | 1100 | /** |
1061 | * highest_flag_domain - Return highest sched_domain containing flag. | 1101 | * highest_flag_domain - Return highest sched_domain containing flag. |
1062 | * @cpu: The cpu whose highest level of sched domain is to | 1102 | * @cpu: The CPU whose highest level of sched domain is to |
1063 | * be returned. | 1103 | * be returned. |
1064 | * @flag: The flag to check for the highest sched_domain | 1104 | * @flag: The flag to check for the highest sched_domain |
1065 | * for the given cpu. | 1105 | * for the given CPU. |
1066 | * | 1106 | * |
1067 | * Returns the highest sched_domain of a cpu which contains the given flag. | 1107 | * Returns the highest sched_domain of a CPU which contains the given flag. |
1068 | */ | 1108 | */ |
1069 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) | 1109 | static inline struct sched_domain *highest_flag_domain(int cpu, int flag) |
1070 | { | 1110 | { |
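The kerneldoc above is easier to read next to the walk it implies: start at the CPU's lowest sched_domain, move upwards through the parents, and remember the last level that still carries the flag, stopping at the first one that does not. A userspace model of that walk (the domain names and the flag value are invented):

#include <stdio.h>

struct domain {
        const char    *name;
        int            flags;
        struct domain *parent;
};

#define SD_FLAG_SHARE_CACHE 0x1         /* illustrative flag, not the kernel's value */

/* Walk from the base domain upwards and return the highest level that
 * still has the requested flag set -- the shape described above. */
static struct domain *highest_flag_domain(struct domain *sd, int flag)
{
        struct domain *hsd = NULL;

        for (; sd; sd = sd->parent) {
                if (!(sd->flags & flag))
                        break;
                hsd = sd;
        }
        return hsd;
}

int main(void)
{
        struct domain node = { "NUMA node", 0,                   NULL  };
        struct domain llc  = { "LLC",       SD_FLAG_SHARE_CACHE, &node };
        struct domain smt  = { "SMT",       SD_FLAG_SHARE_CACHE, &llc  };

        struct domain *hsd = highest_flag_domain(&smt, SD_FLAG_SHARE_CACHE);
        printf("highest domain with flag: %s\n", hsd ? hsd->name : "none"); /* LLC */
        return 0;
}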
@@ -1099,30 +1139,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa); | |||
1099 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); | 1139 | DECLARE_PER_CPU(struct sched_domain *, sd_asym); |
1100 | 1140 | ||
1101 | struct sched_group_capacity { | 1141 | struct sched_group_capacity { |
1102 | atomic_t ref; | 1142 | atomic_t ref; |
1103 | /* | 1143 | /* |
1104 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity | 1144 | * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity |
1105 | * for a single CPU. | 1145 | * for a single CPU. |
1106 | */ | 1146 | */ |
1107 | unsigned long capacity; | 1147 | unsigned long capacity; |
1108 | unsigned long min_capacity; /* Min per-CPU capacity in group */ | 1148 | unsigned long min_capacity; /* Min per-CPU capacity in group */ |
1109 | unsigned long next_update; | 1149 | unsigned long next_update; |
1110 | int imbalance; /* XXX unrelated to capacity but shared group state */ | 1150 | int imbalance; /* XXX unrelated to capacity but shared group state */ |
1111 | 1151 | ||
1112 | #ifdef CONFIG_SCHED_DEBUG | 1152 | #ifdef CONFIG_SCHED_DEBUG |
1113 | int id; | 1153 | int id; |
1114 | #endif | 1154 | #endif |
1115 | 1155 | ||
1116 | unsigned long cpumask[0]; /* balance mask */ | 1156 | unsigned long cpumask[0]; /* Balance mask */ |
1117 | }; | 1157 | }; |
1118 | 1158 | ||
1119 | struct sched_group { | 1159 | struct sched_group { |
1120 | struct sched_group *next; /* Must be a circular list */ | 1160 | struct sched_group *next; /* Must be a circular list */ |
1121 | atomic_t ref; | 1161 | atomic_t ref; |
1122 | 1162 | ||
1123 | unsigned int group_weight; | 1163 | unsigned int group_weight; |
1124 | struct sched_group_capacity *sgc; | 1164 | struct sched_group_capacity *sgc; |
1125 | int asym_prefer_cpu; /* cpu of highest priority in group */ | 1165 | int asym_prefer_cpu; /* CPU of highest priority in group */ |
1126 | 1166 | ||
1127 | /* | 1167 | /* |
1128 | * The CPUs this group covers. | 1168 | * The CPUs this group covers. |
@@ -1131,7 +1171,7 @@ struct sched_group { | |||
1131 | * by attaching extra space to the end of the structure, | 1171 | * by attaching extra space to the end of the structure, |
1132 | * depending on how many CPUs the kernel has booted up with) | 1172 | * depending on how many CPUs the kernel has booted up with) |
1133 | */ | 1173 | */ |
1134 | unsigned long cpumask[0]; | 1174 | unsigned long cpumask[0]; |
1135 | }; | 1175 | }; |
1136 | 1176 | ||
1137 | static inline struct cpumask *sched_group_span(struct sched_group *sg) | 1177 | static inline struct cpumask *sched_group_span(struct sched_group *sg) |
@@ -1148,8 +1188,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg) | |||
1148 | } | 1188 | } |
1149 | 1189 | ||
1150 | /** | 1190 | /** |
1151 | * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. | 1191 | * group_first_cpu - Returns the first CPU in the cpumask of a sched_group. |
1152 | * @group: The group whose first cpu is to be returned. | 1192 | * @group: The group whose first CPU is to be returned. |
1153 | */ | 1193 | */ |
1154 | static inline unsigned int group_first_cpu(struct sched_group *group) | 1194 | static inline unsigned int group_first_cpu(struct sched_group *group) |
1155 | { | 1195 | { |
@@ -1349,19 +1389,12 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1349 | return p->on_rq == TASK_ON_RQ_MIGRATING; | 1389 | return p->on_rq == TASK_ON_RQ_MIGRATING; |
1350 | } | 1390 | } |
1351 | 1391 | ||
1352 | #ifndef prepare_arch_switch | ||
1353 | # define prepare_arch_switch(next) do { } while (0) | ||
1354 | #endif | ||
1355 | #ifndef finish_arch_post_lock_switch | ||
1356 | # define finish_arch_post_lock_switch() do { } while (0) | ||
1357 | #endif | ||
1358 | |||
1359 | /* | 1392 | /* |
1360 | * wake flags | 1393 | * wake flags |
1361 | */ | 1394 | */ |
1362 | #define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ | 1395 | #define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */ |
1363 | #define WF_FORK 0x02 /* child wakeup after fork */ | 1396 | #define WF_FORK 0x02 /* Child wakeup after fork */ |
1364 | #define WF_MIGRATED 0x4 /* internal use, task got migrated */ | 1397 | #define WF_MIGRATED 0x4 /* Internal use, task got migrated */ |
1365 | 1398 | ||
1366 | /* | 1399 | /* |
1367 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1400 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
@@ -1372,11 +1405,11 @@ static inline int task_on_rq_migrating(struct task_struct *p) | |||
1372 | * slice expiry etc. | 1405 | * slice expiry etc. |
1373 | */ | 1406 | */ |
1374 | 1407 | ||
1375 | #define WEIGHT_IDLEPRIO 3 | 1408 | #define WEIGHT_IDLEPRIO 3 |
1376 | #define WMULT_IDLEPRIO 1431655765 | 1409 | #define WMULT_IDLEPRIO 1431655765 |
1377 | 1410 | ||
1378 | extern const int sched_prio_to_weight[40]; | 1411 | extern const int sched_prio_to_weight[40]; |
1379 | extern const u32 sched_prio_to_wmult[40]; | 1412 | extern const u32 sched_prio_to_wmult[40]; |
1380 | 1413 | ||
1381 | /* | 1414 | /* |
1382 | * {de,en}queue flags: | 1415 | * {de,en}queue flags: |
@@ -1398,9 +1431,9 @@ extern const u32 sched_prio_to_wmult[40]; | |||
1398 | */ | 1431 | */ |
1399 | 1432 | ||
1400 | #define DEQUEUE_SLEEP 0x01 | 1433 | #define DEQUEUE_SLEEP 0x01 |
1401 | #define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ | 1434 | #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ |
1402 | #define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ | 1435 | #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ |
1403 | #define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ | 1436 | #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ |
1404 | 1437 | ||
1405 | #define ENQUEUE_WAKEUP 0x01 | 1438 | #define ENQUEUE_WAKEUP 0x01 |
1406 | #define ENQUEUE_RESTORE 0x02 | 1439 | #define ENQUEUE_RESTORE 0x02 |
@@ -1422,10 +1455,10 @@ struct sched_class { | |||
1422 | 1455 | ||
1423 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); | 1456 | void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); |
1424 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); | 1457 | void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); |
1425 | void (*yield_task) (struct rq *rq); | 1458 | void (*yield_task) (struct rq *rq); |
1426 | bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); | 1459 | bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt); |
1427 | 1460 | ||
1428 | void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); | 1461 | void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); |
1429 | 1462 | ||
1430 | /* | 1463 | /* |
1431 | * It is the responsibility of the pick_next_task() method that will | 1464 | * It is the responsibility of the pick_next_task() method that will |
@@ -1435,16 +1468,16 @@ struct sched_class { | |||
1435 | * May return RETRY_TASK when it finds a higher prio class has runnable | 1468 | * May return RETRY_TASK when it finds a higher prio class has runnable |
1436 | * tasks. | 1469 | * tasks. |
1437 | */ | 1470 | */ |
1438 | struct task_struct * (*pick_next_task) (struct rq *rq, | 1471 | struct task_struct * (*pick_next_task)(struct rq *rq, |
1439 | struct task_struct *prev, | 1472 | struct task_struct *prev, |
1440 | struct rq_flags *rf); | 1473 | struct rq_flags *rf); |
1441 | void (*put_prev_task) (struct rq *rq, struct task_struct *p); | 1474 | void (*put_prev_task)(struct rq *rq, struct task_struct *p); |
1442 | 1475 | ||
1443 | #ifdef CONFIG_SMP | 1476 | #ifdef CONFIG_SMP |
1444 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); | 1477 | int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); |
1445 | void (*migrate_task_rq)(struct task_struct *p); | 1478 | void (*migrate_task_rq)(struct task_struct *p); |
1446 | 1479 | ||
1447 | void (*task_woken) (struct rq *this_rq, struct task_struct *task); | 1480 | void (*task_woken)(struct rq *this_rq, struct task_struct *task); |
1448 | 1481 | ||
1449 | void (*set_cpus_allowed)(struct task_struct *p, | 1482 | void (*set_cpus_allowed)(struct task_struct *p, |
1450 | const struct cpumask *newmask); | 1483 | const struct cpumask *newmask); |
@@ -1453,31 +1486,31 @@ struct sched_class { | |||
1453 | void (*rq_offline)(struct rq *rq); | 1486 | void (*rq_offline)(struct rq *rq); |
1454 | #endif | 1487 | #endif |
1455 | 1488 | ||
1456 | void (*set_curr_task) (struct rq *rq); | 1489 | void (*set_curr_task)(struct rq *rq); |
1457 | void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); | 1490 | void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); |
1458 | void (*task_fork) (struct task_struct *p); | 1491 | void (*task_fork)(struct task_struct *p); |
1459 | void (*task_dead) (struct task_struct *p); | 1492 | void (*task_dead)(struct task_struct *p); |
1460 | 1493 | ||
1461 | /* | 1494 | /* |
1462 | * The switched_from() call is allowed to drop rq->lock, therefore we | 1495 | * The switched_from() call is allowed to drop rq->lock, therefore we |
1463 | * cannot assume the switched_from/switched_to pair is serialized by | 1496 | * cannot assume the switched_from/switched_to pair is serialized by |
1464 | * rq->lock. They are however serialized by p->pi_lock. | 1497 | * rq->lock. They are however serialized by p->pi_lock. |
1465 | */ | 1498 | */ |
1466 | void (*switched_from) (struct rq *this_rq, struct task_struct *task); | 1499 | void (*switched_from)(struct rq *this_rq, struct task_struct *task); |
1467 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); | 1500 | void (*switched_to) (struct rq *this_rq, struct task_struct *task); |
1468 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, | 1501 | void (*prio_changed) (struct rq *this_rq, struct task_struct *task, |
1469 | int oldprio); | 1502 | int oldprio); |
1470 | 1503 | ||
1471 | unsigned int (*get_rr_interval) (struct rq *rq, | 1504 | unsigned int (*get_rr_interval)(struct rq *rq, |
1472 | struct task_struct *task); | 1505 | struct task_struct *task); |
1473 | 1506 | ||
1474 | void (*update_curr) (struct rq *rq); | 1507 | void (*update_curr)(struct rq *rq); |
1475 | 1508 | ||
1476 | #define TASK_SET_GROUP 0 | 1509 | #define TASK_SET_GROUP 0 |
1477 | #define TASK_MOVE_GROUP 1 | 1510 | #define TASK_MOVE_GROUP 1 |
1478 | 1511 | ||
1479 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1512 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1480 | void (*task_change_group) (struct task_struct *p, int type); | 1513 | void (*task_change_group)(struct task_struct *p, int type); |
1481 | #endif | 1514 | #endif |
1482 | }; | 1515 | }; |
1483 | 1516 | ||
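For context on how these sched_class hooks are consumed: each scheduling class fills the table with its own callbacks. An abridged, illustrative sketch (not the complete initializer; the field set varies by kernel version) using the stop-task class that appears later in this diff:

	const struct sched_class stop_sched_class = {
		/* abridged -- only a few of the hooks shown above are listed */
		.pick_next_task		= pick_next_task_stop,
		.put_prev_task		= put_prev_task_stop,
		.set_curr_task		= set_curr_task_stop,
		.task_tick		= task_tick_stop,
		.update_curr		= update_curr_stop,
	};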
@@ -1526,6 +1559,7 @@ static inline void idle_set_state(struct rq *rq, | |||
1526 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) | 1559 | static inline struct cpuidle_state *idle_get_state(struct rq *rq) |
1527 | { | 1560 | { |
1528 | SCHED_WARN_ON(!rcu_read_lock_held()); | 1561 | SCHED_WARN_ON(!rcu_read_lock_held()); |
1562 | |||
1529 | return rq->idle_state; | 1563 | return rq->idle_state; |
1530 | } | 1564 | } |
1531 | #else | 1565 | #else |
@@ -1564,9 +1598,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se); | |||
1564 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); | 1598 | extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); |
1565 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); | 1599 | extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); |
1566 | 1600 | ||
1567 | #define BW_SHIFT 20 | 1601 | #define BW_SHIFT 20 |
1568 | #define BW_UNIT (1 << BW_SHIFT) | 1602 | #define BW_UNIT (1 << BW_SHIFT) |
1569 | #define RATIO_SHIFT 8 | 1603 | #define RATIO_SHIFT 8 |
1570 | unsigned long to_ratio(u64 period, u64 runtime); | 1604 | unsigned long to_ratio(u64 period, u64 runtime); |
1571 | 1605 | ||
1572 | extern void init_entity_runnable_average(struct sched_entity *se); | 1606 | extern void init_entity_runnable_average(struct sched_entity *se); |
@@ -1574,6 +1608,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se); | |||
1574 | 1608 | ||
1575 | #ifdef CONFIG_NO_HZ_FULL | 1609 | #ifdef CONFIG_NO_HZ_FULL |
1576 | extern bool sched_can_stop_tick(struct rq *rq); | 1610 | extern bool sched_can_stop_tick(struct rq *rq); |
1611 | extern int __init sched_tick_offload_init(void); | ||
1577 | 1612 | ||
1578 | /* | 1613 | /* |
1579 | * Tick may be needed by tasks in the runqueue depending on their policy and | 1614 | * Tick may be needed by tasks in the runqueue depending on their policy and |
@@ -1598,6 +1633,7 @@ static inline void sched_update_tick_dependency(struct rq *rq) | |||
1598 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); | 1633 | tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); |
1599 | } | 1634 | } |
1600 | #else | 1635 | #else |
1636 | static inline int sched_tick_offload_init(void) { return 0; } | ||
1601 | static inline void sched_update_tick_dependency(struct rq *rq) { } | 1637 | static inline void sched_update_tick_dependency(struct rq *rq) { } |
1602 | #endif | 1638 | #endif |
1603 | 1639 | ||
@@ -1624,13 +1660,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) | |||
1624 | sched_update_tick_dependency(rq); | 1660 | sched_update_tick_dependency(rq); |
1625 | } | 1661 | } |
1626 | 1662 | ||
1627 | static inline void rq_last_tick_reset(struct rq *rq) | ||
1628 | { | ||
1629 | #ifdef CONFIG_NO_HZ_FULL | ||
1630 | rq->last_sched_tick = jiffies; | ||
1631 | #endif | ||
1632 | } | ||
1633 | |||
1634 | extern void update_rq_clock(struct rq *rq); | 1663 | extern void update_rq_clock(struct rq *rq); |
1635 | 1664 | ||
1636 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); | 1665 | extern void activate_task(struct rq *rq, struct task_struct *p, int flags); |
@@ -1821,8 +1850,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1821 | /* | 1850 | /* |
1822 | * Unfair double_lock_balance: Optimizes throughput at the expense of | 1851 | * Unfair double_lock_balance: Optimizes throughput at the expense of |
1823 | * latency by eliminating extra atomic operations when the locks are | 1852 | * latency by eliminating extra atomic operations when the locks are |
1824 | * already in proper order on entry. This favors lower cpu-ids and will | 1853 | * already in proper order on entry. This favors lower CPU-ids and will |
1825 | * grant the double lock to lower cpus over higher ids under contention, | 1854 | * grant the double lock to lower CPUs over higher ids under contention, |
1826 | * regardless of entry order into the function. | 1855 | * regardless of entry order into the function. |
1827 | */ | 1856 | */ |
1828 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1857 | static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) |
@@ -1854,7 +1883,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
1854 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) | 1883 | static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
1855 | { | 1884 | { |
1856 | if (unlikely(!irqs_disabled())) { | 1885 | if (unlikely(!irqs_disabled())) { |
1857 | /* printk() doesn't work good under rq->lock */ | 1886 | /* printk() doesn't work well under rq->lock */ |
1858 | raw_spin_unlock(&this_rq->lock); | 1887 | raw_spin_unlock(&this_rq->lock); |
1859 | BUG_ON(1); | 1888 | BUG_ON(1); |
1860 | } | 1889 | } |
@@ -2113,15 +2142,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} | |||
2113 | #endif /* CONFIG_CPU_FREQ */ | 2142 | #endif /* CONFIG_CPU_FREQ */ |
2114 | 2143 | ||
2115 | #ifdef arch_scale_freq_capacity | 2144 | #ifdef arch_scale_freq_capacity |
2116 | #ifndef arch_scale_freq_invariant | 2145 | # ifndef arch_scale_freq_invariant |
2117 | #define arch_scale_freq_invariant() (true) | 2146 | # define arch_scale_freq_invariant() true |
2118 | #endif | 2147 | # endif |
2119 | #else /* arch_scale_freq_capacity */ | 2148 | #else |
2120 | #define arch_scale_freq_invariant() (false) | 2149 | # define arch_scale_freq_invariant() false |
2121 | #endif | 2150 | #endif |
2122 | 2151 | ||
2123 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL | 2152 | #ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL |
2124 | |||
2125 | static inline unsigned long cpu_util_dl(struct rq *rq) | 2153 | static inline unsigned long cpu_util_dl(struct rq *rq) |
2126 | { | 2154 | { |
2127 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; | 2155 | return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; |
@@ -2131,5 +2159,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq) | |||
2131 | { | 2159 | { |
2132 | return rq->cfs.avg.util_avg; | 2160 | return rq->cfs.avg.util_avg; |
2133 | } | 2161 | } |
2134 | |||
2135 | #endif | 2162 | #endif |
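For reference on the fixed-point format above: BW_SHIFT/BW_UNIT express deadline bandwidth in units of 1/2^20, and cpu_util_dl() scales that into the 0..SCHED_CAPACITY_SCALE capacity range. A small standalone sketch of the arithmetic (the sample reservation and the 1024 value of SCHED_CAPACITY_SCALE are assumptions for illustration):

	#include <stdio.h>
	#include <stdint.h>

	#define BW_SHIFT		20
	#define BW_UNIT			(1 << BW_SHIFT)
	#define SCHED_CAPACITY_SCALE	1024	/* assumed value of the kernel constant */

	int main(void)
	{
		/* e.g. a deadline reservation of 30ms runtime every 100ms period */
		uint64_t runtime = 30000000, period = 100000000;

		/* to_ratio()-style conversion: runtime/period in 1/2^20 units */
		uint64_t running_bw = (runtime << BW_SHIFT) / period;

		/* cpu_util_dl()-style conversion into capacity units (0..1024) */
		uint64_t util = (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;

		printf("running_bw = %llu/%d, util = %llu/1024\n",
		       (unsigned long long)running_bw, BW_UNIT,
		       (unsigned long long)util);
		return 0;
	}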
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c index 940b1fa1d2ce..ab112cbfd7c8 100644 --- a/kernel/sched/stats.c +++ b/kernel/sched/stats.c | |||
@@ -1,14 +1,13 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | 2 | /* | |
3 | #include <linux/slab.h> | 3 | * /proc/schedstat implementation |
4 | #include <linux/fs.h> | 4 | */ |
5 | #include <linux/seq_file.h> | ||
6 | #include <linux/proc_fs.h> | ||
7 | |||
8 | #include "sched.h" | 5 | #include "sched.h" |
9 | 6 | ||
10 | /* | 7 | /* |
11 | * bump this up when changing the output format or the meaning of an existing | 8 | * Current schedstat API version. |
9 | * | ||
10 | * Bump this up when changing the output format or the meaning of an existing | ||
12 | * format, so that tools can adapt (or abort) | 11 | * format, so that tools can adapt (or abort) |
13 | */ | 12 | */ |
14 | #define SCHEDSTAT_VERSION 15 | 13 | #define SCHEDSTAT_VERSION 15 |
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
78 | * This iterator needs some explanation. | 77 | * This iterator needs some explanation. |
79 | * It returns 1 for the header position. | 78 | * It returns 1 for the header position. |
80 | * This means 2 is cpu 0. | 79 | * This means 2 is cpu 0. |
81 | * In a hotplugged system some cpus, including cpu 0, may be missing so we have | 80 | * In a hotplugged system some CPUs, including cpu 0, may be missing so we have |
82 | * to use cpumask_* to iterate over the cpus. | 81 | * to use cpumask_* to iterate over the CPUs. |
83 | */ | 82 | */ |
84 | static void *schedstat_start(struct seq_file *file, loff_t *offset) | 83 | static void *schedstat_start(struct seq_file *file, loff_t *offset) |
85 | { | 84 | { |
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset) | |||
99 | 98 | ||
100 | if (n < nr_cpu_ids) | 99 | if (n < nr_cpu_ids) |
101 | return (void *)(unsigned long)(n + 2); | 100 | return (void *)(unsigned long)(n + 2); |
101 | |||
102 | return NULL; | 102 | return NULL; |
103 | } | 103 | } |
104 | 104 | ||
105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) | 105 | static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) |
106 | { | 106 | { |
107 | (*offset)++; | 107 | (*offset)++; |
108 | |||
108 | return schedstat_start(file, offset); | 109 | return schedstat_start(file, offset); |
109 | } | 110 | } |
110 | 111 | ||
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = { | |||
134 | static int __init proc_schedstat_init(void) | 135 | static int __init proc_schedstat_init(void) |
135 | { | 136 | { |
136 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); | 137 | proc_create("schedstat", 0, NULL, &proc_schedstat_operations); |
138 | |||
137 | return 0; | 139 | return 0; |
138 | } | 140 | } |
139 | subsys_initcall(proc_schedstat_init); | 141 | subsys_initcall(proc_schedstat_init); |
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index 8e7b58de61e7..8aea199a39b4 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h | |||
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | |||
30 | if (rq) | 30 | if (rq) |
31 | rq->rq_sched_info.run_delay += delta; | 31 | rq->rq_sched_info.run_delay += delta; |
32 | } | 32 | } |
33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) | 33 | #define schedstat_enabled() static_branch_unlikely(&sched_schedstats) |
34 | #define __schedstat_inc(var) do { var++; } while (0) | 34 | #define __schedstat_inc(var) do { var++; } while (0) |
35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) | 35 | #define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) |
36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) | 36 | #define __schedstat_add(var, amt) do { var += (amt); } while (0) |
37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) | 37 | #define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) |
38 | #define __schedstat_set(var, val) do { var = (val); } while (0) | 38 | #define __schedstat_set(var, val) do { var = (val); } while (0) |
39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) | 39 | #define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) |
40 | #define schedstat_val(var) (var) | 40 | #define schedstat_val(var) (var) |
41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) | 41 | #define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) |
42 | 42 | ||
43 | #else /* !CONFIG_SCHEDSTATS */ | 43 | #else /* !CONFIG_SCHEDSTATS: */ |
44 | static inline void | 44 | static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { } |
45 | rq_sched_info_arrive(struct rq *rq, unsigned long long delta) | 45 | static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { } |
46 | {} | 46 | static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { } |
47 | static inline void | 47 | # define schedstat_enabled() 0 |
48 | rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) | 48 | # define __schedstat_inc(var) do { } while (0) |
49 | {} | 49 | # define schedstat_inc(var) do { } while (0) |
50 | static inline void | 50 | # define __schedstat_add(var, amt) do { } while (0) |
51 | rq_sched_info_depart(struct rq *rq, unsigned long long delta) | 51 | # define schedstat_add(var, amt) do { } while (0) |
52 | {} | 52 | # define __schedstat_set(var, val) do { } while (0) |
53 | #define schedstat_enabled() 0 | 53 | # define schedstat_set(var, val) do { } while (0) |
54 | #define __schedstat_inc(var) do { } while (0) | 54 | # define schedstat_val(var) 0 |
55 | #define schedstat_inc(var) do { } while (0) | 55 | # define schedstat_val_or_zero(var) 0 |
56 | #define __schedstat_add(var, amt) do { } while (0) | ||
57 | #define schedstat_add(var, amt) do { } while (0) | ||
58 | #define __schedstat_set(var, val) do { } while (0) | ||
59 | #define schedstat_set(var, val) do { } while (0) | ||
60 | #define schedstat_val(var) 0 | ||
61 | #define schedstat_val_or_zero(var) 0 | ||
62 | #endif /* CONFIG_SCHEDSTATS */ | 56 | #endif /* CONFIG_SCHEDSTATS */ |
63 | 57 | ||
64 | #ifdef CONFIG_SCHED_INFO | 58 | #ifdef CONFIG_SCHED_INFO |
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t) | |||
69 | 63 | ||
70 | /* | 64 | /* |
71 | * We are interested in knowing how long it was from the *first* time a | 65 | * We are interested in knowing how long it was from the *first* time a |
72 | * task was queued to the time that it finally hit a cpu, we call this routine | 66 | * task was queued to the time that it finally hit a CPU, we call this routine |
73 | * from dequeue_task() to account for possible rq->clock skew across cpus. The | 67 | * from dequeue_task() to account for possible rq->clock skew across CPUs. The |
74 | * delta taken on each cpu would annul the skew. | 68 | * delta taken on each CPU would annul the skew. |
75 | */ | 69 | */ |
76 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | 70 | static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) |
77 | { | 71 | { |
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) | |||
87 | } | 81 | } |
88 | 82 | ||
89 | /* | 83 | /* |
90 | * Called when a task finally hits the cpu. We can now calculate how | 84 | * Called when a task finally hits the CPU. We can now calculate how |
91 | * long it was waiting to run. We also note when it began so that we | 85 | * long it was waiting to run. We also note when it began so that we |
92 | * can keep stats on how long its timeslice is. | 86 | * can keep stats on how long its timeslice is. |
93 | */ | 87 | */ |
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t) | |||
112 | */ | 106 | */ |
113 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | 107 | static inline void sched_info_queued(struct rq *rq, struct task_struct *t) |
114 | { | 108 | { |
115 | if (unlikely(sched_info_on())) | 109 | if (unlikely(sched_info_on())) { |
116 | if (!t->sched_info.last_queued) | 110 | if (!t->sched_info.last_queued) |
117 | t->sched_info.last_queued = rq_clock(rq); | 111 | t->sched_info.last_queued = rq_clock(rq); |
112 | } | ||
118 | } | 113 | } |
119 | 114 | ||
120 | /* | 115 | /* |
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t) | |||
127 | */ | 122 | */ |
128 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | 123 | static inline void sched_info_depart(struct rq *rq, struct task_struct *t) |
129 | { | 124 | { |
130 | unsigned long long delta = rq_clock(rq) - | 125 | unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival; |
131 | t->sched_info.last_arrival; | ||
132 | 126 | ||
133 | rq_sched_info_depart(rq, delta); | 127 | rq_sched_info_depart(rq, delta); |
134 | 128 | ||
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t) | |||
142 | * the idle task.) We are only called when prev != next. | 136 | * the idle task.) We are only called when prev != next. |
143 | */ | 137 | */ |
144 | static inline void | 138 | static inline void |
145 | __sched_info_switch(struct rq *rq, | 139 | __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
146 | struct task_struct *prev, struct task_struct *next) | ||
147 | { | 140 | { |
148 | /* | 141 | /* |
149 | * prev now departs the cpu. It's not interesting to record | 142 | * prev now departs the CPU. It's not interesting to record |
150 | * stats about how efficient we were at scheduling the idle | 143 | * stats about how efficient we were at scheduling the idle |
151 | * process, however. | 144 | * process, however. |
152 | */ | 145 | */ |
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq, | |||
156 | if (next != rq->idle) | 149 | if (next != rq->idle) |
157 | sched_info_arrive(rq, next); | 150 | sched_info_arrive(rq, next); |
158 | } | 151 | } |
152 | |||
159 | static inline void | 153 | static inline void |
160 | sched_info_switch(struct rq *rq, | 154 | sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) |
161 | struct task_struct *prev, struct task_struct *next) | ||
162 | { | 155 | { |
163 | if (unlikely(sched_info_on())) | 156 | if (unlikely(sched_info_on())) |
164 | __sched_info_switch(rq, prev, next); | 157 | __sched_info_switch(rq, prev, next); |
165 | } | 158 | } |
166 | #else | 159 | |
167 | #define sched_info_queued(rq, t) do { } while (0) | 160 | #else /* !CONFIG_SCHED_INFO: */ |
168 | #define sched_info_reset_dequeued(t) do { } while (0) | 161 | # define sched_info_queued(rq, t) do { } while (0) |
169 | #define sched_info_dequeued(rq, t) do { } while (0) | 162 | # define sched_info_reset_dequeued(t) do { } while (0) |
170 | #define sched_info_depart(rq, t) do { } while (0) | 163 | # define sched_info_dequeued(rq, t) do { } while (0) |
171 | #define sched_info_arrive(rq, next) do { } while (0) | 164 | # define sched_info_depart(rq, t) do { } while (0) |
172 | #define sched_info_switch(rq, t, next) do { } while (0) | 165 | # define sched_info_arrive(rq, next) do { } while (0) |
166 | # define sched_info_switch(rq, t, next) do { } while (0) | ||
173 | #endif /* CONFIG_SCHED_INFO */ | 167 | #endif /* CONFIG_SCHED_INFO */ |
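A short usage sketch of the schedstat macros consolidated above (illustrative call sites, not taken from this patch; yld_count is an assumed CONFIG_SCHEDSTATS field of struct rq, while rq_sched_info.run_delay appears in this header). With CONFIG_SCHEDSTATS the update is gated by the sched_schedstats static branch; without it the whole statement compiles away:

	static inline void account_yield(struct rq *rq)
	{
		schedstat_inc(rq->yld_count);		/* no-op unless schedstats are enabled */
	}

	static inline void account_wait(struct rq *rq, unsigned long long delta)
	{
		schedstat_add(rq->rq_sched_info.run_delay, delta);
	}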
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 210b1f2146ff..c183b790ca54 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c | |||
@@ -1,6 +1,4 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include "sched.h" | ||
3 | |||
4 | /* | 2 | /* |
5 | * stop-task scheduling class. | 3 | * stop-task scheduling class. |
6 | * | 4 | * |
@@ -9,6 +7,7 @@ | |||
9 | * | 7 | * |
10 | * See kernel/stop_machine.c | 8 | * See kernel/stop_machine.c |
11 | */ | 9 | */ |
10 | #include "sched.h" | ||
12 | 11 | ||
13 | #ifdef CONFIG_SMP | 12 | #ifdef CONFIG_SMP |
14 | static int | 13 | static int |
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) | |||
75 | cgroup_account_cputime(curr, delta_exec); | 74 | cgroup_account_cputime(curr, delta_exec); |
76 | } | 75 | } |
77 | 76 | ||
77 | /* | ||
78 | * scheduler tick hitting a task of our scheduling class. | ||
79 | * | ||
80 | * NOTE: This function can be called remotely by the tick offload that | ||
81 | * goes along full dynticks. Therefore no local assumption can be made | ||
82 | * and everything must be accessed through the @rq and @curr passed in | ||
83 | * parameters. | ||
84 | */ | ||
78 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) | 85 | static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) |
79 | { | 86 | { |
80 | } | 87 | } |
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index 9ff1555341ed..b6fb2c3b3ff7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c | |||
@@ -1,6 +1,8 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | // SPDX-License-Identifier: GPL-2.0 |
2 | #include <linux/sched/signal.h> | 2 | /* |
3 | #include <linux/swait.h> | 3 | * <linux/swait.h> (simple wait queues) implementation: |
4 | */ | ||
5 | #include "sched.h" | ||
4 | 6 | ||
5 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, | 7 | void __init_swait_queue_head(struct swait_queue_head *q, const char *name, |
6 | struct lock_class_key *key) | 8 | struct lock_class_key *key) |
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 519b024f4e94..64cc564f5255 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c | |||
@@ -2,10 +2,6 @@ | |||
2 | /* | 2 | /* |
3 | * Scheduler topology setup/handling methods | 3 | * Scheduler topology setup/handling methods |
4 | */ | 4 | */ |
5 | #include <linux/sched.h> | ||
6 | #include <linux/mutex.h> | ||
7 | #include <linux/sched/isolation.h> | ||
8 | |||
9 | #include "sched.h" | 5 | #include "sched.h" |
10 | 6 | ||
11 | DEFINE_MUTEX(sched_domains_mutex); | 7 | DEFINE_MUTEX(sched_domains_mutex); |
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
41 | if (!(sd->flags & SD_LOAD_BALANCE)) { | 37 | if (!(sd->flags & SD_LOAD_BALANCE)) { |
42 | printk("does not load-balance\n"); | 38 | printk("does not load-balance\n"); |
43 | if (sd->parent) | 39 | if (sd->parent) |
44 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" | 40 | printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); |
45 | " has parent"); | ||
46 | return -1; | 41 | return -1; |
47 | } | 42 | } |
48 | 43 | ||
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
50 | cpumask_pr_args(sched_domain_span(sd)), sd->name); | 45 | cpumask_pr_args(sched_domain_span(sd)), sd->name); |
51 | 46 | ||
52 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { | 47 | if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { |
53 | printk(KERN_ERR "ERROR: domain->span does not contain " | 48 | printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); |
54 | "CPU%d\n", cpu); | ||
55 | } | 49 | } |
56 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { | 50 | if (!cpumask_test_cpu(cpu, sched_group_span(group))) { |
57 | printk(KERN_ERR "ERROR: domain->groups does not contain" | 51 | printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); |
58 | " CPU%d\n", cpu); | ||
59 | } | 52 | } |
60 | 53 | ||
61 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); | 54 | printk(KERN_DEBUG "%*s groups:", level + 1, ""); |
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, | |||
115 | 108 | ||
116 | if (sd->parent && | 109 | if (sd->parent && |
117 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) | 110 | !cpumask_subset(groupmask, sched_domain_span(sd->parent))) |
118 | printk(KERN_ERR "ERROR: parent span is not a superset " | 111 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); |
119 | "of domain->span\n"); | ||
120 | return 0; | 112 | return 0; |
121 | } | 113 | } |
122 | 114 | ||
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg) | |||
595 | * are not. | 587 | * are not. |
596 | * | 588 | * |
597 | * This leads to a few particularly weird cases where the sched_domain's are | 589 | * This leads to a few particularly weird cases where the sched_domain's are |
598 | * not of the same number for each cpu. Consider: | 590 | * not of the same number for each CPU. Consider: |
599 | * | 591 | * |
600 | * NUMA-2 0-3 0-3 | 592 | * NUMA-2 0-3 0-3 |
601 | * groups: {0-2},{1-3} {1-3},{0-2} | 593 | * groups: {0-2},{1-3} {1-3},{0-2} |
@@ -780,7 +772,7 @@ fail: | |||
780 | * ^ ^ ^ ^ | 772 | * ^ ^ ^ ^ |
781 | * `-' `-' | 773 | * `-' `-' |
782 | * | 774 | * |
783 | * The sched_domains are per-cpu and have a two way link (parent & child) and | 775 | * The sched_domains are per-CPU and have a two way link (parent & child) and |
784 | * denote the ever growing mask of CPUs belonging to that level of topology. | 776 | * denote the ever growing mask of CPUs belonging to that level of topology. |
785 | * | 777 | * |
786 | * Each sched_domain has a circular (double) linked list of sched_group's, each | 778 | * Each sched_domain has a circular (double) linked list of sched_group's, each |
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) | |||
1021 | d->rd = alloc_rootdomain(); | 1013 | d->rd = alloc_rootdomain(); |
1022 | if (!d->rd) | 1014 | if (!d->rd) |
1023 | return sa_sd; | 1015 | return sa_sd; |
1016 | |||
1024 | return sa_rootdomain; | 1017 | return sa_rootdomain; |
1025 | } | 1018 | } |
1026 | 1019 | ||
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd) | |||
1047 | } | 1040 | } |
1048 | 1041 | ||
1049 | #ifdef CONFIG_NUMA | 1042 | #ifdef CONFIG_NUMA |
1050 | static int sched_domains_numa_levels; | ||
1051 | enum numa_topology_type sched_numa_topology_type; | 1043 | enum numa_topology_type sched_numa_topology_type; |
1052 | static int *sched_domains_numa_distance; | 1044 | |
1053 | int sched_max_numa_distance; | 1045 | static int sched_domains_numa_levels; |
1054 | static struct cpumask ***sched_domains_numa_masks; | 1046 | static int sched_domains_curr_level; |
1055 | static int sched_domains_curr_level; | 1047 | |
1048 | int sched_max_numa_distance; | ||
1049 | static int *sched_domains_numa_distance; | ||
1050 | static struct cpumask ***sched_domains_numa_masks; | ||
1056 | #endif | 1051 | #endif |
1057 | 1052 | ||
1058 | /* | 1053 | /* |
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level; | |||
1074 | * SD_ASYM_PACKING - describes SMT quirks | 1069 | * SD_ASYM_PACKING - describes SMT quirks |
1075 | */ | 1070 | */ |
1076 | #define TOPOLOGY_SD_FLAGS \ | 1071 | #define TOPOLOGY_SD_FLAGS \ |
1077 | (SD_SHARE_CPUCAPACITY | \ | 1072 | (SD_SHARE_CPUCAPACITY | \ |
1078 | SD_SHARE_PKG_RESOURCES | \ | 1073 | SD_SHARE_PKG_RESOURCES | \ |
1079 | SD_NUMA | \ | 1074 | SD_NUMA | \ |
1080 | SD_ASYM_PACKING | \ | 1075 | SD_ASYM_PACKING | \ |
1081 | SD_ASYM_CPUCAPACITY | \ | 1076 | SD_ASYM_CPUCAPACITY | \ |
1082 | SD_SHARE_POWERDOMAIN) | 1077 | SD_SHARE_POWERDOMAIN) |
1083 | 1078 | ||
1084 | static struct sched_domain * | 1079 | static struct sched_domain * |
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve | |||
1628 | pr_err(" the %s domain not a subset of the %s domain\n", | 1623 | pr_err(" the %s domain not a subset of the %s domain\n", |
1629 | child->name, sd->name); | 1624 | child->name, sd->name); |
1630 | #endif | 1625 | #endif |
1631 | /* Fixup, ensure @sd has at least @child cpus. */ | 1626 | /* Fixup, ensure @sd has at least @child CPUs. */ |
1632 | cpumask_or(sched_domain_span(sd), | 1627 | cpumask_or(sched_domain_span(sd), |
1633 | sched_domain_span(sd), | 1628 | sched_domain_span(sd), |
1634 | sched_domain_span(child)); | 1629 | sched_domain_span(child)); |
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att | |||
1720 | ret = 0; | 1715 | ret = 0; |
1721 | error: | 1716 | error: |
1722 | __free_domain_allocs(&d, alloc_state, cpu_map); | 1717 | __free_domain_allocs(&d, alloc_state, cpu_map); |
1718 | |||
1723 | return ret; | 1719 | return ret; |
1724 | } | 1720 | } |
1725 | 1721 | ||
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, | |||
1824 | return 1; | 1820 | return 1; |
1825 | 1821 | ||
1826 | tmp = SD_ATTR_INIT; | 1822 | tmp = SD_ATTR_INIT; |
1823 | |||
1827 | return !memcmp(cur ? (cur + idx_cur) : &tmp, | 1824 | return !memcmp(cur ? (cur + idx_cur) : &tmp, |
1828 | new ? (new + idx_new) : &tmp, | 1825 | new ? (new + idx_new) : &tmp, |
1829 | sizeof(struct sched_domain_attr)); | 1826 | sizeof(struct sched_domain_attr)); |
@@ -1929,4 +1926,3 @@ match2: | |||
1929 | 1926 | ||
1930 | mutex_unlock(&sched_domains_mutex); | 1927 | mutex_unlock(&sched_domains_mutex); |
1931 | } | 1928 | } |
1932 | |||
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 929ecb7d6b78..928be527477e 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c | |||
@@ -3,14 +3,7 @@ | |||
3 | * | 3 | * |
4 | * (C) 2004 Nadia Yvette Chambers, Oracle | 4 | * (C) 2004 Nadia Yvette Chambers, Oracle |
5 | */ | 5 | */ |
6 | #include <linux/init.h> | 6 | #include "sched.h" |
7 | #include <linux/export.h> | ||
8 | #include <linux/sched/signal.h> | ||
9 | #include <linux/sched/debug.h> | ||
10 | #include <linux/mm.h> | ||
11 | #include <linux/wait.h> | ||
12 | #include <linux/hash.h> | ||
13 | #include <linux/kthread.h> | ||
14 | 7 | ||
15 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) | 8 | void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) |
16 | { | 9 | { |
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode, | |||
107 | break; | 100 | break; |
108 | } | 101 | } |
109 | } | 102 | } |
103 | |||
110 | return nr_exclusive; | 104 | return nr_exclusive; |
111 | } | 105 | } |
112 | 106 | ||
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
317 | spin_unlock(&wq->lock); | 311 | spin_unlock(&wq->lock); |
318 | schedule(); | 312 | schedule(); |
319 | spin_lock(&wq->lock); | 313 | spin_lock(&wq->lock); |
314 | |||
320 | return 0; | 315 | return 0; |
321 | } | 316 | } |
322 | EXPORT_SYMBOL(do_wait_intr); | 317 | EXPORT_SYMBOL(do_wait_intr); |
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait) | |||
333 | spin_unlock_irq(&wq->lock); | 328 | spin_unlock_irq(&wq->lock); |
334 | schedule(); | 329 | schedule(); |
335 | spin_lock_irq(&wq->lock); | 330 | spin_lock_irq(&wq->lock); |
331 | |||
336 | return 0; | 332 | return 0; |
337 | } | 333 | } |
338 | EXPORT_SYMBOL(do_wait_intr_irq); | 334 | EXPORT_SYMBOL(do_wait_intr_irq); |
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i | |||
378 | 374 | ||
379 | if (ret) | 375 | if (ret) |
380 | list_del_init(&wq_entry->entry); | 376 | list_del_init(&wq_entry->entry); |
377 | |||
381 | return ret; | 378 | return ret; |
382 | } | 379 | } |
383 | EXPORT_SYMBOL(autoremove_wake_function); | 380 | EXPORT_SYMBOL(autoremove_wake_function); |
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c index 84cb3acd9260..ed84ab245a05 100644 --- a/kernel/sched/wait_bit.c +++ b/kernel/sched/wait_bit.c | |||
@@ -1,10 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * The implementation of the wait_bit*() and related waiting APIs: | 2 | * The implementation of the wait_bit*() and related waiting APIs: |
3 | */ | 3 | */ |
4 | #include <linux/wait_bit.h> | 4 | #include "sched.h" |
5 | #include <linux/sched/signal.h> | ||
6 | #include <linux/sched/debug.h> | ||
7 | #include <linux/hash.h> | ||
8 | 5 | ||
9 | #define WAIT_TABLE_BITS 8 | 6 | #define WAIT_TABLE_BITS 8 |
10 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) | 7 | #define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) |
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync | |||
29 | wait_bit->key.bit_nr != key->bit_nr || | 26 | wait_bit->key.bit_nr != key->bit_nr || |
30 | test_bit(key->bit_nr, key->flags)) | 27 | test_bit(key->bit_nr, key->flags)) |
31 | return 0; | 28 | return 0; |
32 | else | 29 | |
33 | return autoremove_wake_function(wq_entry, mode, sync, key); | 30 | return autoremove_wake_function(wq_entry, mode, sync, key); |
34 | } | 31 | } |
35 | EXPORT_SYMBOL(wake_bit_function); | 32 | EXPORT_SYMBOL(wake_bit_function); |
36 | 33 | ||
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_ | |||
50 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) | 47 | if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) |
51 | ret = (*action)(&wbq_entry->key, mode); | 48 | ret = (*action)(&wbq_entry->key, mode); |
52 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); | 49 | } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); |
50 | |||
53 | finish_wait(wq_head, &wbq_entry->wq_entry); | 51 | finish_wait(wq_head, &wbq_entry->wq_entry); |
52 | |||
54 | return ret; | 53 | return ret; |
55 | } | 54 | } |
56 | EXPORT_SYMBOL(__wait_on_bit); | 55 | EXPORT_SYMBOL(__wait_on_bit); |
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout( | |||
73 | DEFINE_WAIT_BIT(wq_entry, word, bit); | 72 | DEFINE_WAIT_BIT(wq_entry, word, bit); |
74 | 73 | ||
75 | wq_entry.key.timeout = jiffies + timeout; | 74 | wq_entry.key.timeout = jiffies + timeout; |
75 | |||
76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); | 76 | return __wait_on_bit(wq_head, &wq_entry, action, mode); |
77 | } | 77 | } |
78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); | 78 | EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); |
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock); | |||
120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) | 120 | void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) |
121 | { | 121 | { |
122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | 122 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); |
123 | |||
123 | if (waitqueue_active(wq_head)) | 124 | if (waitqueue_active(wq_head)) |
124 | __wake_up(wq_head, TASK_NORMAL, 1, &key); | 125 | __wake_up(wq_head, TASK_NORMAL, 1, &key); |
125 | } | 126 | } |
@@ -148,6 +149,54 @@ void wake_up_bit(void *word, int bit) | |||
148 | } | 149 | } |
149 | EXPORT_SYMBOL(wake_up_bit); | 150 | EXPORT_SYMBOL(wake_up_bit); |
150 | 151 | ||
152 | wait_queue_head_t *__var_waitqueue(void *p) | ||
153 | { | ||
154 | if (BITS_PER_LONG == 64) { | ||
155 | unsigned long q = (unsigned long)p; | ||
156 | |||
157 | return bit_waitqueue((void *)(q & ~1), q & 1); | ||
158 | } | ||
159 | return bit_waitqueue(p, 0); | ||
160 | } | ||
161 | EXPORT_SYMBOL(__var_waitqueue); | ||
162 | |||
163 | static int | ||
164 | var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode, | ||
165 | int sync, void *arg) | ||
166 | { | ||
167 | struct wait_bit_key *key = arg; | ||
168 | struct wait_bit_queue_entry *wbq_entry = | ||
169 | container_of(wq_entry, struct wait_bit_queue_entry, wq_entry); | ||
170 | |||
171 | if (wbq_entry->key.flags != key->flags || | ||
172 | wbq_entry->key.bit_nr != key->bit_nr) | ||
173 | return 0; | ||
174 | |||
175 | return autoremove_wake_function(wq_entry, mode, sync, key); | ||
176 | } | ||
177 | |||
178 | void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags) | ||
179 | { | ||
180 | *wbq_entry = (struct wait_bit_queue_entry){ | ||
181 | .key = { | ||
182 | .flags = (var), | ||
183 | .bit_nr = -1, | ||
184 | }, | ||
185 | .wq_entry = { | ||
186 | .private = current, | ||
187 | .func = var_wake_function, | ||
188 | .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry), | ||
189 | }, | ||
190 | }; | ||
191 | } | ||
192 | EXPORT_SYMBOL(init_wait_var_entry); | ||
193 | |||
194 | void wake_up_var(void *var) | ||
195 | { | ||
196 | __wake_up_bit(__var_waitqueue(var), var, -1); | ||
197 | } | ||
198 | EXPORT_SYMBOL(wake_up_var); | ||
199 | |||
151 | /* | 200 | /* |
152 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash | 201 | * Manipulate the atomic_t address to produce a better bit waitqueue table hash |
153 | * index (we're keying off bit -1, but that would produce a horrible hash | 202 | * index (we're keying off bit -1, but that would produce a horrible hash |
@@ -157,6 +206,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p) | |||
157 | { | 206 | { |
158 | if (BITS_PER_LONG == 64) { | 207 | if (BITS_PER_LONG == 64) { |
159 | unsigned long q = (unsigned long)p; | 208 | unsigned long q = (unsigned long)p; |
209 | |||
160 | return bit_waitqueue((void *)(q & ~1), q & 1); | 210 | return bit_waitqueue((void *)(q & ~1), q & 1); |
161 | } | 211 | } |
162 | return bit_waitqueue(p, 0); | 212 | return bit_waitqueue(p, 0); |
@@ -173,6 +223,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo | |||
173 | wait_bit->key.bit_nr != key->bit_nr || | 223 | wait_bit->key.bit_nr != key->bit_nr || |
174 | atomic_read(val) != 0) | 224 | atomic_read(val) != 0) |
175 | return 0; | 225 | return 0; |
226 | |||
176 | return autoremove_wake_function(wq_entry, mode, sync, key); | 227 | return autoremove_wake_function(wq_entry, mode, sync, key); |
177 | } | 228 | } |
178 | 229 | ||
@@ -196,6 +247,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en | |||
196 | ret = (*action)(val, mode); | 247 | ret = (*action)(val, mode); |
197 | } while (!ret && atomic_read(val) != 0); | 248 | } while (!ret && atomic_read(val) != 0); |
198 | finish_wait(wq_head, &wbq_entry->wq_entry); | 249 | finish_wait(wq_head, &wbq_entry->wq_entry); |
250 | |||
199 | return ret; | 251 | return ret; |
200 | } | 252 | } |
201 | 253 | ||
@@ -226,6 +278,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode) | |||
226 | schedule(); | 278 | schedule(); |
227 | if (signal_pending_state(mode, current)) | 279 | if (signal_pending_state(mode, current)) |
228 | return -EINTR; | 280 | return -EINTR; |
281 | |||
229 | return 0; | 282 | return 0; |
230 | } | 283 | } |
231 | EXPORT_SYMBOL(atomic_t_wait); | 284 | EXPORT_SYMBOL(atomic_t_wait); |
@@ -250,6 +303,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode) | |||
250 | schedule(); | 303 | schedule(); |
251 | if (signal_pending_state(mode, current)) | 304 | if (signal_pending_state(mode, current)) |
252 | return -EINTR; | 305 | return -EINTR; |
306 | |||
253 | return 0; | 307 | return 0; |
254 | } | 308 | } |
255 | EXPORT_SYMBOL(bit_wait); | 309 | EXPORT_SYMBOL(bit_wait); |
@@ -259,6 +313,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode) | |||
259 | io_schedule(); | 313 | io_schedule(); |
260 | if (signal_pending_state(mode, current)) | 314 | if (signal_pending_state(mode, current)) |
261 | return -EINTR; | 315 | return -EINTR; |
316 | |||
262 | return 0; | 317 | return 0; |
263 | } | 318 | } |
264 | EXPORT_SYMBOL(bit_wait_io); | 319 | EXPORT_SYMBOL(bit_wait_io); |
@@ -266,11 +321,13 @@ EXPORT_SYMBOL(bit_wait_io); | |||
266 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) | 321 | __sched int bit_wait_timeout(struct wait_bit_key *word, int mode) |
267 | { | 322 | { |
268 | unsigned long now = READ_ONCE(jiffies); | 323 | unsigned long now = READ_ONCE(jiffies); |
324 | |||
269 | if (time_after_eq(now, word->timeout)) | 325 | if (time_after_eq(now, word->timeout)) |
270 | return -EAGAIN; | 326 | return -EAGAIN; |
271 | schedule_timeout(word->timeout - now); | 327 | schedule_timeout(word->timeout - now); |
272 | if (signal_pending_state(mode, current)) | 328 | if (signal_pending_state(mode, current)) |
273 | return -EINTR; | 329 | return -EINTR; |
330 | |||
274 | return 0; | 331 | return 0; |
275 | } | 332 | } |
276 | EXPORT_SYMBOL_GPL(bit_wait_timeout); | 333 | EXPORT_SYMBOL_GPL(bit_wait_timeout); |
@@ -278,11 +335,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout); | |||
278 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) | 335 | __sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) |
279 | { | 336 | { |
280 | unsigned long now = READ_ONCE(jiffies); | 337 | unsigned long now = READ_ONCE(jiffies); |
338 | |||
281 | if (time_after_eq(now, word->timeout)) | 339 | if (time_after_eq(now, word->timeout)) |
282 | return -EAGAIN; | 340 | return -EAGAIN; |
283 | io_schedule_timeout(word->timeout - now); | 341 | io_schedule_timeout(word->timeout - now); |
284 | if (signal_pending_state(mode, current)) | 342 | if (signal_pending_state(mode, current)) |
285 | return -EINTR; | 343 | return -EINTR; |
344 | |||
286 | return 0; | 345 | return 0; |
287 | } | 346 | } |
288 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); | 347 | EXPORT_SYMBOL_GPL(bit_wait_io_timeout); |
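The wake_up_var()/init_wait_var_entry() additions above back a wait-on-variable API; a hedged usage sketch, assuming the companion wait_var_event() macro from <linux/wait_bit.h> (not part of this hunk):

	/* Waiter: sleep until the refcount we are watching drops to zero. */
	static void wait_for_users(atomic_t *users)
	{
		wait_var_event(users, atomic_read(users) == 0);
	}

	/* Waker: after updating the variable, wake anyone sleeping on it. */
	static void put_user_ref(atomic_t *users)
	{
		if (atomic_dec_and_test(users))
			wake_up_var(users);
	}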
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 29a5733eff83..f2fa2e940fe5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -481,11 +481,18 @@ static int __init setup_tick_nohz(char *str) | |||
481 | 481 | ||
482 | __setup("nohz=", setup_tick_nohz); | 482 | __setup("nohz=", setup_tick_nohz); |
483 | 483 | ||
484 | int tick_nohz_tick_stopped(void) | 484 | bool tick_nohz_tick_stopped(void) |
485 | { | 485 | { |
486 | return __this_cpu_read(tick_cpu_sched.tick_stopped); | 486 | return __this_cpu_read(tick_cpu_sched.tick_stopped); |
487 | } | 487 | } |
488 | 488 | ||
489 | bool tick_nohz_tick_stopped_cpu(int cpu) | ||
490 | { | ||
491 | struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); | ||
492 | |||
493 | return ts->tick_stopped; | ||
494 | } | ||
495 | |||
489 | /** | 496 | /** |
490 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted | 497 | * tick_nohz_update_jiffies - update jiffies when idle was interrupted |
491 | * | 498 | * |
@@ -741,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, | |||
741 | delta = KTIME_MAX; | 748 | delta = KTIME_MAX; |
742 | } | 749 | } |
743 | 750 | ||
744 | #ifdef CONFIG_NO_HZ_FULL | ||
745 | /* Limit the tick delta to the maximum scheduler deferment */ | ||
746 | if (!ts->inidle) | ||
747 | delta = min(delta, scheduler_tick_max_deferment()); | ||
748 | #endif | ||
749 | |||
750 | /* Calculate the next expiry time */ | 751 | /* Calculate the next expiry time */ |
751 | if (delta < (KTIME_MAX - basemono)) | 752 | if (delta < (KTIME_MAX - basemono)) |
752 | expires = basemono + delta; | 753 | expires = basemono + delta; |
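tick_nohz_tick_stopped_cpu() lets one CPU query whether another CPU's tick is currently stopped; a hypothetical caller might look like this (illustrative only, not from this patch):

	static void maybe_cover_stopped_tick(int cpu)
	{
		if (!tick_nohz_tick_stopped_cpu(cpu))
			return;		/* remote tick still runs, nothing to offload */
		/* otherwise queue remote work to stand in for the stopped tick */
	}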
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6ec6ba65127b..254e636a3d6b 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void) | |||
5573 | int __init workqueue_init_early(void) | 5573 | int __init workqueue_init_early(void) |
5574 | { | 5574 | { |
5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; | 5575 | int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; |
5576 | int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ; | ||
5576 | int i, cpu; | 5577 | int i, cpu; |
5577 | 5578 | ||
5578 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); | 5579 | WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); |
5579 | 5580 | ||
5580 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); | 5581 | BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); |
5581 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); | 5582 | cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags)); |
5582 | 5583 | ||
5583 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); | 5584 | pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); |
5584 | 5585 | ||