author     Dan Williams <dan.j.williams@intel.com>  2018-04-09 13:50:17 -0400
committer  Dan Williams <dan.j.williams@intel.com>  2018-04-09 13:50:17 -0400
commit     e13e75b86ef2f88e3a47d672dd4c52a293efb95b
tree       2617aebd952d1aec09d323f6b2484b93f659e753
parent     1ed41b5696ccc3ff40a1dee39fe14eff273faf82
parent     976431b02c2ef92ae3f8b6a7d699fc554025e118
Merge branch 'for-4.17/dax' into libnvdimm-for-next
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt |  11
-rw-r--r--  drivers/dax/Kconfig |    5
-rw-r--r--  drivers/dax/super.c |   15
-rw-r--r--  drivers/md/Kconfig |    2
-rw-r--r--  drivers/md/dm-linear.c |    6
-rw-r--r--  drivers/md/dm-log-writes.c |   95
-rw-r--r--  drivers/md/dm-stripe.c |    6
-rw-r--r--  drivers/md/dm.c |   10
-rw-r--r--  drivers/nvdimm/Kconfig |    2
-rw-r--r--  drivers/s390/block/Kconfig |    2
-rw-r--r--  fs/block_dev.c |    5
-rw-r--r--  fs/dax.c |  146
-rw-r--r--  fs/ext2/ext2.h |    1
-rw-r--r--  fs/ext2/inode.c |   46
-rw-r--r--  fs/ext2/namei.c |   18
-rw-r--r--  fs/ext4/inode.c |   42
-rw-r--r--  fs/libfs.c |   39
-rw-r--r--  fs/xfs/xfs_aops.c |   34
-rw-r--r--  fs/xfs/xfs_aops.h |    1
-rw-r--r--  fs/xfs/xfs_iops.c |    5
-rw-r--r--  include/linux/dax.h |   42
-rw-r--r--  include/linux/fs.h |    4
-rw-r--r--  include/linux/sched/deadline.h |    6
-rw-r--r--  include/linux/sched/isolation.h |    1
-rw-r--r--  include/linux/sched/nohz.h |    4
-rw-r--r--  include/linux/tick.h |    4
-rw-r--r--  include/linux/wait_bit.h |   70
-rw-r--r--  kernel/sched/Makefile |    5
-rw-r--r--  kernel/sched/autogroup.c |   21
-rw-r--r--  kernel/sched/autogroup.h |   12
-rw-r--r--  kernel/sched/clock.c |   36
-rw-r--r--  kernel/sched/completion.c |    5
-rw-r--r--  kernel/sched/core.c |  165
-rw-r--r--  kernel/sched/cpuacct.c |   33
-rw-r--r--  kernel/sched/cpudeadline.c |   23
-rw-r--r--  kernel/sched/cpudeadline.h |   29
-rw-r--r--  kernel/sched/cpufreq.c |    1
-rw-r--r--  kernel/sched/cpufreq_schedutil.c |  137
-rw-r--r--  kernel/sched/cpupri.c |   15
-rw-r--r--  kernel/sched/cpupri.h |   25
-rw-r--r--  kernel/sched/cputime.c |   58
-rw-r--r--  kernel/sched/deadline.c |   78
-rw-r--r--  kernel/sched/debug.c |   99
-rw-r--r--  kernel/sched/fair.c |  299
-rw-r--r--  kernel/sched/idle.c |  142
-rw-r--r--  kernel/sched/idle_task.c |  110
-rw-r--r--  kernel/sched/isolation.c |   14
-rw-r--r--  kernel/sched/loadavg.c |   34
-rw-r--r--  kernel/sched/membarrier.c |   27
-rw-r--r--  kernel/sched/rt.c |   51
-rw-r--r--  kernel/sched/sched.h |  623
-rw-r--r--  kernel/sched/stats.c |   20
-rw-r--r--  kernel/sched/stats.h |   86
-rw-r--r--  kernel/sched/stop_task.c |   11
-rw-r--r--  kernel/sched/swait.c |    6
-rw-r--r--  kernel/sched/topology.c |   46
-rw-r--r--  kernel/sched/wait.c |   13
-rw-r--r--  kernel/sched/wait_bit.c |   71
-rw-r--r--  kernel/time/tick-sched.c |   15
-rw-r--r--  kernel/workqueue.c |    3
60 files changed, 1637 insertions, 1298 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 1d1d53f85ddd..50b9837e985b 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -1766,6 +1766,17 @@
 
 	nohz
 		Disable the tick when a single task runs.
+
+		A residual 1Hz tick is offloaded to workqueues, which you
+		need to affine to housekeeping through the global
+		workqueue's affinity configured via the
+		/sys/devices/virtual/workqueue/cpumask sysfs file, or
+		by using the 'domain' flag described below.
+
+		NOTE: by default the global workqueue runs on all CPUs,
+		so to protect individual CPUs the 'cpumask' file has to
+		be configured manually after bootup.
+
 	domain
 		Isolate from the general SMP balancing and scheduling
 		algorithms. Note that performing domain isolation this way
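
For illustration only (not part of this patch): "configured manually after bootup" amounts to writing a housekeeping-CPU mask into the sysfs file named above. A minimal userspace sketch, assuming CPUs 0-1 are the housekeeping CPUs (mask value "3" is just an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            /* keep unbound/global workqueue work on CPUs 0-1 (example mask "3") */
            int fd = open("/sys/devices/virtual/workqueue/cpumask", O_WRONLY);

            if (fd < 0) {
                    perror("open cpumask");
                    return 1;
            }
            if (write(fd, "3\n", 2) != 2)
                    perror("write cpumask");
            close(fd);
            return 0;
    }
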
diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
index b79aa8f7a497..e0700bf4893a 100644
--- a/drivers/dax/Kconfig
+++ b/drivers/dax/Kconfig
@@ -1,3 +1,7 @@
+config DAX_DRIVER
+	select DAX
+	bool
+
 menuconfig DAX
 	tristate "DAX: direct access to differentiated memory"
 	select SRCU
@@ -16,7 +20,6 @@ config DEV_DAX
 	  baseline memory pool.  Mappings of a /dev/daxX.Y device impose
 	  restrictions that make the mapping behavior deterministic.
 
-
 config DEV_DAX_PMEM
 	tristate "PMEM DAX: direct access to persistent memory"
 	depends on LIBNVDIMM && NVDIMM_DAX && DEV_DAX
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index ecdc292aa4e4..2b2332b605e4 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -124,10 +124,19 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
 		return len < 0 ? len : -EIO;
 	}
 
-	if ((IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn))
-			|| pfn_t_devmap(pfn))
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED) && pfn_t_special(pfn)) {
+		/*
+		 * An arch that has enabled the pmem api should also
+		 * have its drivers support pfn_t_devmap()
+		 *
+		 * This is a developer warning and should not trigger in
+		 * production. dax_flush() will crash since it depends
+		 * on being able to do (page_address(pfn_to_page())).
+		 */
+		WARN_ON(IS_ENABLED(CONFIG_ARCH_HAS_PMEM_API));
+	} else if (pfn_t_devmap(pfn)) {
 		/* pass */;
-	else {
+	} else {
 		pr_debug("VFS (%s): error: dax support not enabled\n",
 				sb->s_id);
 		return -EOPNOTSUPP;
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 2c8ac3688815..edff083f7c4e 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -201,7 +201,7 @@ config BLK_DEV_DM_BUILTIN
 config BLK_DEV_DM
 	tristate "Device mapper support"
 	select BLK_DEV_DM_BUILTIN
-	select DAX
+	depends on DAX || DAX=n
 	---help---
 	  Device-mapper is a low level volume manager.  It works by allowing
 	  people to specify mappings for ranges of logical sectors. Various
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index d5f8eff7c11d..89443e0ededa 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -154,6 +154,7 @@ static int linear_iterate_devices(struct dm_target *ti,
 	return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -184,6 +185,11 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define linear_dax_direct_access NULL
+#define linear_dax_copy_from_iter NULL
+#endif
+
 static struct target_type linear_target = {
 	.name = "linear",
 	.version = {1, 4, 0},
diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c
index 3362d866793b..7fcb4216973f 100644
--- a/drivers/md/dm-log-writes.c
+++ b/drivers/md/dm-log-writes.c
@@ -610,51 +610,6 @@ static int log_mark(struct log_writes_c *lc, char *data)
 	return 0;
 }
 
-static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
-		   struct iov_iter *i)
-{
-	struct pending_block *block;
-
-	if (!bytes)
-		return 0;
-
-	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
-	if (!block) {
-		DMERR("Error allocating dax pending block");
-		return -ENOMEM;
-	}
-
-	block->data = kzalloc(bytes, GFP_KERNEL);
-	if (!block->data) {
-		DMERR("Error allocating dax data space");
-		kfree(block);
-		return -ENOMEM;
-	}
-
-	/* write data provided via the iterator */
-	if (!copy_from_iter(block->data, bytes, i)) {
-		DMERR("Error copying dax data");
-		kfree(block->data);
-		kfree(block);
-		return -EIO;
-	}
-
-	/* rewind the iterator so that the block driver can use it */
-	iov_iter_revert(i, bytes);
-
-	block->datalen = bytes;
-	block->sector = bio_to_dev_sectors(lc, sector);
-	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
-
-	atomic_inc(&lc->pending_blocks);
-	spin_lock_irq(&lc->blocks_lock);
-	list_add_tail(&block->list, &lc->unflushed_blocks);
-	spin_unlock_irq(&lc->blocks_lock);
-	wake_up_process(lc->log_kthread);
-
-	return 0;
-}
-
 static void log_writes_dtr(struct dm_target *ti)
 {
 	struct log_writes_c *lc = ti->private;
@@ -920,6 +875,52 @@ static void log_writes_io_hints(struct dm_target *ti, struct queue_limits *limit
 	limits->io_min = limits->physical_block_size;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
+static int log_dax(struct log_writes_c *lc, sector_t sector, size_t bytes,
+		   struct iov_iter *i)
+{
+	struct pending_block *block;
+
+	if (!bytes)
+		return 0;
+
+	block = kzalloc(sizeof(struct pending_block), GFP_KERNEL);
+	if (!block) {
+		DMERR("Error allocating dax pending block");
+		return -ENOMEM;
+	}
+
+	block->data = kzalloc(bytes, GFP_KERNEL);
+	if (!block->data) {
+		DMERR("Error allocating dax data space");
+		kfree(block);
+		return -ENOMEM;
+	}
+
+	/* write data provided via the iterator */
+	if (!copy_from_iter(block->data, bytes, i)) {
+		DMERR("Error copying dax data");
+		kfree(block->data);
+		kfree(block);
+		return -EIO;
+	}
+
+	/* rewind the iterator so that the block driver can use it */
+	iov_iter_revert(i, bytes);
+
+	block->datalen = bytes;
+	block->sector = bio_to_dev_sectors(lc, sector);
+	block->nr_sectors = ALIGN(bytes, lc->sectorsize) >> lc->sectorshift;
+
+	atomic_inc(&lc->pending_blocks);
+	spin_lock_irq(&lc->blocks_lock);
+	list_add_tail(&block->list, &lc->unflushed_blocks);
+	spin_unlock_irq(&lc->blocks_lock);
+	wake_up_process(lc->log_kthread);
+
+	return 0;
+}
+
 static long log_writes_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 					 long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -956,6 +957,10 @@ static size_t log_writes_dax_copy_from_iter(struct dm_target *ti,
 dax_copy:
 	return dax_copy_from_iter(lc->dev->dax_dev, pgoff, addr, bytes, i);
 }
+#else
+#define log_writes_dax_direct_access NULL
+#define log_writes_dax_copy_from_iter NULL
+#endif
 
 static struct target_type log_writes_target = {
 	.name = "log-writes",
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index b5e892149c54..ac2e8ee9d586 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -311,6 +311,7 @@ static int stripe_map(struct dm_target *ti, struct bio *bio)
 	return DM_MAPIO_REMAPPED;
 }
 
+#if IS_ENABLED(CONFIG_DAX_DRIVER)
 static long stripe_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 		long nr_pages, void **kaddr, pfn_t *pfn)
 {
@@ -351,6 +352,11 @@ static size_t stripe_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
 	return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
 
+#else
+#define stripe_dax_direct_access NULL
+#define stripe_dax_copy_from_iter NULL
+#endif
+
 /*
  * Stripe status:
  *
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 45328d8b2859..bac79f40f3cb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1805,7 +1805,7 @@ static void cleanup_mapped_device(struct mapped_device *md)
 static struct mapped_device *alloc_dev(int minor)
 {
 	int r, numa_node_id = dm_get_numa_node();
-	struct dax_device *dax_dev;
+	struct dax_device *dax_dev = NULL;
 	struct mapped_device *md;
 	void *old_md;
 
@@ -1871,9 +1871,11 @@ static struct mapped_device *alloc_dev(int minor)
 	md->disk->private_data = md;
 	sprintf(md->disk->disk_name, "dm-%d", minor);
 
-	dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
-	if (!dax_dev)
-		goto bad;
+	if (IS_ENABLED(CONFIG_DAX_DRIVER)) {
+		dax_dev = alloc_dax(md, md->disk->disk_name, &dm_dax_ops);
+		if (!dax_dev)
+			goto bad;
+	}
 	md->dax_dev = dax_dev;
 
 	add_disk_no_queue_reg(md->disk);
diff --git a/drivers/nvdimm/Kconfig b/drivers/nvdimm/Kconfig
index f6c533c4d09b..85997184e047 100644
--- a/drivers/nvdimm/Kconfig
+++ b/drivers/nvdimm/Kconfig
@@ -20,7 +20,7 @@ if LIBNVDIMM
 config BLK_DEV_PMEM
 	tristate "PMEM: Persistent memory block device support"
 	default LIBNVDIMM
-	select DAX
+	select DAX_DRIVER
 	select ND_BTT if BTT
 	select ND_PFN if NVDIMM_PFN
 	help
diff --git a/drivers/s390/block/Kconfig b/drivers/s390/block/Kconfig
index 1444333210c7..9ac7574e3cfb 100644
--- a/drivers/s390/block/Kconfig
+++ b/drivers/s390/block/Kconfig
@@ -15,8 +15,8 @@ config BLK_DEV_XPRAM
 
 config DCSSBLK
 	def_tristate m
-	select DAX
 	select FS_DAX_LIMITED
+	select DAX_DRIVER
 	prompt "DCSSBLK support"
 	depends on S390 && BLOCK
 	help
diff --git a/fs/block_dev.c b/fs/block_dev.c
index fe09ef9c21f3..846ee2d31781 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1946,11 +1946,6 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 static int blkdev_writepages(struct address_space *mapping,
 			     struct writeback_control *wbc)
 {
-	if (dax_mapping(mapping)) {
-		struct block_device *bdev = I_BDEV(mapping->host);
-
-		return dax_writeback_mapping_range(mapping, bdev, wbc);
-	}
 	return generic_writepages(mapping, wbc);
 }
 
diff --git a/fs/dax.c b/fs/dax.c
index 0276df90e86c..a77394fe586e 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -73,16 +73,15 @@ fs_initcall(init_dax_wait_table);
 #define RADIX_DAX_ZERO_PAGE	(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
 #define RADIX_DAX_EMPTY		(1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
 
-static unsigned long dax_radix_sector(void *entry)
+static unsigned long dax_radix_pfn(void *entry)
 {
 	return (unsigned long)entry >> RADIX_DAX_SHIFT;
 }
 
-static void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
+static void *dax_radix_locked_entry(unsigned long pfn, unsigned long flags)
 {
 	return (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | flags |
-			((unsigned long)sector << RADIX_DAX_SHIFT) |
-			RADIX_DAX_ENTRY_LOCK);
+			(pfn << RADIX_DAX_SHIFT) | RADIX_DAX_ENTRY_LOCK);
 }
 
 static unsigned int dax_radix_order(void *entry)
@@ -299,6 +298,63 @@ static void put_unlocked_mapping_entry(struct address_space *mapping,
 	dax_wake_mapping_entry_waiter(mapping, index, entry, false);
 }
 
+static unsigned long dax_entry_size(void *entry)
+{
+	if (dax_is_zero_entry(entry))
+		return 0;
+	else if (dax_is_empty_entry(entry))
+		return 0;
+	else if (dax_is_pmd_entry(entry))
+		return PMD_SIZE;
+	else
+		return PAGE_SIZE;
+}
+
+static unsigned long dax_radix_end_pfn(void *entry)
+{
+	return dax_radix_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
+}
+
+/*
+ * Iterate through all mapped pfns represented by an entry, i.e. skip
+ * 'empty' and 'zero' entries.
+ */
+#define for_each_mapped_pfn(entry, pfn) \
+	for (pfn = dax_radix_pfn(entry); \
+			pfn < dax_radix_end_pfn(entry); pfn++)
+
+static void dax_associate_entry(void *entry, struct address_space *mapping)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(page->mapping);
+		page->mapping = mapping;
+	}
+}
+
+static void dax_disassociate_entry(void *entry, struct address_space *mapping,
+		bool trunc)
+{
+	unsigned long pfn;
+
+	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
+		return;
+
+	for_each_mapped_pfn(entry, pfn) {
+		struct page *page = pfn_to_page(pfn);
+
+		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
+		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
+		page->mapping = NULL;
+	}
+}
+
 /*
  * Find radix tree entry at given index. If it points to an exceptional entry,
  * return it with the radix tree entry locked. If the radix tree doesn't
@@ -405,6 +461,7 @@ restart:
 	}
 
 	if (pmd_downgrade) {
+		dax_disassociate_entry(entry, mapping, false);
 		radix_tree_delete(&mapping->page_tree, index);
 		mapping->nrexceptional--;
 		dax_wake_mapping_entry_waiter(mapping, index, entry,
@@ -454,6 +511,7 @@ static int __dax_invalidate_mapping_entry(struct address_space *mapping,
 	    (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) ||
 	     radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)))
 		goto out;
+	dax_disassociate_entry(entry, mapping, trunc);
 	radix_tree_delete(page_tree, index);
 	mapping->nrexceptional--;
 	ret = 1;
@@ -526,12 +584,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
  */
 static void *dax_insert_mapping_entry(struct address_space *mapping,
 				      struct vm_fault *vmf,
-				      void *entry, sector_t sector,
+				      void *entry, pfn_t pfn_t,
 				      unsigned long flags, bool dirty)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *new_entry;
+	unsigned long pfn = pfn_t_to_pfn(pfn_t);
 	pgoff_t index = vmf->pgoff;
+	void *new_entry;
 
 	if (dirty)
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -546,7 +605,11 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
 	}
 
 	spin_lock_irq(&mapping->tree_lock);
-	new_entry = dax_radix_locked_entry(sector, flags);
+	new_entry = dax_radix_locked_entry(pfn, flags);
+	if (dax_entry_size(entry) != dax_entry_size(new_entry)) {
+		dax_disassociate_entry(entry, mapping, false);
+		dax_associate_entry(new_entry, mapping);
+	}
 
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		/*
@@ -657,17 +720,14 @@ unlock_pte:
 	i_mmap_unlock_read(mapping);
 }
 
-static int dax_writeback_one(struct block_device *bdev,
-		struct dax_device *dax_dev, struct address_space *mapping,
-		pgoff_t index, void *entry)
+static int dax_writeback_one(struct dax_device *dax_dev,
+		struct address_space *mapping, pgoff_t index, void *entry)
 {
 	struct radix_tree_root *page_tree = &mapping->page_tree;
-	void *entry2, **slot, *kaddr;
-	long ret = 0, id;
-	sector_t sector;
-	pgoff_t pgoff;
+	void *entry2, **slot;
+	unsigned long pfn;
+	long ret = 0;
 	size_t size;
-	pfn_t pfn;
 
 	/*
 	 * A page got tagged dirty in DAX mapping? Something is seriously
@@ -683,10 +743,10 @@ static int dax_writeback_one(struct block_device *bdev,
 		goto put_unlocked;
 	/*
 	 * Entry got reallocated elsewhere? No need to writeback. We have to
-	 * compare sectors as we must not bail out due to difference in lockbit
+	 * compare pfns as we must not bail out due to difference in lockbit
 	 * or entry type.
 	 */
-	if (dax_radix_sector(entry2) != dax_radix_sector(entry))
+	if (dax_radix_pfn(entry2) != dax_radix_pfn(entry))
 		goto put_unlocked;
 	if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
 				dax_is_zero_entry(entry))) {
@@ -712,33 +772,15 @@ static int dax_writeback_one(struct block_device *bdev,
 	/*
 	 * Even if dax_writeback_mapping_range() was given a wbc->range_start
 	 * in the middle of a PMD, the 'index' we are given will be aligned to
-	 * the start index of the PMD, as will the sector we pull from
-	 * 'entry'. This allows us to flush for PMD_SIZE and not have to
-	 * worry about partial PMD writebacks.
+	 * the start index of the PMD, as will the pfn we pull from 'entry'.
+	 * This allows us to flush for PMD_SIZE and not have to worry about
+	 * partial PMD writebacks.
 	 */
-	sector = dax_radix_sector(entry);
+	pfn = dax_radix_pfn(entry);
 	size = PAGE_SIZE << dax_radix_order(entry);
 
-	id = dax_read_lock();
-	ret = bdev_dax_pgoff(bdev, sector, size, &pgoff);
-	if (ret)
-		goto dax_unlock;
-
-	/*
-	 * dax_direct_access() may sleep, so cannot hold tree_lock over
-	 * its invocation.
-	 */
-	ret = dax_direct_access(dax_dev, pgoff, size / PAGE_SIZE, &kaddr, &pfn);
-	if (ret < 0)
-		goto dax_unlock;
-
-	if (WARN_ON_ONCE(ret < size / PAGE_SIZE)) {
-		ret = -EIO;
-		goto dax_unlock;
-	}
-
-	dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(pfn));
-	dax_flush(dax_dev, kaddr, size);
+	dax_mapping_entry_mkclean(mapping, index, pfn);
+	dax_flush(dax_dev, page_address(pfn_to_page(pfn)), size);
 	/*
 	 * After we have flushed the cache, we can clear the dirty tag. There
 	 * cannot be new dirty data in the pfn after the flush has completed as
@@ -749,8 +791,6 @@ static int dax_writeback_one(struct block_device *bdev,
 	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY);
 	spin_unlock_irq(&mapping->tree_lock);
 	trace_dax_writeback_one(mapping->host, index, size >> PAGE_SHIFT);
- dax_unlock:
-	dax_read_unlock(id);
 	put_locked_mapping_entry(mapping, index);
 	return ret;
 
@@ -808,8 +848,8 @@ int dax_writeback_mapping_range(struct address_space *mapping,
 				break;
 			}
 
-			ret = dax_writeback_one(bdev, dax_dev, mapping,
-					indices[i], pvec.pages[i]);
+			ret = dax_writeback_one(dax_dev, mapping, indices[i],
+					pvec.pages[i]);
 			if (ret < 0) {
 				mapping_set_error(mapping, ret);
 				goto out;
@@ -877,6 +917,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 	int ret = VM_FAULT_NOPAGE;
 	struct page *zero_page;
 	void *entry2;
+	pfn_t pfn;
 
 	zero_page = ZERO_PAGE(0);
 	if (unlikely(!zero_page)) {
@@ -884,14 +925,15 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
 		goto out;
 	}
 
-	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	entry2 = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(entry2)) {
 		ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
-	vm_insert_mixed(vmf->vma, vaddr, page_to_pfn_t(zero_page));
+	vm_insert_mixed(vmf->vma, vaddr, pfn);
 out:
 	trace_dax_load_hole(inode, vmf, ret);
 	return ret;
@@ -1200,8 +1242,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto error_finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-				dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 				0, write && !sync);
 		if (IS_ERR(entry)) {
 			error = PTR_ERR(entry);
@@ -1280,13 +1321,15 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
 	void *ret = NULL;
 	spinlock_t *ptl;
 	pmd_t pmd_entry;
+	pfn_t pfn;
 
 	zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm);
 
 	if (unlikely(!zero_page))
 		goto fallback;
 
-	ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
+	pfn = page_to_pfn_t(zero_page);
+	ret = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 			RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
 	if (IS_ERR(ret))
 		goto fallback;
@@ -1409,8 +1452,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
 		if (error < 0)
 			goto finish_iomap;
 
-		entry = dax_insert_mapping_entry(mapping, vmf, entry,
-				dax_iomap_sector(&iomap, pos),
+		entry = dax_insert_mapping_entry(mapping, vmf, entry, pfn,
 				RADIX_DAX_PMD, write && !sync);
 		if (IS_ERR(entry))
 			goto finish_iomap;
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 032295e1d386..cc40802ddfa8 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -814,6 +814,7 @@ extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 
 /* inode.c */
+extern void ext2_set_file_ops(struct inode *inode);
 extern const struct address_space_operations ext2_aops;
 extern const struct address_space_operations ext2_nobh_aops;
 extern const struct iomap_ops ext2_iomap_ops;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 9b2ac55ac34f..1e01fabef130 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -940,9 +940,6 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	loff_t offset = iocb->ki_pos;
 	ssize_t ret;
 
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return -EIO;
-
 	ret = blockdev_direct_IO(iocb, inode, iter, ext2_get_block);
 	if (ret < 0 && iov_iter_rw(iter) == WRITE)
 		ext2_write_failed(mapping, offset + count);
@@ -952,17 +949,16 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
-#ifdef CONFIG_FS_DAX
-	if (dax_mapping(mapping)) {
-		return dax_writeback_mapping_range(mapping,
-				mapping->host->i_sb->s_bdev,
-				wbc);
-	}
-#endif
-
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
+static int
+ext2_dax_writepages(struct address_space *mapping, struct writeback_control *wbc)
+{
+	return dax_writeback_mapping_range(mapping,
+			mapping->host->i_sb->s_bdev, wbc);
+}
+
 const struct address_space_operations ext2_aops = {
 	.readpage = ext2_readpage,
 	.readpages = ext2_readpages,
@@ -990,6 +986,13 @@ const struct address_space_operations ext2_nobh_aops = {
 	.error_remove_page = generic_error_remove_page,
 };
 
+static const struct address_space_operations ext2_dax_aops = {
+	.writepages = ext2_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
+
 /*
  * Probably it should be a library function... search for first non-zero word
  * or memcmp with zero_page, whatever is better for particular architecture.
@@ -1388,6 +1391,18 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DAX;
 }
 
+void ext2_set_file_ops(struct inode *inode)
+{
+	inode->i_op = &ext2_file_inode_operations;
+	inode->i_fop = &ext2_file_operations;
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext2_dax_aops;
+	else if (test_opt(inode->i_sb, NOBH))
+		inode->i_mapping->a_ops = &ext2_nobh_aops;
+	else
+		inode->i_mapping->a_ops = &ext2_aops;
+}
+
 struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 {
 	struct ext2_inode_info *ei;
@@ -1480,14 +1495,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 		ei->i_data[n] = raw_inode->i_block[n];
 
 	if (S_ISREG(inode->i_mode)) {
-		inode->i_op = &ext2_file_inode_operations;
-		if (test_opt(inode->i_sb, NOBH)) {
-			inode->i_mapping->a_ops = &ext2_nobh_aops;
-			inode->i_fop = &ext2_file_operations;
-		} else {
-			inode->i_mapping->a_ops = &ext2_aops;
-			inode->i_fop = &ext2_file_operations;
-		}
+		ext2_set_file_ops(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext2_dir_inode_operations;
 		inode->i_fop = &ext2_dir_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index e078075dc66f..55f7caadb093 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -107,14 +107,7 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	return ext2_add_nondir(dentry, inode);
 }
@@ -125,14 +118,7 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &ext2_file_inode_operations;
-	if (test_opt(inode->i_sb, NOBH)) {
-		inode->i_mapping->a_ops = &ext2_nobh_aops;
-		inode->i_fop = &ext2_file_operations;
-	} else {
-		inode->i_mapping->a_ops = &ext2_aops;
-		inode->i_fop = &ext2_file_operations;
-	}
+	ext2_set_file_ops(inode);
 	mark_inode_dirty(inode);
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c94780075b04..249a97b19181 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2725,12 +2725,6 @@ static int ext4_writepages(struct address_space *mapping,
 	percpu_down_read(&sbi->s_journal_flag_rwsem);
 	trace_ext4_writepages(inode, wbc);
 
-	if (dax_mapping(mapping)) {
-		ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
-						  wbc);
-		goto out_writepages;
-	}
-
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
@@ -2955,6 +2949,27 @@ out_writepages:
 	return ret;
 }
 
+static int ext4_dax_writepages(struct address_space *mapping,
+			       struct writeback_control *wbc)
+{
+	int ret;
+	long nr_to_write = wbc->nr_to_write;
+	struct inode *inode = mapping->host;
+	struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb);
+
+	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
+		return -EIO;
+
+	percpu_down_read(&sbi->s_journal_flag_rwsem);
+	trace_ext4_writepages(inode, wbc);
+
+	ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, wbc);
+	trace_ext4_writepages_result(inode, wbc, ret,
+				     nr_to_write - wbc->nr_to_write);
+	percpu_up_read(&sbi->s_journal_flag_rwsem);
+	return ret;
+}
+
 static int ext4_nonda_switch(struct super_block *sb)
 {
 	s64 free_clusters, dirty_clusters;
@@ -3857,10 +3872,6 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 	if (ext4_has_inline_data(inode))
 		return 0;
 
-	/* DAX uses iomap path now */
-	if (WARN_ON_ONCE(IS_DAX(inode)))
-		return 0;
-
 	trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 	if (iov_iter_rw(iter) == READ)
 		ret = ext4_direct_IO_read(iocb, iter);
@@ -3946,6 +3957,13 @@ static const struct address_space_operations ext4_da_aops = {
 	.error_remove_page = generic_error_remove_page,
 };
 
+static const struct address_space_operations ext4_dax_aops = {
+	.writepages = ext4_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
+
 void ext4_set_aops(struct inode *inode)
 {
 	switch (ext4_inode_journal_mode(inode)) {
@@ -3958,7 +3976,9 @@ void ext4_set_aops(struct inode *inode)
 	default:
 		BUG();
 	}
-	if (test_opt(inode->i_sb, DELALLOC))
+	if (IS_DAX(inode))
+		inode->i_mapping->a_ops = &ext4_dax_aops;
+	else if (test_opt(inode->i_sb, DELALLOC))
 		inode->i_mapping->a_ops = &ext4_da_aops;
 	else
 		inode->i_mapping->a_ops = &ext4_aops;
diff --git a/fs/libfs.c b/fs/libfs.c
index 7ff3cb904acd..0fb590d79f30 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -1060,6 +1060,45 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
+int noop_set_page_dirty(struct page *page)
+{
+	/*
+	 * Unlike __set_page_dirty_no_writeback that handles dirty page
+	 * tracking in the page object, dax does all dirty tracking in
+	 * the inode address_space in response to mkwrite faults. In the
+	 * dax case we only need to worry about potentially dirty CPU
+	 * caches, not dirty page cache pages to write back.
+	 *
+	 * This callback is defined to prevent fallback to
+	 * __set_page_dirty_buffers() in set_page_dirty().
+	 */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(noop_set_page_dirty);
+
+void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length)
+{
+	/*
+	 * There is no page cache to invalidate in the dax case, however
+	 * we need this callback defined to prevent falling back to
+	 * block_invalidatepage() in do_invalidatepage().
+	 */
+}
+EXPORT_SYMBOL_GPL(noop_invalidatepage);
+
+ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+	/*
+	 * iomap based filesystems support direct I/O without need for
+	 * this callback. However, it still needs to be set in
+	 * inode->a_ops so that open/fcntl know that direct I/O is
+	 * generally supported.
+	 */
+	return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(noop_direct_IO);
+
 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */
 void kfree_link(void *p)
 {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 9c6a830da0ee..e7a56c4786ff 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1194,16 +1194,22 @@ xfs_vm_writepages(
 	int ret;
 
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	if (dax_mapping(mapping))
-		return dax_writeback_mapping_range(mapping,
-				xfs_find_bdev_for_inode(mapping->host), wbc);
-
 	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 	if (wpc.ioend)
 		ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 	return ret;
 }
 
+STATIC int
+xfs_dax_writepages(
+	struct address_space *mapping,
+	struct writeback_control *wbc)
+{
+	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	return dax_writeback_mapping_range(mapping,
+			xfs_find_bdev_for_inode(mapping->host), wbc);
+}
+
 /*
  * Called to move a page into cleanable state - and from there
  * to be released. The page should already be clean. We always
@@ -1367,17 +1373,6 @@ out_unlock:
 	return error;
 }
 
-STATIC ssize_t
-xfs_vm_direct_IO(
-	struct kiocb *iocb,
-	struct iov_iter *iter)
-{
-	/*
-	 * We just need the method present so that open/fcntl allow direct I/O.
-	 */
-	return -EINVAL;
-}
-
 STATIC sector_t
 xfs_vm_bmap(
 	struct address_space *mapping,
@@ -1500,8 +1495,15 @@ const struct address_space_operations xfs_address_space_operations = {
 	.releasepage = xfs_vm_releasepage,
 	.invalidatepage = xfs_vm_invalidatepage,
 	.bmap = xfs_vm_bmap,
-	.direct_IO = xfs_vm_direct_IO,
+	.direct_IO = noop_direct_IO,
 	.migratepage = buffer_migrate_page,
 	.is_partially_uptodate = block_is_partially_uptodate,
 	.error_remove_page = generic_error_remove_page,
 };
+
+const struct address_space_operations xfs_dax_aops = {
+	.writepages = xfs_dax_writepages,
+	.direct_IO = noop_direct_IO,
+	.set_page_dirty = noop_set_page_dirty,
+	.invalidatepage = noop_invalidatepage,
+};
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 88c85ea63da0..69346d460dfa 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -54,6 +54,7 @@ struct xfs_ioend {
 };
 
 extern const struct address_space_operations xfs_address_space_operations;
+extern const struct address_space_operations xfs_dax_aops;
 
 int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size);
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 56475fcd76f2..951e84df5576 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1272,7 +1272,10 @@ xfs_setup_iops(
 	case S_IFREG:
 		inode->i_op = &xfs_inode_operations;
 		inode->i_fop = &xfs_file_operations;
-		inode->i_mapping->a_ops = &xfs_address_space_operations;
+		if (IS_DAX(inode))
+			inode->i_mapping->a_ops = &xfs_dax_aops;
+		else
+			inode->i_mapping->a_ops = &xfs_address_space_operations;
 		break;
 	case S_IFDIR:
 		if (xfs_sb_version_hasasciici(&XFS_M(inode->i_sb)->m_sb))
diff --git a/include/linux/dax.h b/include/linux/dax.h
index 0185ecdae135..f9eb22ad341e 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -26,18 +26,42 @@ extern struct attribute_group dax_attribute_group;
 
 #if IS_ENABLED(CONFIG_DAX)
 struct dax_device *dax_get_by_host(const char *host);
+struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops);
 void put_dax(struct dax_device *dax_dev);
+void kill_dax(struct dax_device *dax_dev);
+void dax_write_cache(struct dax_device *dax_dev, bool wc);
+bool dax_write_cache_enabled(struct dax_device *dax_dev);
 #else
 static inline struct dax_device *dax_get_by_host(const char *host)
 {
 	return NULL;
 }
-
+static inline struct dax_device *alloc_dax(void *private, const char *host,
+		const struct dax_operations *ops)
+{
+	/*
+	 * Callers should check IS_ENABLED(CONFIG_DAX) to know if this
+	 * NULL is an error or expected.
+	 */
+	return NULL;
+}
 static inline void put_dax(struct dax_device *dax_dev)
 {
 }
+static inline void kill_dax(struct dax_device *dax_dev)
+{
+}
+static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
+{
+}
+static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
+{
+	return false;
+}
 #endif
 
+struct writeback_control;
 int bdev_dax_pgoff(struct block_device *, sector_t, size_t, pgoff_t *pgoff);
 #if IS_ENABLED(CONFIG_FS_DAX)
 int __bdev_dax_supported(struct super_block *sb, int blocksize);
@@ -57,6 +81,8 @@ static inline void fs_put_dax(struct dax_device *dax_dev)
 }
 
 struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev);
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc);
 #else
 static inline int bdev_dax_supported(struct super_block *sb, int blocksize)
 {
@@ -76,22 +102,23 @@ static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev)
 {
 	return NULL;
 }
+
+static inline int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc)
+{
+	return -EOPNOTSUPP;
+}
 #endif
 
 int dax_read_lock(void);
 void dax_read_unlock(int id);
-struct dax_device *alloc_dax(void *private, const char *host,
-		const struct dax_operations *ops);
 bool dax_alive(struct dax_device *dax_dev);
-void kill_dax(struct dax_device *dax_dev);
 void *dax_get_private(struct dax_device *dax_dev);
 long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
 		void **kaddr, pfn_t *pfn);
 size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
 		size_t bytes, struct iov_iter *i);
 void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
-void dax_write_cache(struct dax_device *dax_dev, bool wc);
-bool dax_write_cache_enabled(struct dax_device *dax_dev);
 
 ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
 		const struct iomap_ops *ops);
@@ -121,7 +148,4 @@ static inline bool dax_mapping(struct address_space *mapping)
 	return mapping->host && IS_DAX(mapping->host);
 }
 
-struct writeback_control;
-int dax_writeback_mapping_range(struct address_space *mapping,
-		struct block_device *bdev, struct writeback_control *wbc);
 #endif
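
For illustration only (not part of this patch): because alloc_dax(), kill_dax(), put_dax() and the write-cache helpers are now declared for both CONFIG_DAX=y and CONFIG_DAX=n, a block driver can keep the calls unconditional in the source and only gate behaviour on IS_ENABLED(), as the dm.c hunk above does. A minimal sketch of that lifecycle; my_dax_ops, my_host, my_data and the function names are placeholders, and <linux/dax.h> is assumed:

    /* Sketch only: placeholder dax_operations table (.direct_access, .copy_from_iter, ...). */
    static const struct dax_operations my_dax_ops;
    static struct dax_device *my_dax;

    static int my_register_dax(void *my_data, const char *my_host)
    {
            if (!IS_ENABLED(CONFIG_DAX_DRIVER))
                    return 0;       /* the stub alloc_dax() returns NULL; treat as "no DAX" */

            my_dax = alloc_dax(my_data, my_host, &my_dax_ops);
            if (!my_dax)
                    return -ENOMEM;
            return 0;
    }

    static void my_unregister_dax(void)
    {
            if (my_dax) {
                    kill_dax(my_dax);       /* invalidate the device for dax_alive() users */
                    put_dax(my_dax);        /* drop the reference taken by alloc_dax() */
            }
    }
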
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c6baf767619e..a3bb2aedbc2b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -3130,6 +3130,10 @@ extern int simple_rmdir(struct inode *, struct dentry *);
 extern int simple_rename(struct inode *, struct dentry *,
 			 struct inode *, struct dentry *, unsigned int);
 extern int noop_fsync(struct file *, loff_t, loff_t, int);
+extern int noop_set_page_dirty(struct page *page);
+extern void noop_invalidatepage(struct page *page, unsigned int offset,
+		unsigned int length);
+extern ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter);
 extern int simple_empty(struct dentry *);
 extern int simple_readpage(struct file *file, struct page *page);
 extern int simple_write_begin(struct file *file, struct address_space *mapping,
diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h
index a5bc8728ead7..0cb034331cbb 100644
--- a/include/linux/sched/deadline.h
+++ b/include/linux/sched/deadline.h
@@ -1,8 +1,4 @@
 /* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINUX_SCHED_DEADLINE_H
-#define _LINUX_SCHED_DEADLINE_H
-
-#include <linux/sched.h>
 
 /*
  * SCHED_DEADLINE tasks has negative priorities, reflecting
@@ -28,5 +24,3 @@ static inline bool dl_time_before(u64 a, u64 b)
 {
 	return (s64)(a - b) < 0;
 }
-
-#endif /* _LINUX_SCHED_DEADLINE_H */
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index d849431c8060..4a6582c27dea 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h
@@ -12,6 +12,7 @@ enum hk_flags {
 	HK_FLAG_SCHED = (1 << 3),
 	HK_FLAG_TICK = (1 << 4),
 	HK_FLAG_DOMAIN = (1 << 5),
+	HK_FLAG_WQ = (1 << 6),
 };
 
 #ifdef CONFIG_CPU_ISOLATION
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index 3d3a97d9399d..094217273ff9 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -37,8 +37,4 @@ extern void wake_up_nohz_cpu(int cpu);
 static inline void wake_up_nohz_cpu(int cpu) { }
 #endif
 
-#ifdef CONFIG_NO_HZ_FULL
-extern u64 scheduler_tick_max_deferment(void);
-#endif
-
 #endif /* _LINUX_SCHED_NOHZ_H */
diff --git a/include/linux/tick.h b/include/linux/tick.h
index 7cc35921218e..7f8c9a127f5a 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -113,7 +113,8 @@ enum tick_dep_bits {
 
 #ifdef CONFIG_NO_HZ_COMMON
 extern bool tick_nohz_enabled;
-extern int tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped(void);
+extern bool tick_nohz_tick_stopped_cpu(int cpu);
 extern void tick_nohz_idle_enter(void);
 extern void tick_nohz_idle_exit(void);
 extern void tick_nohz_irq_exit(void);
@@ -125,6 +126,7 @@ extern u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time);
 #else /* !CONFIG_NO_HZ_COMMON */
 #define tick_nohz_enabled (0)
 static inline int tick_nohz_tick_stopped(void) { return 0; }
+static inline int tick_nohz_tick_stopped_cpu(int cpu) { return 0; }
 static inline void tick_nohz_idle_enter(void) { }
 static inline void tick_nohz_idle_exit(void) { }
 
diff --git a/include/linux/wait_bit.h b/include/linux/wait_bit.h
index 61b39eaf7cad..3fcdb75d69cf 100644
--- a/include/linux/wait_bit.h
+++ b/include/linux/wait_bit.h
@@ -262,4 +262,74 @@ int wait_on_atomic_t(atomic_t *val, wait_atomic_t_action_f action, unsigned mode
 	return out_of_line_wait_on_atomic_t(val, action, mode);
 }
 
+extern void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags);
+extern void wake_up_var(void *var);
+extern wait_queue_head_t *__var_waitqueue(void *p);
+
+#define ___wait_var_event(var, condition, state, exclusive, ret, cmd) \
+({ \
+	__label__ __out; \
+	struct wait_queue_head *__wq_head = __var_waitqueue(var); \
+	struct wait_bit_queue_entry __wbq_entry; \
+	long __ret = ret; /* explicit shadow */ \
+ \
+	init_wait_var_entry(&__wbq_entry, var, \
+			exclusive ? WQ_FLAG_EXCLUSIVE : 0); \
+	for (;;) { \
+		long __int = prepare_to_wait_event(__wq_head, \
+				&__wbq_entry.wq_entry, \
+				state); \
+		if (condition) \
+			break; \
+ \
+		if (___wait_is_interruptible(state) && __int) { \
+			__ret = __int; \
+			goto __out; \
+		} \
+ \
+		cmd; \
+	} \
+	finish_wait(__wq_head, &__wbq_entry.wq_entry); \
+__out:	__ret; \
+})
+
+#define __wait_var_event(var, condition) \
+	___wait_var_event(var, condition, TASK_UNINTERRUPTIBLE, 0, 0, \
+			schedule())
+
+#define wait_var_event(var, condition) \
+do { \
+	might_sleep(); \
+	if (condition) \
+		break; \
+	__wait_var_event(var, condition); \
+} while (0)
+
+#define __wait_var_event_killable(var, condition) \
+	___wait_var_event(var, condition, TASK_KILLABLE, 0, 0, \
+			schedule())
+
+#define wait_var_event_killable(var, condition) \
+({ \
+	int __ret = 0; \
+	might_sleep(); \
+	if (!(condition)) \
+		__ret = __wait_var_event_killable(var, condition); \
+	__ret; \
+})
+
+#define __wait_var_event_timeout(var, condition, timeout) \
+	___wait_var_event(var, ___wait_cond_timeout(condition), \
+			TASK_UNINTERRUPTIBLE, 0, timeout, \
+			__ret = schedule_timeout(__ret))
+
+#define wait_var_event_timeout(var, condition, timeout) \
+({ \
+	long __ret = timeout; \
+	might_sleep(); \
+	if (!___wait_cond_timeout(condition)) \
+		__ret = __wait_var_event_timeout(var, condition, timeout); \
+	__ret; \
+})
+
 #endif /* _LINUX_WAIT_BIT_H */
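
For illustration only (not part of this patch): the new wait_var_event()/wake_up_var() helpers let a caller sleep on the address of an arbitrary variable instead of declaring a dedicated waitqueue or using wait_on_atomic_t(). A minimal usage sketch; struct my_ctx and its field are placeholders, and <linux/wait_bit.h> plus <linux/atomic.h> are assumed:

    struct my_ctx {
            atomic_t busy;          /* outstanding users of this context */
    };

    static void my_ctx_put(struct my_ctx *ctx)
    {
            if (atomic_dec_and_test(&ctx->busy))
                    wake_up_var(&ctx->busy);        /* waiters are keyed on the variable's address */
    }

    static void my_ctx_drain(struct my_ctx *ctx)
    {
            /* sleep until the last my_ctx_put() drops 'busy' to zero */
            wait_var_event(&ctx->busy, atomic_read(&ctx->busy) == 0);
    }
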
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e2f9d4feff40..d9a02b318108 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -17,8 +17,9 @@ CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer
 endif
 
 obj-y += core.o loadavg.o clock.o cputime.o
-obj-y += idle_task.o fair.o rt.o deadline.o
-obj-y += wait.o wait_bit.o swait.o completion.o idle.o
+obj-y += idle.o fair.o rt.o deadline.o
+obj-y += wait.o wait_bit.o swait.o completion.o
+
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index bb4b9fe026a1..6be6c575b6cd 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -1,10 +1,7 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/proc_fs.h> 2/*
3#include <linux/seq_file.h> 3 * Auto-group scheduling implementation:
4#include <linux/utsname.h> 4 */
5#include <linux/security.h>
6#include <linux/export.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1; 7unsigned int __read_mostly sysctl_sched_autogroup_enabled = 1;
@@ -168,18 +165,19 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
168 autogroup_kref_put(prev); 165 autogroup_kref_put(prev);
169} 166}
170 167
171/* Allocates GFP_KERNEL, cannot be called under any spinlock */ 168/* Allocates GFP_KERNEL, cannot be called under any spinlock: */
172void sched_autogroup_create_attach(struct task_struct *p) 169void sched_autogroup_create_attach(struct task_struct *p)
173{ 170{
174 struct autogroup *ag = autogroup_create(); 171 struct autogroup *ag = autogroup_create();
175 172
176 autogroup_move_group(p, ag); 173 autogroup_move_group(p, ag);
177 /* drop extra reference added by autogroup_create() */ 174
175 /* Drop extra reference added by autogroup_create(): */
178 autogroup_kref_put(ag); 176 autogroup_kref_put(ag);
179} 177}
180EXPORT_SYMBOL(sched_autogroup_create_attach); 178EXPORT_SYMBOL(sched_autogroup_create_attach);
181 179
182/* Cannot be called under siglock. Currently has no users */ 180/* Cannot be called under siglock. Currently has no users: */
183void sched_autogroup_detach(struct task_struct *p) 181void sched_autogroup_detach(struct task_struct *p)
184{ 182{
185 autogroup_move_group(p, &autogroup_default); 183 autogroup_move_group(p, &autogroup_default);
@@ -202,7 +200,6 @@ static int __init setup_autogroup(char *str)
202 200
203 return 1; 201 return 1;
204} 202}
205
206__setup("noautogroup", setup_autogroup); 203__setup("noautogroup", setup_autogroup);
207 204
208#ifdef CONFIG_PROC_FS 205#ifdef CONFIG_PROC_FS
@@ -224,7 +221,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice)
224 if (nice < 0 && !can_nice(current, nice)) 221 if (nice < 0 && !can_nice(current, nice))
225 return -EPERM; 222 return -EPERM;
226 223
227 /* this is a heavy operation taking global locks.. */ 224 /* This is a heavy operation, taking global locks.. */
228 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next)) 225 if (!capable(CAP_SYS_ADMIN) && time_before(jiffies, next))
229 return -EAGAIN; 226 return -EAGAIN;
230 227
@@ -267,4 +264,4 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
267 264
268 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id); 265 return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
269} 266}
270#endif /* CONFIG_SCHED_DEBUG */ 267#endif
diff --git a/kernel/sched/autogroup.h b/kernel/sched/autogroup.h
index 27cd22b89824..b96419974a1f 100644
--- a/kernel/sched/autogroup.h
+++ b/kernel/sched/autogroup.h
@@ -1,15 +1,11 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifdef CONFIG_SCHED_AUTOGROUP 2#ifdef CONFIG_SCHED_AUTOGROUP
3 3
4#include <linux/kref.h>
5#include <linux/rwsem.h>
6#include <linux/sched/autogroup.h>
7
8struct autogroup { 4struct autogroup {
9 /* 5 /*
10 * reference doesn't mean how many thread attach to this 6 * Reference doesn't mean how many threads attach to this
11 * autogroup now. It just stands for the number of task 7 * autogroup now. It just stands for the number of tasks
12 * could use this autogroup. 8 * which could use this autogroup.
13 */ 9 */
14 struct kref kref; 10 struct kref kref;
15 struct task_group *tg; 11 struct task_group *tg;
@@ -56,11 +52,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
56 return tg; 52 return tg;
57} 53}
58 54
59#ifdef CONFIG_SCHED_DEBUG
60static inline int autogroup_path(struct task_group *tg, char *buf, int buflen) 55static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
61{ 56{
62 return 0; 57 return 0;
63} 58}
64#endif
65 59
66#endif /* CONFIG_SCHED_AUTOGROUP */ 60#endif /* CONFIG_SCHED_AUTOGROUP */
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index e086babe6c61..10c83e73837a 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * sched_clock for unstable cpu clocks 2 * sched_clock() for unstable CPU clocks
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
5 * 5 *
@@ -11,7 +11,7 @@
11 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
12 * 12 *
13 * 13 *
14 * What: 14 * What this file implements:
15 * 15 *
16 * cpu_clock(i) provides a fast (execution time) high resolution 16 * cpu_clock(i) provides a fast (execution time) high resolution
17 * clock with bounded drift between CPUs. The value of cpu_clock(i) 17 * clock with bounded drift between CPUs. The value of cpu_clock(i)
@@ -26,11 +26,11 @@
26 * at 0 on boot (but people really shouldn't rely on that). 26 * at 0 on boot (but people really shouldn't rely on that).
27 * 27 *
28 * cpu_clock(i) -- can be used from any context, including NMI. 28 * cpu_clock(i) -- can be used from any context, including NMI.
29 * local_clock() -- is cpu_clock() on the current cpu. 29 * local_clock() -- is cpu_clock() on the current CPU.
30 * 30 *
31 * sched_clock_cpu(i) 31 * sched_clock_cpu(i)
32 * 32 *
33 * How: 33 * How it is implemented:
34 * 34 *
35 * The implementation either uses sched_clock() when 35 * The implementation either uses sched_clock() when
36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the 36 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, which means in that case the
@@ -52,19 +52,7 @@
52 * that is otherwise invisible (TSC gets stopped). 52 * that is otherwise invisible (TSC gets stopped).
53 * 53 *
54 */ 54 */
55#include <linux/spinlock.h> 55#include "sched.h"
56#include <linux/hardirq.h>
57#include <linux/export.h>
58#include <linux/percpu.h>
59#include <linux/ktime.h>
60#include <linux/sched.h>
61#include <linux/nmi.h>
62#include <linux/sched/clock.h>
63#include <linux/static_key.h>
64#include <linux/workqueue.h>
65#include <linux/compiler.h>
66#include <linux/tick.h>
67#include <linux/init.h>
68 56
69/* 57/*
70 * Scheduler clock - returns current time in nanosec units. 58 * Scheduler clock - returns current time in nanosec units.
@@ -302,21 +290,21 @@ again:
302 * cmpxchg64 below only protects one readout. 290 * cmpxchg64 below only protects one readout.
303 * 291 *
304 * We must reread via sched_clock_local() in the retry case on 292 * We must reread via sched_clock_local() in the retry case on
305 * 32bit as an NMI could use sched_clock_local() via the 293 * 32-bit kernels as an NMI could use sched_clock_local() via the
306 * tracer and hit between the readout of 294 * tracer and hit between the readout of
307 * the low32bit and the high 32bit portion. 295 * the low 32-bit and the high 32-bit portion.
308 */ 296 */
309 this_clock = sched_clock_local(my_scd); 297 this_clock = sched_clock_local(my_scd);
310 /* 298 /*
311 * We must enforce atomic readout on 32bit, otherwise the 299 * We must enforce atomic readout on 32-bit, otherwise the
312 * update on the remote cpu can hit inbetween the readout of 300 * update on the remote CPU can hit inbetween the readout of
313 * the low32bit and the high 32bit portion. 301 * the low 32-bit and the high 32-bit portion.
314 */ 302 */
315 remote_clock = cmpxchg64(&scd->clock, 0, 0); 303 remote_clock = cmpxchg64(&scd->clock, 0, 0);
316#else 304#else
317 /* 305 /*
318 * On 64bit the read of [my]scd->clock is atomic versus the 306 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
319 * update, so we can avoid the above 32bit dance. 307 * update, so we can avoid the above 32-bit dance.
320 */ 308 */
321 sched_clock_local(my_scd); 309 sched_clock_local(my_scd);
322again: 310again:
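The 32-bit comments above lean on one idiom worth spelling out: cmpxchg64(&scd->clock, 0, 0) acts as an atomic 64-bit read, because the store only happens when the value is already 0 (and storing 0 over 0 changes nothing) while the return value is always the current contents. A stand-alone sketch of that idiom, not taken from this commit:

#include <linux/atomic.h>
#include <linux/types.h>

/*
 * Atomically sample a u64 on a 32-bit kernel, where a plain load could
 * tear between the low and high halves.
 */
static u64 atomic_sample64(u64 *val)
{
        /* Writes 0 only if *val is already 0, so *val is never modified. */
        return cmpxchg64(val, 0, 0);
}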
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 0926aef10dad..5d2d56b0817a 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -11,10 +11,7 @@
11 * typically be used for exclusion which gives rise to priority inversion. 11 * typically be used for exclusion which gives rise to priority inversion.
12 * Waiting for completion is a typically sync point, but not an exclusion point. 12 * Waiting for completion is a typically sync point, but not an exclusion point.
13 */ 13 */
14 14#include "sched.h"
15#include <linux/sched/signal.h>
16#include <linux/sched/debug.h>
17#include <linux/completion.h>
18 15
19/** 16/**
20 * complete: - signals a single thread waiting on this completion 17 * complete: - signals a single thread waiting on this completion
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c94895bc5a2c..74e750ffe64f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5,37 +5,11 @@
5 * 5 *
6 * Copyright (C) 1991-2002 Linus Torvalds 6 * Copyright (C) 1991-2002 Linus Torvalds
7 */ 7 */
8#include <linux/sched.h> 8#include "sched.h"
9#include <linux/sched/clock.h>
10#include <uapi/linux/sched/types.h>
11#include <linux/sched/loadavg.h>
12#include <linux/sched/hotplug.h>
13#include <linux/wait_bit.h>
14#include <linux/cpuset.h>
15#include <linux/delayacct.h>
16#include <linux/init_task.h>
17#include <linux/context_tracking.h>
18#include <linux/rcupdate_wait.h>
19#include <linux/compat.h>
20
21#include <linux/blkdev.h>
22#include <linux/kprobes.h>
23#include <linux/mmu_context.h>
24#include <linux/module.h>
25#include <linux/nmi.h>
26#include <linux/prefetch.h>
27#include <linux/profile.h>
28#include <linux/security.h>
29#include <linux/syscalls.h>
30#include <linux/sched/isolation.h>
31 9
32#include <asm/switch_to.h> 10#include <asm/switch_to.h>
33#include <asm/tlb.h> 11#include <asm/tlb.h>
34#ifdef CONFIG_PARAVIRT
35#include <asm/paravirt.h>
36#endif
37 12
38#include "sched.h"
39#include "../workqueue_internal.h" 13#include "../workqueue_internal.h"
40#include "../smpboot.h" 14#include "../smpboot.h"
41 15
@@ -135,7 +109,7 @@ struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
135 * [L] ->on_rq 109 * [L] ->on_rq
136 * RELEASE (rq->lock) 110 * RELEASE (rq->lock)
137 * 111 *
138 * If we observe the old cpu in task_rq_lock, the acquire of 112 * If we observe the old CPU in task_rq_lock, the acquire of
139 * the old rq->lock will fully serialize against the stores. 113 * the old rq->lock will fully serialize against the stores.
140 * 114 *
141 * If we observe the new CPU in task_rq_lock, the acquire will 115 * If we observe the new CPU in task_rq_lock, the acquire will
@@ -333,7 +307,7 @@ void hrtick_start(struct rq *rq, u64 delay)
333} 307}
334#endif /* CONFIG_SMP */ 308#endif /* CONFIG_SMP */
335 309
336static void init_rq_hrtick(struct rq *rq) 310static void hrtick_rq_init(struct rq *rq)
337{ 311{
338#ifdef CONFIG_SMP 312#ifdef CONFIG_SMP
339 rq->hrtick_csd_pending = 0; 313 rq->hrtick_csd_pending = 0;
@@ -351,7 +325,7 @@ static inline void hrtick_clear(struct rq *rq)
351{ 325{
352} 326}
353 327
354static inline void init_rq_hrtick(struct rq *rq) 328static inline void hrtick_rq_init(struct rq *rq)
355{ 329{
356} 330}
357#endif /* CONFIG_SCHED_HRTICK */ 331#endif /* CONFIG_SCHED_HRTICK */
@@ -1457,7 +1431,7 @@ EXPORT_SYMBOL_GPL(kick_process);
1457 * 1431 *
1458 * - cpu_active must be a subset of cpu_online 1432 * - cpu_active must be a subset of cpu_online
1459 * 1433 *
1460 * - on cpu-up we allow per-cpu kthreads on the online && !active cpu, 1434 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
1461 * see __set_cpus_allowed_ptr(). At this point the newly online 1435 * see __set_cpus_allowed_ptr(). At this point the newly online
1462 * CPU isn't yet part of the sched domains, and balancing will not 1436 * CPU isn't yet part of the sched domains, and balancing will not
1463 * see it. 1437 * see it.
@@ -2629,6 +2603,18 @@ static inline void finish_lock_switch(struct rq *rq)
2629 raw_spin_unlock_irq(&rq->lock); 2603 raw_spin_unlock_irq(&rq->lock);
2630} 2604}
2631 2605
2606/*
2607 * NOP if the arch has not defined these:
2608 */
2609
2610#ifndef prepare_arch_switch
2611# define prepare_arch_switch(next) do { } while (0)
2612#endif
2613
2614#ifndef finish_arch_post_lock_switch
2615# define finish_arch_post_lock_switch() do { } while (0)
2616#endif
2617
2632/** 2618/**
2633 * prepare_task_switch - prepare to switch tasks 2619 * prepare_task_switch - prepare to switch tasks
2634 * @rq: the runqueue preparing to switch 2620 * @rq: the runqueue preparing to switch
@@ -3037,7 +3023,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3037 3023
3038#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) 3024#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3039 /* 3025 /*
3040 * 64-bit doesn't need locks to atomically read a 64bit value. 3026 * 64-bit doesn't need locks to atomically read a 64-bit value.
3041 * So we have a optimization chance when the task's delta_exec is 0. 3027 * So we have a optimization chance when the task's delta_exec is 0.
3042 * Reading ->on_cpu is racy, but this is ok. 3028 * Reading ->on_cpu is racy, but this is ok.
3043 * 3029 *
@@ -3096,35 +3082,99 @@ void scheduler_tick(void)
3096 rq->idle_balance = idle_cpu(cpu); 3082 rq->idle_balance = idle_cpu(cpu);
3097 trigger_load_balance(rq); 3083 trigger_load_balance(rq);
3098#endif 3084#endif
3099 rq_last_tick_reset(rq);
3100} 3085}
3101 3086
3102#ifdef CONFIG_NO_HZ_FULL 3087#ifdef CONFIG_NO_HZ_FULL
3103/** 3088
3104 * scheduler_tick_max_deferment 3089struct tick_work {
3105 * 3090 int cpu;
3106 * Keep at least one tick per second when a single 3091 struct delayed_work work;
3107 * active task is running because the scheduler doesn't 3092};
3108 * yet completely support full dynticks environment. 3093
3109 * 3094static struct tick_work __percpu *tick_work_cpu;
3110 * This makes sure that uptime, CFS vruntime, load 3095
3111 * balancing, etc... continue to move forward, even 3096static void sched_tick_remote(struct work_struct *work)
3112 * with a very low granularity.
3113 *
3114 * Return: Maximum deferment in nanoseconds.
3115 */
3116u64 scheduler_tick_max_deferment(void)
3117{ 3097{
3118 struct rq *rq = this_rq(); 3098 struct delayed_work *dwork = to_delayed_work(work);
3119 unsigned long next, now = READ_ONCE(jiffies); 3099 struct tick_work *twork = container_of(dwork, struct tick_work, work);
3100 int cpu = twork->cpu;
3101 struct rq *rq = cpu_rq(cpu);
3102 struct rq_flags rf;
3120 3103
3121 next = rq->last_sched_tick + HZ; 3104 /*
3105 * Handle the tick only if it appears the remote CPU is running in full
3106 * dynticks mode. The check is racy by nature, but missing a tick or
3107 * having one too much is no big deal because the scheduler tick updates
3108 * statistics and checks timeslices in a time-independent way, regardless
3109 * of when exactly it is running.
3110 */
3111 if (!idle_cpu(cpu) && tick_nohz_tick_stopped_cpu(cpu)) {
3112 struct task_struct *curr;
3113 u64 delta;
3122 3114
3123 if (time_before_eq(next, now)) 3115 rq_lock_irq(rq, &rf);
3124 return 0; 3116 update_rq_clock(rq);
3117 curr = rq->curr;
3118 delta = rq_clock_task(rq) - curr->se.exec_start;
3125 3119
3126 return jiffies_to_nsecs(next - now); 3120 /*
3121 * Make sure the next tick runs within a reasonable
3122 * amount of time.
3123 */
3124 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3);
3125 curr->sched_class->task_tick(rq, curr, 0);
3126 rq_unlock_irq(rq, &rf);
3127 }
3128
3129 /*
3130 * Run the remote tick once per second (1Hz). This arbitrary
3131 * frequency is large enough to avoid overload but short enough
3132 * to keep scheduler internal stats reasonably up to date.
3133 */
3134 queue_delayed_work(system_unbound_wq, dwork, HZ);
3127} 3135}
3136
3137static void sched_tick_start(int cpu)
3138{
3139 struct tick_work *twork;
3140
3141 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3142 return;
3143
3144 WARN_ON_ONCE(!tick_work_cpu);
3145
3146 twork = per_cpu_ptr(tick_work_cpu, cpu);
3147 twork->cpu = cpu;
3148 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
3149 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
3150}
3151
3152#ifdef CONFIG_HOTPLUG_CPU
3153static void sched_tick_stop(int cpu)
3154{
3155 struct tick_work *twork;
3156
3157 if (housekeeping_cpu(cpu, HK_FLAG_TICK))
3158 return;
3159
3160 WARN_ON_ONCE(!tick_work_cpu);
3161
3162 twork = per_cpu_ptr(tick_work_cpu, cpu);
3163 cancel_delayed_work_sync(&twork->work);
3164}
3165#endif /* CONFIG_HOTPLUG_CPU */
3166
3167int __init sched_tick_offload_init(void)
3168{
3169 tick_work_cpu = alloc_percpu(struct tick_work);
3170 BUG_ON(!tick_work_cpu);
3171
3172 return 0;
3173}
3174
3175#else /* !CONFIG_NO_HZ_FULL */
3176static inline void sched_tick_start(int cpu) { }
3177static inline void sched_tick_stop(int cpu) { }
3128#endif 3178#endif
3129 3179
3130#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ 3180#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
@@ -5786,6 +5836,7 @@ int sched_cpu_starting(unsigned int cpu)
5786{ 5836{
5787 set_cpu_rq_start_time(cpu); 5837 set_cpu_rq_start_time(cpu);
5788 sched_rq_cpu_starting(cpu); 5838 sched_rq_cpu_starting(cpu);
5839 sched_tick_start(cpu);
5789 return 0; 5840 return 0;
5790} 5841}
5791 5842
@@ -5797,6 +5848,7 @@ int sched_cpu_dying(unsigned int cpu)
5797 5848
5798 /* Handle pending wakeups and then migrate everything off */ 5849 /* Handle pending wakeups and then migrate everything off */
5799 sched_ttwu_pending(); 5850 sched_ttwu_pending();
5851 sched_tick_stop(cpu);
5800 5852
5801 rq_lock_irqsave(rq, &rf); 5853 rq_lock_irqsave(rq, &rf);
5802 if (rq->rd) { 5854 if (rq->rd) {
@@ -6024,11 +6076,8 @@ void __init sched_init(void)
6024 rq->last_load_update_tick = jiffies; 6076 rq->last_load_update_tick = jiffies;
6025 rq->nohz_flags = 0; 6077 rq->nohz_flags = 0;
6026#endif 6078#endif
6027#ifdef CONFIG_NO_HZ_FULL
6028 rq->last_sched_tick = 0;
6029#endif
6030#endif /* CONFIG_SMP */ 6079#endif /* CONFIG_SMP */
6031 init_rq_hrtick(rq); 6080 hrtick_rq_init(rq);
6032 atomic_set(&rq->nr_iowait, 0); 6081 atomic_set(&rq->nr_iowait, 0);
6033 } 6082 }
6034 6083
@@ -7027,3 +7076,5 @@ const u32 sched_prio_to_wmult[40] = {
7027 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, 7076 /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717,
7028 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, 7077 /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
7029}; 7078};
7079
7080#undef CREATE_TRACE_POINTS
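The remote tick added above is a self-rearming delayed work: sched_tick_start() queues it on system_unbound_wq, sched_tick_remote() does its bookkeeping under rq->lock and re-queues itself HZ jiffies later, and sched_tick_stop() cancels it synchronously on hotplug. A stripped-down sketch of that re-arming pattern, with invented names and none of the scheduler specifics:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static struct delayed_work heartbeat;

static void heartbeat_fn(struct work_struct *work)
{
        struct delayed_work *dwork = to_delayed_work(work);

        /* ... periodic bookkeeping goes here ... */

        /* Re-arm: run again roughly once per second. */
        queue_delayed_work(system_unbound_wq, dwork, HZ);
}

static void heartbeat_start(void)
{
        INIT_DELAYED_WORK(&heartbeat, heartbeat_fn);
        queue_delayed_work(system_unbound_wq, &heartbeat, HZ);
}

static void heartbeat_stop(void)
{
        /* Safe even though the work re-queues itself. */
        cancel_delayed_work_sync(&heartbeat);
}

In this diff the start/stop halves are wired into the hotplug callbacks sched_cpu_starting() and sched_cpu_dying(), as the later hunks show.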
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index 44ab32a4fab6..9fbb10383434 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -1,24 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/cgroup.h>
3#include <linux/slab.h>
4#include <linux/percpu.h>
5#include <linux/spinlock.h>
6#include <linux/cpumask.h>
7#include <linux/seq_file.h>
8#include <linux/rcupdate.h>
9#include <linux/kernel_stat.h>
10#include <linux/err.h>
11
12#include "sched.h"
13
14/* 2/*
15 * CPU accounting code for task groups. 3 * CPU accounting code for task groups.
16 * 4 *
17 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh 5 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
18 * (balbir@in.ibm.com). 6 * (balbir@in.ibm.com).
19 */ 7 */
8#include "sched.h"
20 9
21/* Time spent by the tasks of the cpu accounting group executing in ... */ 10/* Time spent by the tasks of the CPU accounting group executing in ... */
22enum cpuacct_stat_index { 11enum cpuacct_stat_index {
23 CPUACCT_STAT_USER, /* ... user mode */ 12 CPUACCT_STAT_USER, /* ... user mode */
24 CPUACCT_STAT_SYSTEM, /* ... kernel mode */ 13 CPUACCT_STAT_SYSTEM, /* ... kernel mode */
@@ -35,12 +24,12 @@ struct cpuacct_usage {
35 u64 usages[CPUACCT_STAT_NSTATS]; 24 u64 usages[CPUACCT_STAT_NSTATS];
36}; 25};
37 26
38/* track cpu usage of a group of tasks and its child groups */ 27/* track CPU usage of a group of tasks and its child groups */
39struct cpuacct { 28struct cpuacct {
40 struct cgroup_subsys_state css; 29 struct cgroup_subsys_state css;
41 /* cpuusage holds pointer to a u64-type object on every cpu */ 30 /* cpuusage holds pointer to a u64-type object on every CPU */
42 struct cpuacct_usage __percpu *cpuusage; 31 struct cpuacct_usage __percpu *cpuusage;
43 struct kernel_cpustat __percpu *cpustat; 32 struct kernel_cpustat __percpu *cpustat;
44}; 33};
45 34
46static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css) 35static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
@@ -48,7 +37,7 @@ static inline struct cpuacct *css_ca(struct cgroup_subsys_state *css)
48 return css ? container_of(css, struct cpuacct, css) : NULL; 37 return css ? container_of(css, struct cpuacct, css) : NULL;
49} 38}
50 39
51/* return cpu accounting group to which this task belongs */ 40/* Return CPU accounting group to which this task belongs */
52static inline struct cpuacct *task_ca(struct task_struct *tsk) 41static inline struct cpuacct *task_ca(struct task_struct *tsk)
53{ 42{
54 return css_ca(task_css(tsk, cpuacct_cgrp_id)); 43 return css_ca(task_css(tsk, cpuacct_cgrp_id));
@@ -65,7 +54,7 @@ static struct cpuacct root_cpuacct = {
65 .cpuusage = &root_cpuacct_cpuusage, 54 .cpuusage = &root_cpuacct_cpuusage,
66}; 55};
67 56
68/* create a new cpu accounting group */ 57/* Create a new CPU accounting group */
69static struct cgroup_subsys_state * 58static struct cgroup_subsys_state *
70cpuacct_css_alloc(struct cgroup_subsys_state *parent_css) 59cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
71{ 60{
@@ -96,7 +85,7 @@ out:
96 return ERR_PTR(-ENOMEM); 85 return ERR_PTR(-ENOMEM);
97} 86}
98 87
99/* destroy an existing cpu accounting group */ 88/* Destroy an existing CPU accounting group */
100static void cpuacct_css_free(struct cgroup_subsys_state *css) 89static void cpuacct_css_free(struct cgroup_subsys_state *css)
101{ 90{
102 struct cpuacct *ca = css_ca(css); 91 struct cpuacct *ca = css_ca(css);
@@ -162,7 +151,7 @@ static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
162#endif 151#endif
163} 152}
164 153
165/* return total cpu usage (in nanoseconds) of a group */ 154/* Return total CPU usage (in nanoseconds) of a group */
166static u64 __cpuusage_read(struct cgroup_subsys_state *css, 155static u64 __cpuusage_read(struct cgroup_subsys_state *css,
167 enum cpuacct_stat_index index) 156 enum cpuacct_stat_index index)
168{ 157{
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
index 8d9562d890d3..50316455ea66 100644
--- a/kernel/sched/cpudeadline.c
+++ b/kernel/sched/cpudeadline.c
@@ -10,11 +10,7 @@
10 * as published by the Free Software Foundation; version 2 10 * as published by the Free Software Foundation; version 2
11 * of the License. 11 * of the License.
12 */ 12 */
13 13#include "sched.h"
14#include <linux/gfp.h>
15#include <linux/kernel.h>
16#include <linux/slab.h>
17#include "cpudeadline.h"
18 14
19static inline int parent(int i) 15static inline int parent(int i)
20{ 16{
@@ -42,8 +38,9 @@ static void cpudl_heapify_down(struct cpudl *cp, int idx)
42 return; 38 return;
43 39
44 /* adapted from lib/prio_heap.c */ 40 /* adapted from lib/prio_heap.c */
45 while(1) { 41 while (1) {
46 u64 largest_dl; 42 u64 largest_dl;
43
47 l = left_child(idx); 44 l = left_child(idx);
48 r = right_child(idx); 45 r = right_child(idx);
49 largest = idx; 46 largest = idx;
@@ -131,6 +128,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
131 return 1; 128 return 1;
132 } else { 129 } else {
133 int best_cpu = cpudl_maximum(cp); 130 int best_cpu = cpudl_maximum(cp);
131
134 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); 132 WARN_ON(best_cpu != -1 && !cpu_present(best_cpu));
135 133
136 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) && 134 if (cpumask_test_cpu(best_cpu, &p->cpus_allowed) &&
@@ -145,9 +143,9 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
145} 143}
146 144
147/* 145/*
148 * cpudl_clear - remove a cpu from the cpudl max-heap 146 * cpudl_clear - remove a CPU from the cpudl max-heap
149 * @cp: the cpudl max-heap context 147 * @cp: the cpudl max-heap context
150 * @cpu: the target cpu 148 * @cpu: the target CPU
151 * 149 *
152 * Notes: assumes cpu_rq(cpu)->lock is locked 150 * Notes: assumes cpu_rq(cpu)->lock is locked
153 * 151 *
@@ -186,8 +184,8 @@ void cpudl_clear(struct cpudl *cp, int cpu)
186/* 184/*
187 * cpudl_set - update the cpudl max-heap 185 * cpudl_set - update the cpudl max-heap
188 * @cp: the cpudl max-heap context 186 * @cp: the cpudl max-heap context
189 * @cpu: the target cpu 187 * @cpu: the target CPU
190 * @dl: the new earliest deadline for this cpu 188 * @dl: the new earliest deadline for this CPU
191 * 189 *
192 * Notes: assumes cpu_rq(cpu)->lock is locked 190 * Notes: assumes cpu_rq(cpu)->lock is locked
193 * 191 *
@@ -205,6 +203,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
205 old_idx = cp->elements[cpu].idx; 203 old_idx = cp->elements[cpu].idx;
206 if (old_idx == IDX_INVALID) { 204 if (old_idx == IDX_INVALID) {
207 int new_idx = cp->size++; 205 int new_idx = cp->size++;
206
208 cp->elements[new_idx].dl = dl; 207 cp->elements[new_idx].dl = dl;
209 cp->elements[new_idx].cpu = cpu; 208 cp->elements[new_idx].cpu = cpu;
210 cp->elements[cpu].idx = new_idx; 209 cp->elements[cpu].idx = new_idx;
@@ -221,7 +220,7 @@ void cpudl_set(struct cpudl *cp, int cpu, u64 dl)
221/* 220/*
222 * cpudl_set_freecpu - Set the cpudl.free_cpus 221 * cpudl_set_freecpu - Set the cpudl.free_cpus
223 * @cp: the cpudl max-heap context 222 * @cp: the cpudl max-heap context
224 * @cpu: rd attached cpu 223 * @cpu: rd attached CPU
225 */ 224 */
226void cpudl_set_freecpu(struct cpudl *cp, int cpu) 225void cpudl_set_freecpu(struct cpudl *cp, int cpu)
227{ 226{
@@ -231,7 +230,7 @@ void cpudl_set_freecpu(struct cpudl *cp, int cpu)
231/* 230/*
232 * cpudl_clear_freecpu - Clear the cpudl.free_cpus 231 * cpudl_clear_freecpu - Clear the cpudl.free_cpus
233 * @cp: the cpudl max-heap context 232 * @cp: the cpudl max-heap context
234 * @cpu: rd attached cpu 233 * @cpu: rd attached CPU
235 */ 234 */
236void cpudl_clear_freecpu(struct cpudl *cp, int cpu) 235void cpudl_clear_freecpu(struct cpudl *cp, int cpu)
237{ 236{
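cpudl tracks, per CPU, the deadline of the task it is currently running, in an array-backed max-heap (as the cpudl_clear()/cpudl_set() comments say), so the root is the CPU with the latest deadline and cp->elements[cpu].idx records where each CPU sits. The parent()/left_child()/right_child() helpers at the top of the file are not shown in these hunks; the sketch below assumes the conventional 0-based heap index arithmetic.

/* Conventional 0-based binary-heap indexing (assumed, not quoted from the file). */
static inline int heap_parent(int i)            { return (i - 1) >> 1; }
static inline int heap_left_child(int i)        { return (i << 1) + 1; }
static inline int heap_right_child(int i)       { return (i << 1) + 2; }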
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index b010d26e108e..0adeda93b5fb 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -1,35 +1,26 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUDL_H
3#define _LINUX_CPUDL_H
4 2
5#include <linux/sched.h> 3#define IDX_INVALID -1
6#include <linux/sched/deadline.h>
7
8#define IDX_INVALID -1
9 4
10struct cpudl_item { 5struct cpudl_item {
11 u64 dl; 6 u64 dl;
12 int cpu; 7 int cpu;
13 int idx; 8 int idx;
14}; 9};
15 10
16struct cpudl { 11struct cpudl {
17 raw_spinlock_t lock; 12 raw_spinlock_t lock;
18 int size; 13 int size;
19 cpumask_var_t free_cpus; 14 cpumask_var_t free_cpus;
20 struct cpudl_item *elements; 15 struct cpudl_item *elements;
21}; 16};
22 17
23
24#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
25int cpudl_find(struct cpudl *cp, struct task_struct *p, 19int cpudl_find(struct cpudl *cp, struct task_struct *p, struct cpumask *later_mask);
26 struct cpumask *later_mask);
27void cpudl_set(struct cpudl *cp, int cpu, u64 dl); 20void cpudl_set(struct cpudl *cp, int cpu, u64 dl);
28void cpudl_clear(struct cpudl *cp, int cpu); 21void cpudl_clear(struct cpudl *cp, int cpu);
29int cpudl_init(struct cpudl *cp); 22int cpudl_init(struct cpudl *cp);
30void cpudl_set_freecpu(struct cpudl *cp, int cpu); 23void cpudl_set_freecpu(struct cpudl *cp, int cpu);
31void cpudl_clear_freecpu(struct cpudl *cp, int cpu); 24void cpudl_clear_freecpu(struct cpudl *cp, int cpu);
32void cpudl_cleanup(struct cpudl *cp); 25void cpudl_cleanup(struct cpudl *cp);
33#endif /* CONFIG_SMP */ 26#endif /* CONFIG_SMP */
34
35#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..5e54cbcae673 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,7 +8,6 @@
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 */ 10 */
11
12#include "sched.h" 11#include "sched.h"
13 12
14DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); 13DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 7936f548e071..feb5f89020f2 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -11,61 +11,57 @@
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt 12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13 13
14#include <linux/cpufreq.h>
15#include <linux/kthread.h>
16#include <uapi/linux/sched/types.h>
17#include <linux/slab.h>
18#include <trace/events/power.h>
19
20#include "sched.h" 14#include "sched.h"
21 15
16#include <trace/events/power.h>
17
22struct sugov_tunables { 18struct sugov_tunables {
23 struct gov_attr_set attr_set; 19 struct gov_attr_set attr_set;
24 unsigned int rate_limit_us; 20 unsigned int rate_limit_us;
25}; 21};
26 22
27struct sugov_policy { 23struct sugov_policy {
28 struct cpufreq_policy *policy; 24 struct cpufreq_policy *policy;
29 25
30 struct sugov_tunables *tunables; 26 struct sugov_tunables *tunables;
31 struct list_head tunables_hook; 27 struct list_head tunables_hook;
32 28
33 raw_spinlock_t update_lock; /* For shared policies */ 29 raw_spinlock_t update_lock; /* For shared policies */
34 u64 last_freq_update_time; 30 u64 last_freq_update_time;
35 s64 freq_update_delay_ns; 31 s64 freq_update_delay_ns;
36 unsigned int next_freq; 32 unsigned int next_freq;
37 unsigned int cached_raw_freq; 33 unsigned int cached_raw_freq;
38 34
39 /* The next fields are only needed if fast switch cannot be used. */ 35 /* The next fields are only needed if fast switch cannot be used: */
40 struct irq_work irq_work; 36 struct irq_work irq_work;
41 struct kthread_work work; 37 struct kthread_work work;
42 struct mutex work_lock; 38 struct mutex work_lock;
43 struct kthread_worker worker; 39 struct kthread_worker worker;
44 struct task_struct *thread; 40 struct task_struct *thread;
45 bool work_in_progress; 41 bool work_in_progress;
46 42
47 bool need_freq_update; 43 bool need_freq_update;
48}; 44};
49 45
50struct sugov_cpu { 46struct sugov_cpu {
51 struct update_util_data update_util; 47 struct update_util_data update_util;
52 struct sugov_policy *sg_policy; 48 struct sugov_policy *sg_policy;
53 unsigned int cpu; 49 unsigned int cpu;
54 50
55 bool iowait_boost_pending; 51 bool iowait_boost_pending;
56 unsigned int iowait_boost; 52 unsigned int iowait_boost;
57 unsigned int iowait_boost_max; 53 unsigned int iowait_boost_max;
58 u64 last_update; 54 u64 last_update;
59 55
60 /* The fields below are only needed when sharing a policy. */ 56 /* The fields below are only needed when sharing a policy: */
61 unsigned long util_cfs; 57 unsigned long util_cfs;
62 unsigned long util_dl; 58 unsigned long util_dl;
63 unsigned long max; 59 unsigned long max;
64 unsigned int flags; 60 unsigned int flags;
65 61
66 /* The field below is for single-CPU policies only. */ 62 /* The field below is for single-CPU policies only: */
67#ifdef CONFIG_NO_HZ_COMMON 63#ifdef CONFIG_NO_HZ_COMMON
68 unsigned long saved_idle_calls; 64 unsigned long saved_idle_calls;
69#endif 65#endif
70}; 66};
71 67
@@ -79,9 +75,9 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
79 75
80 /* 76 /*
81 * Since cpufreq_update_util() is called with rq->lock held for 77 * Since cpufreq_update_util() is called with rq->lock held for
82 * the @target_cpu, our per-cpu data is fully serialized. 78 * the @target_cpu, our per-CPU data is fully serialized.
83 * 79 *
84 * However, drivers cannot in general deal with cross-cpu 80 * However, drivers cannot in general deal with cross-CPU
85 * requests, so while get_next_freq() will work, our 81 * requests, so while get_next_freq() will work, our
86 * sugov_update_commit() call may not for the fast switching platforms. 82 * sugov_update_commit() call may not for the fast switching platforms.
87 * 83 *
@@ -111,6 +107,7 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
111 } 107 }
112 108
113 delta_ns = time - sg_policy->last_freq_update_time; 109 delta_ns = time - sg_policy->last_freq_update_time;
110
114 return delta_ns >= sg_policy->freq_update_delay_ns; 111 return delta_ns >= sg_policy->freq_update_delay_ns;
115} 112}
116 113
@@ -345,8 +342,8 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
345 return get_next_freq(sg_policy, util, max); 342 return get_next_freq(sg_policy, util, max);
346} 343}
347 344
348static void sugov_update_shared(struct update_util_data *hook, u64 time, 345static void
349 unsigned int flags) 346sugov_update_shared(struct update_util_data *hook, u64 time, unsigned int flags)
350{ 347{
351 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util); 348 struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
352 struct sugov_policy *sg_policy = sg_cpu->sg_policy; 349 struct sugov_policy *sg_policy = sg_cpu->sg_policy;
@@ -423,8 +420,8 @@ static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
423 return sprintf(buf, "%u\n", tunables->rate_limit_us); 420 return sprintf(buf, "%u\n", tunables->rate_limit_us);
424} 421}
425 422
426static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, 423static ssize_t
427 size_t count) 424rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf, size_t count)
428{ 425{
429 struct sugov_tunables *tunables = to_sugov_tunables(attr_set); 426 struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
430 struct sugov_policy *sg_policy; 427 struct sugov_policy *sg_policy;
@@ -479,11 +476,11 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy)
479{ 476{
480 struct task_struct *thread; 477 struct task_struct *thread;
481 struct sched_attr attr = { 478 struct sched_attr attr = {
482 .size = sizeof(struct sched_attr), 479 .size = sizeof(struct sched_attr),
483 .sched_policy = SCHED_DEADLINE, 480 .sched_policy = SCHED_DEADLINE,
484 .sched_flags = SCHED_FLAG_SUGOV, 481 .sched_flags = SCHED_FLAG_SUGOV,
485 .sched_nice = 0, 482 .sched_nice = 0,
486 .sched_priority = 0, 483 .sched_priority = 0,
487 /* 484 /*
488 * Fake (unused) bandwidth; workaround to "fix" 485 * Fake (unused) bandwidth; workaround to "fix"
489 * priority inheritance. 486 * priority inheritance.
@@ -663,21 +660,21 @@ static int sugov_start(struct cpufreq_policy *policy)
663 struct sugov_policy *sg_policy = policy->governor_data; 660 struct sugov_policy *sg_policy = policy->governor_data;
664 unsigned int cpu; 661 unsigned int cpu;
665 662
666 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC; 663 sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
667 sg_policy->last_freq_update_time = 0; 664 sg_policy->last_freq_update_time = 0;
668 sg_policy->next_freq = UINT_MAX; 665 sg_policy->next_freq = UINT_MAX;
669 sg_policy->work_in_progress = false; 666 sg_policy->work_in_progress = false;
670 sg_policy->need_freq_update = false; 667 sg_policy->need_freq_update = false;
671 sg_policy->cached_raw_freq = 0; 668 sg_policy->cached_raw_freq = 0;
672 669
673 for_each_cpu(cpu, policy->cpus) { 670 for_each_cpu(cpu, policy->cpus) {
674 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu); 671 struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
675 672
676 memset(sg_cpu, 0, sizeof(*sg_cpu)); 673 memset(sg_cpu, 0, sizeof(*sg_cpu));
677 sg_cpu->cpu = cpu; 674 sg_cpu->cpu = cpu;
678 sg_cpu->sg_policy = sg_policy; 675 sg_cpu->sg_policy = sg_policy;
679 sg_cpu->flags = 0; 676 sg_cpu->flags = 0;
680 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq; 677 sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
681 } 678 }
682 679
683 for_each_cpu(cpu, policy->cpus) { 680 for_each_cpu(cpu, policy->cpus) {
@@ -721,14 +718,14 @@ static void sugov_limits(struct cpufreq_policy *policy)
721} 718}
722 719
723static struct cpufreq_governor schedutil_gov = { 720static struct cpufreq_governor schedutil_gov = {
724 .name = "schedutil", 721 .name = "schedutil",
725 .owner = THIS_MODULE, 722 .owner = THIS_MODULE,
726 .dynamic_switching = true, 723 .dynamic_switching = true,
727 .init = sugov_init, 724 .init = sugov_init,
728 .exit = sugov_exit, 725 .exit = sugov_exit,
729 .start = sugov_start, 726 .start = sugov_start,
730 .stop = sugov_stop, 727 .stop = sugov_stop,
731 .limits = sugov_limits, 728 .limits = sugov_limits,
732}; 729};
733 730
734#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL 731#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
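Two details of the governor flow appear in the hunks above: sugov_start() derives freq_update_delay_ns once from the rate_limit_us tunable, and sugov_should_update_freq() compares the time since the last update against it before allowing a frequency re-evaluation. Reduced to its essence in the sketch below; the names are invented and the real function has additional checks not shown here.

/* With rate_limit_us == 500, re-evaluation happens at most every 500,000 ns. */
static bool rate_limit_elapsed(u64 time_ns, u64 last_update_ns, unsigned int rate_limit_us)
{
        s64 delay_ns = (s64)rate_limit_us * 1000;       /* NSEC_PER_USEC */

        return (s64)(time_ns - last_update_ns) >= delay_ns;
}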
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 2511aba36b89..daaadf939ccb 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -14,7 +14,7 @@
14 * 14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state 15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with 16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus 17 * a 2 dimensional bitmap (the first for priority class, the second for CPUs
18 * in that class). Therefore a typical application without affinity 18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit 19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a 20 * searches). For tasks with affinity restrictions, the algorithm has a
@@ -26,12 +26,7 @@
26 * as published by the Free Software Foundation; version 2 26 * as published by the Free Software Foundation; version 2
27 * of the License. 27 * of the License.
28 */ 28 */
29 29#include "sched.h"
30#include <linux/gfp.h>
31#include <linux/sched.h>
32#include <linux/sched/rt.h>
33#include <linux/slab.h>
34#include "cpupri.h"
35 30
36/* Convert between a 140 based task->prio, and our 102 based cpupri */ 31/* Convert between a 140 based task->prio, and our 102 based cpupri */
37static int convert_prio(int prio) 32static int convert_prio(int prio)
@@ -128,9 +123,9 @@ int cpupri_find(struct cpupri *cp, struct task_struct *p,
128} 123}
129 124
130/** 125/**
131 * cpupri_set - update the cpu priority setting 126 * cpupri_set - update the CPU priority setting
132 * @cp: The cpupri context 127 * @cp: The cpupri context
133 * @cpu: The target cpu 128 * @cpu: The target CPU
134 * @newpri: The priority (INVALID-RT99) to assign to this CPU 129 * @newpri: The priority (INVALID-RT99) to assign to this CPU
135 * 130 *
136 * Note: Assumes cpu_rq(cpu)->lock is locked 131 * Note: Assumes cpu_rq(cpu)->lock is locked
@@ -151,7 +146,7 @@ void cpupri_set(struct cpupri *cp, int cpu, int newpri)
151 return; 146 return;
152 147
153 /* 148 /*
154 * If the cpu was currently mapped to a different value, we 149 * If the CPU was currently mapped to a different value, we
155 * need to map it to the new value then remove the old value. 150 * need to map it to the new value then remove the old value.
156 * Note, we must add the new value first, otherwise we risk the 151 * Note, we must add the new value first, otherwise we risk the
157 * cpu being missed by the priority loop in cpupri_find. 152 * cpu being missed by the priority loop in cpupri_find.
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index bab050019071..7dc20a3232e7 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,32 +1,25 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2#ifndef _LINUX_CPUPRI_H
3#define _LINUX_CPUPRI_H
4
5#include <linux/sched.h>
6 2
7#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) 3#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
8 4
9#define CPUPRI_INVALID -1 5#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0 6#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1 7#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */ 8/* values 2-101 are RT priorities 0-99 */
13 9
14struct cpupri_vec { 10struct cpupri_vec {
15 atomic_t count; 11 atomic_t count;
16 cpumask_var_t mask; 12 cpumask_var_t mask;
17}; 13};
18 14
19struct cpupri { 15struct cpupri {
20 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES]; 16 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
21 int *cpu_to_pri; 17 int *cpu_to_pri;
22}; 18};
23 19
24#ifdef CONFIG_SMP 20#ifdef CONFIG_SMP
25int cpupri_find(struct cpupri *cp, 21int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask);
26 struct task_struct *p, struct cpumask *lowest_mask);
27void cpupri_set(struct cpupri *cp, int cpu, int pri); 22void cpupri_set(struct cpupri *cp, int cpu, int pri);
28int cpupri_init(struct cpupri *cp); 23int cpupri_init(struct cpupri *cp);
29void cpupri_cleanup(struct cpupri *cp); 24void cpupri_cleanup(struct cpupri *cp);
30#endif 25#endif
31
32#endif /* _LINUX_CPUPRI_H */
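The constants above pin down the 102-entry priority space the cpupri.c comment refers to: -1 is invalid, 0 is idle, 1 is normal, and 2-101 carry the RT priorities 0-99. The conversion from the 140-based task->prio is not part of these hunks, so the following is only an approximate sketch of the mapping those constants imply (MAX_PRIO and MAX_RT_PRIO come from <linux/sched/prio.h>):

/* Approximate sketch: map a 140-based task->prio onto the 102-based cpupri scale. */
static int convert_prio_sketch(int prio)
{
        if (prio == CPUPRI_INVALID)
                return CPUPRI_INVALID;
        if (prio == MAX_PRIO)           /* the idle task */
                return CPUPRI_IDLE;
        if (prio >= MAX_RT_PRIO)        /* normal (fair-class) tasks */
                return CPUPRI_NORMAL;

        /* RT: prio 99 (lowest RT) -> 2 ... prio 0 (highest RT) -> 101 */
        return MAX_RT_PRIO - prio + 1;
}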
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index bac6ac9a4ec7..0796f938c4f0 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,6 @@
1#include <linux/export.h> 1/*
2#include <linux/sched.h> 2 * Simple CPU accounting cgroup controller
3#include <linux/tsacct_kern.h> 3 */
4#include <linux/kernel_stat.h>
5#include <linux/static_key.h>
6#include <linux/context_tracking.h>
7#include <linux/sched/cputime.h>
8#include "sched.h" 4#include "sched.h"
9 5
10#ifdef CONFIG_IRQ_TIME_ACCOUNTING 6#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -113,9 +109,9 @@ static inline void task_group_account_field(struct task_struct *p, int index,
113} 109}
114 110
115/* 111/*
116 * Account user cpu time to a process. 112 * Account user CPU time to a process.
117 * @p: the process that the cpu time gets accounted to 113 * @p: the process that the CPU time gets accounted to
118 * @cputime: the cpu time spent in user space since the last update 114 * @cputime: the CPU time spent in user space since the last update
119 */ 115 */
120void account_user_time(struct task_struct *p, u64 cputime) 116void account_user_time(struct task_struct *p, u64 cputime)
121{ 117{
@@ -135,9 +131,9 @@ void account_user_time(struct task_struct *p, u64 cputime)
135} 131}
136 132
137/* 133/*
138 * Account guest cpu time to a process. 134 * Account guest CPU time to a process.
139 * @p: the process that the cpu time gets accounted to 135 * @p: the process that the CPU time gets accounted to
140 * @cputime: the cpu time spent in virtual machine since the last update 136 * @cputime: the CPU time spent in virtual machine since the last update
141 */ 137 */
142void account_guest_time(struct task_struct *p, u64 cputime) 138void account_guest_time(struct task_struct *p, u64 cputime)
143{ 139{
@@ -159,9 +155,9 @@ void account_guest_time(struct task_struct *p, u64 cputime)
159} 155}
160 156
161/* 157/*
162 * Account system cpu time to a process and desired cpustat field 158 * Account system CPU time to a process and desired cpustat field
163 * @p: the process that the cpu time gets accounted to 159 * @p: the process that the CPU time gets accounted to
164 * @cputime: the cpu time spent in kernel space since the last update 160 * @cputime: the CPU time spent in kernel space since the last update
165 * @index: pointer to cpustat field that has to be updated 161 * @index: pointer to cpustat field that has to be updated
166 */ 162 */
167void account_system_index_time(struct task_struct *p, 163void account_system_index_time(struct task_struct *p,
@@ -179,10 +175,10 @@ void account_system_index_time(struct task_struct *p,
179} 175}
180 176
181/* 177/*
182 * Account system cpu time to a process. 178 * Account system CPU time to a process.
183 * @p: the process that the cpu time gets accounted to 179 * @p: the process that the CPU time gets accounted to
184 * @hardirq_offset: the offset to subtract from hardirq_count() 180 * @hardirq_offset: the offset to subtract from hardirq_count()
185 * @cputime: the cpu time spent in kernel space since the last update 181 * @cputime: the CPU time spent in kernel space since the last update
186 */ 182 */
187void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime) 183void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
188{ 184{
@@ -205,7 +201,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset, u64 cputime)
205 201
206/* 202/*
207 * Account for involuntary wait time. 203 * Account for involuntary wait time.
208 * @cputime: the cpu time spent in involuntary wait 204 * @cputime: the CPU time spent in involuntary wait
209 */ 205 */
210void account_steal_time(u64 cputime) 206void account_steal_time(u64 cputime)
211{ 207{
@@ -216,7 +212,7 @@ void account_steal_time(u64 cputime)
216 212
217/* 213/*
218 * Account for idle time. 214 * Account for idle time.
219 * @cputime: the cpu time spent in idle wait 215 * @cputime: the CPU time spent in idle wait
220 */ 216 */
221void account_idle_time(u64 cputime) 217void account_idle_time(u64 cputime)
222{ 218{
@@ -338,7 +334,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
338#ifdef CONFIG_IRQ_TIME_ACCOUNTING 334#ifdef CONFIG_IRQ_TIME_ACCOUNTING
339/* 335/*
340 * Account a tick to a process and cpustat 336 * Account a tick to a process and cpustat
341 * @p: the process that the cpu time gets accounted to 337 * @p: the process that the CPU time gets accounted to
342 * @user_tick: is the tick from userspace 338 * @user_tick: is the tick from userspace
343 * @rq: the pointer to rq 339 * @rq: the pointer to rq
344 * 340 *
@@ -400,17 +396,16 @@ static void irqtime_account_idle_ticks(int ticks)
400 irqtime_account_process_tick(current, 0, rq, ticks); 396 irqtime_account_process_tick(current, 0, rq, ticks);
401} 397}
402#else /* CONFIG_IRQ_TIME_ACCOUNTING */ 398#else /* CONFIG_IRQ_TIME_ACCOUNTING */
403static inline void irqtime_account_idle_ticks(int ticks) {} 399static inline void irqtime_account_idle_ticks(int ticks) { }
404static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick, 400static inline void irqtime_account_process_tick(struct task_struct *p, int user_tick,
405 struct rq *rq, int nr_ticks) {} 401 struct rq *rq, int nr_ticks) { }
406#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ 402#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
407 403
408/* 404/*
409 * Use precise platform statistics if available: 405 * Use precise platform statistics if available:
410 */ 406 */
411#ifdef CONFIG_VIRT_CPU_ACCOUNTING 407#ifdef CONFIG_VIRT_CPU_ACCOUNTING
412 408# ifndef __ARCH_HAS_VTIME_TASK_SWITCH
413#ifndef __ARCH_HAS_VTIME_TASK_SWITCH
414void vtime_common_task_switch(struct task_struct *prev) 409void vtime_common_task_switch(struct task_struct *prev)
415{ 410{
416 if (is_idle_task(prev)) 411 if (is_idle_task(prev))
@@ -421,8 +416,7 @@ void vtime_common_task_switch(struct task_struct *prev)
421 vtime_flush(prev); 416 vtime_flush(prev);
422 arch_vtime_task_switch(prev); 417 arch_vtime_task_switch(prev);
423} 418}
424#endif 419# endif
425
426#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ 420#endif /* CONFIG_VIRT_CPU_ACCOUNTING */
427 421
428 422
@@ -469,10 +463,12 @@ void thread_group_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st)
469 *ut = cputime.utime; 463 *ut = cputime.utime;
470 *st = cputime.stime; 464 *st = cputime.stime;
471} 465}
472#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ 466
467#else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE: */
468
473/* 469/*
474 * Account a single tick of cpu time. 470 * Account a single tick of CPU time.
475 * @p: the process that the cpu time gets accounted to 471 * @p: the process that the CPU time gets accounted to
476 * @user_tick: indicates if the tick is a user or a system tick 472 * @user_tick: indicates if the tick is a user or a system tick
477 */ 473 */
478void account_process_tick(struct task_struct *p, int user_tick) 474void account_process_tick(struct task_struct *p, int user_tick)
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 9df09782025c..8b7c2b35bec9 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -17,9 +17,6 @@
17 */ 17 */
18#include "sched.h" 18#include "sched.h"
19 19
20#include <linux/slab.h>
21#include <uapi/linux/sched/types.h>
22
23struct dl_bandwidth def_dl_bandwidth; 20struct dl_bandwidth def_dl_bandwidth;
24 21
25static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) 22static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -514,7 +511,7 @@ static DEFINE_PER_CPU(struct callback_head, dl_pull_head);
514static void push_dl_tasks(struct rq *); 511static void push_dl_tasks(struct rq *);
515static void pull_dl_task(struct rq *); 512static void pull_dl_task(struct rq *);
516 513
517static inline void queue_push_tasks(struct rq *rq) 514static inline void deadline_queue_push_tasks(struct rq *rq)
518{ 515{
519 if (!has_pushable_dl_tasks(rq)) 516 if (!has_pushable_dl_tasks(rq))
520 return; 517 return;
@@ -522,7 +519,7 @@ static inline void queue_push_tasks(struct rq *rq)
522 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks); 519 queue_balance_callback(rq, &per_cpu(dl_push_head, rq->cpu), push_dl_tasks);
523} 520}
524 521
525static inline void queue_pull_task(struct rq *rq) 522static inline void deadline_queue_pull_task(struct rq *rq)
526{ 523{
527 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task); 524 queue_balance_callback(rq, &per_cpu(dl_pull_head, rq->cpu), pull_dl_task);
528} 525}
@@ -539,12 +536,12 @@ static struct rq *dl_task_offline_migration(struct rq *rq, struct task_struct *p
539 536
540 /* 537 /*
541 * If we cannot preempt any rq, fall back to pick any 538 * If we cannot preempt any rq, fall back to pick any
542 * online cpu. 539 * online CPU:
543 */ 540 */
544 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed); 541 cpu = cpumask_any_and(cpu_active_mask, &p->cpus_allowed);
545 if (cpu >= nr_cpu_ids) { 542 if (cpu >= nr_cpu_ids) {
546 /* 543 /*
547 * Fail to find any suitable cpu. 544 * Failed to find any suitable CPU.
548 * The task will never come back! 545 * The task will never come back!
549 */ 546 */
550 BUG_ON(dl_bandwidth_enabled()); 547 BUG_ON(dl_bandwidth_enabled());
@@ -597,19 +594,18 @@ static inline void pull_dl_task(struct rq *rq)
597{ 594{
598} 595}
599 596
600static inline void queue_push_tasks(struct rq *rq) 597static inline void deadline_queue_push_tasks(struct rq *rq)
601{ 598{
602} 599}
603 600
604static inline void queue_pull_task(struct rq *rq) 601static inline void deadline_queue_pull_task(struct rq *rq)
605{ 602{
606} 603}
607#endif /* CONFIG_SMP */ 604#endif /* CONFIG_SMP */
608 605
609static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); 606static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags);
610static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); 607static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags);
611static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, 608static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags);
612 int flags);
613 609
614/* 610/*
615 * We are being explicitly informed that a new instance is starting, 611 * We are being explicitly informed that a new instance is starting,
@@ -1763,7 +1759,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1763 if (hrtick_enabled(rq)) 1759 if (hrtick_enabled(rq))
1764 start_hrtick_dl(rq, p); 1760 start_hrtick_dl(rq, p);
1765 1761
1766 queue_push_tasks(rq); 1762 deadline_queue_push_tasks(rq);
1767 1763
1768 return p; 1764 return p;
1769} 1765}
@@ -1776,6 +1772,14 @@ static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
1776 enqueue_pushable_dl_task(rq, p); 1772 enqueue_pushable_dl_task(rq, p);
1777} 1773}
1778 1774
1775/*
1776 * scheduler tick hitting a task of our scheduling class.
1777 *
1778 * NOTE: This function can be called remotely by the tick offload that
1779 * goes along full dynticks. Therefore no local assumption can be made
1780 * and everything must be accessed through the @rq and @curr passed in
1781 * parameters.
1782 */
1779static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued) 1783static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
1780{ 1784{
1781 update_curr_dl(rq); 1785 update_curr_dl(rq);
@@ -1865,7 +1869,7 @@ static int find_later_rq(struct task_struct *task)
1865 1869
1866 /* 1870 /*
1867 * We have to consider system topology and task affinity 1871 * We have to consider system topology and task affinity
1868 * first, then we can look for a suitable cpu. 1872 * first, then we can look for a suitable CPU.
1869 */ 1873 */
1870 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask)) 1874 if (!cpudl_find(&task_rq(task)->rd->cpudl, task, later_mask))
1871 return -1; 1875 return -1;
@@ -1879,7 +1883,7 @@ static int find_later_rq(struct task_struct *task)
1879 * Now we check how well this matches with task's 1883 * Now we check how well this matches with task's
1880 * affinity and system topology. 1884 * affinity and system topology.
1881 * 1885 *
1882 * The last cpu where the task run is our first 1886 * The last CPU where the task run is our first
1883 * guess, since it is most likely cache-hot there. 1887 * guess, since it is most likely cache-hot there.
1884 */ 1888 */
1885 if (cpumask_test_cpu(cpu, later_mask)) 1889 if (cpumask_test_cpu(cpu, later_mask))
@@ -1909,9 +1913,9 @@ static int find_later_rq(struct task_struct *task)
1909 best_cpu = cpumask_first_and(later_mask, 1913 best_cpu = cpumask_first_and(later_mask,
1910 sched_domain_span(sd)); 1914 sched_domain_span(sd));
1911 /* 1915 /*
1912 * Last chance: if a cpu being in both later_mask 1916 * Last chance: if a CPU being in both later_mask
1913 * and current sd span is valid, that becomes our 1917 * and current sd span is valid, that becomes our
1914 * choice. Of course, the latest possible cpu is 1918 * choice. Of course, the latest possible CPU is
1915 * already under consideration through later_mask. 1919 * already under consideration through later_mask.
1916 */ 1920 */
1917 if (best_cpu < nr_cpu_ids) { 1921 if (best_cpu < nr_cpu_ids) {
@@ -2067,7 +2071,7 @@ retry:
2067 if (task == next_task) { 2071 if (task == next_task) {
2068 /* 2072 /*
2069 * The task is still there. We don't try 2073 * The task is still there. We don't try
2070 * again, some other cpu will pull it when ready. 2074 * again, some other CPU will pull it when ready.
2071 */ 2075 */
2072 goto out; 2076 goto out;
2073 } 2077 }
@@ -2300,12 +2304,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
2300 /* 2304 /*
2301 * Since this might be the only -deadline task on the rq, 2305 * Since this might be the only -deadline task on the rq,
2302 * this is the right place to try to pull some other one 2306 * this is the right place to try to pull some other one
2303 * from an overloaded cpu, if any. 2307 * from an overloaded CPU, if any.
2304 */ 2308 */
2305 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running) 2309 if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
2306 return; 2310 return;
2307 2311
2308 queue_pull_task(rq); 2312 deadline_queue_pull_task(rq);
2309} 2313}
2310 2314
2311/* 2315/*
@@ -2327,7 +2331,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
2327 if (rq->curr != p) { 2331 if (rq->curr != p) {
2328#ifdef CONFIG_SMP 2332#ifdef CONFIG_SMP
2329 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) 2333 if (p->nr_cpus_allowed > 1 && rq->dl.overloaded)
2330 queue_push_tasks(rq); 2334 deadline_queue_push_tasks(rq);
2331#endif 2335#endif
2332 if (dl_task(rq->curr)) 2336 if (dl_task(rq->curr))
2333 check_preempt_curr_dl(rq, p, 0); 2337 check_preempt_curr_dl(rq, p, 0);
@@ -2352,7 +2356,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
2352 * or lowering its prio, so... 2356 * or lowering its prio, so...
2353 */ 2357 */
2354 if (!rq->dl.overloaded) 2358 if (!rq->dl.overloaded)
2355 queue_pull_task(rq); 2359 deadline_queue_pull_task(rq);
2356 2360
2357 /* 2361 /*
2358 * If we now have an earlier deadline task than p, 2362 * If we now have an earlier deadline task than p,
@@ -2626,17 +2630,17 @@ void __dl_clear_params(struct task_struct *p)
2626{ 2630{
2627 struct sched_dl_entity *dl_se = &p->dl; 2631 struct sched_dl_entity *dl_se = &p->dl;
2628 2632
2629 dl_se->dl_runtime = 0; 2633 dl_se->dl_runtime = 0;
2630 dl_se->dl_deadline = 0; 2634 dl_se->dl_deadline = 0;
2631 dl_se->dl_period = 0; 2635 dl_se->dl_period = 0;
2632 dl_se->flags = 0; 2636 dl_se->flags = 0;
2633 dl_se->dl_bw = 0; 2637 dl_se->dl_bw = 0;
2634 dl_se->dl_density = 0; 2638 dl_se->dl_density = 0;
2635 2639
2636 dl_se->dl_throttled = 0; 2640 dl_se->dl_throttled = 0;
2637 dl_se->dl_yielded = 0; 2641 dl_se->dl_yielded = 0;
2638 dl_se->dl_non_contending = 0; 2642 dl_se->dl_non_contending = 0;
2639 dl_se->dl_overrun = 0; 2643 dl_se->dl_overrun = 0;
2640} 2644}
2641 2645
2642bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr) 2646bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
@@ -2655,21 +2659,22 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
2655#ifdef CONFIG_SMP 2659#ifdef CONFIG_SMP
2656int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed) 2660int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed)
2657{ 2661{
2658 unsigned int dest_cpu = cpumask_any_and(cpu_active_mask, 2662 unsigned int dest_cpu;
2659 cs_cpus_allowed);
2660 struct dl_bw *dl_b; 2663 struct dl_bw *dl_b;
2661 bool overflow; 2664 bool overflow;
2662 int cpus, ret; 2665 int cpus, ret;
2663 unsigned long flags; 2666 unsigned long flags;
2664 2667
2668 dest_cpu = cpumask_any_and(cpu_active_mask, cs_cpus_allowed);
2669
2665 rcu_read_lock_sched(); 2670 rcu_read_lock_sched();
2666 dl_b = dl_bw_of(dest_cpu); 2671 dl_b = dl_bw_of(dest_cpu);
2667 raw_spin_lock_irqsave(&dl_b->lock, flags); 2672 raw_spin_lock_irqsave(&dl_b->lock, flags);
2668 cpus = dl_bw_cpus(dest_cpu); 2673 cpus = dl_bw_cpus(dest_cpu);
2669 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw); 2674 overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
2670 if (overflow) 2675 if (overflow) {
2671 ret = -EBUSY; 2676 ret = -EBUSY;
2672 else { 2677 } else {
2673 /* 2678 /*
2674 * We reserve space for this task in the destination 2679 * We reserve space for this task in the destination
2675 * root_domain, as we can't fail after this point. 2680 * root_domain, as we can't fail after this point.
@@ -2681,6 +2686,7 @@ int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allo
2681 } 2686 }
2682 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2687 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2683 rcu_read_unlock_sched(); 2688 rcu_read_unlock_sched();
2689
2684 return ret; 2690 return ret;
2685} 2691}
2686 2692
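
The admission check in dl_task_can_attach() above boils down to comparing the task's deadline bandwidth against what the destination root domain still has free across its CPUs. The following is a minimal userspace sketch of that arithmetic; the struct and helper are illustrative stand-ins for the kernel's dl_bw bookkeeping and __dl_overflow() test, and the numbers are made up.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified view of a root domain's deadline-bandwidth accounting. */
struct dl_bw_sketch {
        int64_t bw;        /* per-CPU bandwidth limit, -1 == no limit   */
        int64_t total_bw;  /* bandwidth already admitted on this domain */
};

/* Would admitting 'new_bw' exceed the capacity of 'cpus' CPUs? */
static bool dl_overflow_sketch(const struct dl_bw_sketch *b, int cpus, int64_t new_bw)
{
        return b->bw != -1 && b->bw * cpus < b->total_bw + new_bw;
}

int main(void)
{
        struct dl_bw_sketch b = { .bw = 950000, .total_bw = 1800000 };

        /* Two CPUs give 1900000 units of capacity; a 150000 request overflows. */
        printf("overflow: %d\n", dl_overflow_sketch(&b, 2, 150000));
        return 0;
}
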
@@ -2701,6 +2707,7 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
2701 ret = 0; 2707 ret = 0;
2702 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags); 2708 raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
2703 rcu_read_unlock_sched(); 2709 rcu_read_unlock_sched();
2710
2704 return ret; 2711 return ret;
2705} 2712}
2706 2713
@@ -2718,6 +2725,7 @@ bool dl_cpu_busy(unsigned int cpu)
2718 overflow = __dl_overflow(dl_b, cpus, 0, 0); 2725 overflow = __dl_overflow(dl_b, cpus, 0, 0);
2719 raw_spin_unlock_irqrestore(&dl_b->lock, flags); 2726 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
2720 rcu_read_unlock_sched(); 2727 rcu_read_unlock_sched();
2728
2721 return overflow; 2729 return overflow;
2722} 2730}
2723#endif 2731#endif
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 72c401b3b15c..99e825b76633 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * kernel/sched/debug.c 2 * kernel/sched/debug.c
3 * 3 *
4 * Print the CFS rbtree 4 * Print the CFS rbtree and other debugging details
5 * 5 *
6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar 6 * Copyright(C) 2007, Red Hat, Inc., Ingo Molnar
7 * 7 *
@@ -9,16 +9,6 @@
9 * it under the terms of the GNU General Public License version 2 as 9 * it under the terms of the GNU General Public License version 2 as
10 * published by the Free Software Foundation. 10 * published by the Free Software Foundation.
11 */ 11 */
12
13#include <linux/proc_fs.h>
14#include <linux/sched/mm.h>
15#include <linux/sched/task.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/utsname.h>
19#include <linux/mempolicy.h>
20#include <linux/debugfs.h>
21
22#include "sched.h" 12#include "sched.h"
23 13
24static DEFINE_SPINLOCK(sched_debug_lock); 14static DEFINE_SPINLOCK(sched_debug_lock);
@@ -274,34 +264,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
274 if (table == NULL) 264 if (table == NULL)
275 return NULL; 265 return NULL;
276 266
277 set_table_entry(&table[0], "min_interval", &sd->min_interval, 267 set_table_entry(&table[0] , "min_interval", &sd->min_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
278 sizeof(long), 0644, proc_doulongvec_minmax, false); 268 set_table_entry(&table[1] , "max_interval", &sd->max_interval, sizeof(long), 0644, proc_doulongvec_minmax, false);
279 set_table_entry(&table[1], "max_interval", &sd->max_interval, 269 set_table_entry(&table[2] , "busy_idx", &sd->busy_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
280 sizeof(long), 0644, proc_doulongvec_minmax, false); 270 set_table_entry(&table[3] , "idle_idx", &sd->idle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
281 set_table_entry(&table[2], "busy_idx", &sd->busy_idx, 271 set_table_entry(&table[4] , "newidle_idx", &sd->newidle_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
282 sizeof(int), 0644, proc_dointvec_minmax, true); 272 set_table_entry(&table[5] , "wake_idx", &sd->wake_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
283 set_table_entry(&table[3], "idle_idx", &sd->idle_idx, 273 set_table_entry(&table[6] , "forkexec_idx", &sd->forkexec_idx, sizeof(int) , 0644, proc_dointvec_minmax, true );
284 sizeof(int), 0644, proc_dointvec_minmax, true); 274 set_table_entry(&table[7] , "busy_factor", &sd->busy_factor, sizeof(int) , 0644, proc_dointvec_minmax, false);
285 set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, 275 set_table_entry(&table[8] , "imbalance_pct", &sd->imbalance_pct, sizeof(int) , 0644, proc_dointvec_minmax, false);
286 sizeof(int), 0644, proc_dointvec_minmax, true); 276 set_table_entry(&table[9] , "cache_nice_tries", &sd->cache_nice_tries, sizeof(int) , 0644, proc_dointvec_minmax, false);
287 set_table_entry(&table[5], "wake_idx", &sd->wake_idx, 277 set_table_entry(&table[10], "flags", &sd->flags, sizeof(int) , 0644, proc_dointvec_minmax, false);
288 sizeof(int), 0644, proc_dointvec_minmax, true); 278 set_table_entry(&table[11], "max_newidle_lb_cost", &sd->max_newidle_lb_cost, sizeof(long), 0644, proc_doulongvec_minmax, false);
289 set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, 279 set_table_entry(&table[12], "name", sd->name, CORENAME_MAX_SIZE, 0444, proc_dostring, false);
290 sizeof(int), 0644, proc_dointvec_minmax, true);
291 set_table_entry(&table[7], "busy_factor", &sd->busy_factor,
292 sizeof(int), 0644, proc_dointvec_minmax, false);
293 set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct,
294 sizeof(int), 0644, proc_dointvec_minmax, false);
295 set_table_entry(&table[9], "cache_nice_tries",
296 &sd->cache_nice_tries,
297 sizeof(int), 0644, proc_dointvec_minmax, false);
298 set_table_entry(&table[10], "flags", &sd->flags,
299 sizeof(int), 0644, proc_dointvec_minmax, false);
300 set_table_entry(&table[11], "max_newidle_lb_cost",
301 &sd->max_newidle_lb_cost,
302 sizeof(long), 0644, proc_doulongvec_minmax, false);
303 set_table_entry(&table[12], "name", sd->name,
304 CORENAME_MAX_SIZE, 0444, proc_dostring, false);
305 /* &table[13] is terminator */ 280 /* &table[13] is terminator */
306 281
307 return table; 282 return table;
@@ -332,8 +307,8 @@ static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu)
332 return table; 307 return table;
333} 308}
334 309
335static cpumask_var_t sd_sysctl_cpus; 310static cpumask_var_t sd_sysctl_cpus;
336static struct ctl_table_header *sd_sysctl_header; 311static struct ctl_table_header *sd_sysctl_header;
337 312
338void register_sched_domain_sysctl(void) 313void register_sched_domain_sysctl(void)
339{ 314{
@@ -413,14 +388,10 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
413{ 388{
414 struct sched_entity *se = tg->se[cpu]; 389 struct sched_entity *se = tg->se[cpu];
415 390
416#define P(F) \ 391#define P(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F)
417 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)F) 392#define P_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F))
418#define P_SCHEDSTAT(F) \ 393#define PN(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
419 SEQ_printf(m, " .%-30s: %lld\n", #F, (long long)schedstat_val(F)) 394#define PN_SCHEDSTAT(F) SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
420#define PN(F) \
421 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)F))
422#define PN_SCHEDSTAT(F) \
423 SEQ_printf(m, " .%-30s: %lld.%06ld\n", #F, SPLIT_NS((long long)schedstat_val(F)))
424 395
425 if (!se) 396 if (!se)
426 return; 397 return;
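
The PN()/PN_SCHEDSTAT() macros condensed above print a nanosecond counter as "whole.fraction". A hedged userspace sketch of that split follows, assuming the usual divide-by-1e6 factoring that matches the "%lld.%06ld" format; it is not the kernel's SPLIT_NS()/nsec_high()/nsec_low() code.

#include <stdio.h>

/* Split a nanosecond counter the way the "%lld.%06ld" format expects. */
static void print_ns(const char *name, unsigned long long nsec)
{
        long long high = (long long)(nsec / 1000000ULL);  /* whole part   */
        long rem = (long)(nsec % 1000000ULL);             /* 6-digit tail */

        printf(" .%-30s: %lld.%06ld\n", name, high, rem);
}

int main(void)
{
        print_ns("se->sum_exec_runtime", 123456789ULL);   /* prints 123.456789 */
        return 0;
}
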
@@ -428,6 +399,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
428 PN(se->exec_start); 399 PN(se->exec_start);
429 PN(se->vruntime); 400 PN(se->vruntime);
430 PN(se->sum_exec_runtime); 401 PN(se->sum_exec_runtime);
402
431 if (schedstat_enabled()) { 403 if (schedstat_enabled()) {
432 PN_SCHEDSTAT(se->statistics.wait_start); 404 PN_SCHEDSTAT(se->statistics.wait_start);
433 PN_SCHEDSTAT(se->statistics.sleep_start); 405 PN_SCHEDSTAT(se->statistics.sleep_start);
@@ -440,6 +412,7 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group
440 PN_SCHEDSTAT(se->statistics.wait_sum); 412 PN_SCHEDSTAT(se->statistics.wait_sum);
441 P_SCHEDSTAT(se->statistics.wait_count); 413 P_SCHEDSTAT(se->statistics.wait_count);
442 } 414 }
415
443 P(se->load.weight); 416 P(se->load.weight);
444 P(se->runnable_weight); 417 P(se->runnable_weight);
445#ifdef CONFIG_SMP 418#ifdef CONFIG_SMP
@@ -464,6 +437,7 @@ static char *task_group_path(struct task_group *tg)
464 return group_path; 437 return group_path;
465 438
466 cgroup_path(tg->css.cgroup, group_path, PATH_MAX); 439 cgroup_path(tg->css.cgroup, group_path, PATH_MAX);
440
467 return group_path; 441 return group_path;
468} 442}
469#endif 443#endif
@@ -804,9 +778,9 @@ void sysrq_sched_debug_show(void)
804/* 778/*
805 * This iterator needs some explanation. 779 * This iterator needs some explanation.
806 * It returns 1 for the header position. 780 * It returns 1 for the header position.
807 * This means 2 is cpu 0. 781 * This means 2 is CPU 0.
808 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 782 * In a hotplugged system some CPUs, including CPU 0, may be missing so we have
809 * to use cpumask_* to iterate over the cpus. 783 * to use cpumask_* to iterate over the CPUs.
810 */ 784 */
811static void *sched_debug_start(struct seq_file *file, loff_t *offset) 785static void *sched_debug_start(struct seq_file *file, loff_t *offset)
812{ 786{
@@ -826,6 +800,7 @@ static void *sched_debug_start(struct seq_file *file, loff_t *offset)
826 800
827 if (n < nr_cpu_ids) 801 if (n < nr_cpu_ids)
828 return (void *)(unsigned long)(n + 2); 802 return (void *)(unsigned long)(n + 2);
803
829 return NULL; 804 return NULL;
830} 805}
831 806
@@ -840,10 +815,10 @@ static void sched_debug_stop(struct seq_file *file, void *data)
840} 815}
841 816
842static const struct seq_operations sched_debug_sops = { 817static const struct seq_operations sched_debug_sops = {
843 .start = sched_debug_start, 818 .start = sched_debug_start,
844 .next = sched_debug_next, 819 .next = sched_debug_next,
845 .stop = sched_debug_stop, 820 .stop = sched_debug_stop,
846 .show = sched_debug_show, 821 .show = sched_debug_show,
847}; 822};
848 823
849static int sched_debug_release(struct inode *inode, struct file *file) 824static int sched_debug_release(struct inode *inode, struct file *file)
@@ -881,14 +856,10 @@ static int __init init_sched_debug_procfs(void)
881 856
882__initcall(init_sched_debug_procfs); 857__initcall(init_sched_debug_procfs);
883 858
884#define __P(F) \ 859#define __P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F)
885 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)F) 860#define P(F) SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F)
886#define P(F) \ 861#define __PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
887 SEQ_printf(m, "%-45s:%21Ld\n", #F, (long long)p->F) 862#define PN(F) SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
888#define __PN(F) \
889 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)F))
890#define PN(F) \
891 SEQ_printf(m, "%-45s:%14Ld.%06ld\n", #F, SPLIT_NS((long long)p->F))
892 863
893 864
894#ifdef CONFIG_NUMA_BALANCING 865#ifdef CONFIG_NUMA_BALANCING
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 5eb3ffc9be84..f5591071ae98 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -20,25 +20,10 @@
20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra 20 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra 21 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22 */ 22 */
23 23#include "sched.h"
24#include <linux/sched/mm.h>
25#include <linux/sched/topology.h>
26
27#include <linux/latencytop.h>
28#include <linux/cpumask.h>
29#include <linux/cpuidle.h>
30#include <linux/slab.h>
31#include <linux/profile.h>
32#include <linux/interrupt.h>
33#include <linux/mempolicy.h>
34#include <linux/migrate.h>
35#include <linux/task_work.h>
36#include <linux/sched/isolation.h>
37 24
38#include <trace/events/sched.h> 25#include <trace/events/sched.h>
39 26
40#include "sched.h"
41
42/* 27/*
43 * Targeted preemption latency for CPU-bound tasks: 28 * Targeted preemption latency for CPU-bound tasks:
44 * 29 *
@@ -103,7 +88,7 @@ const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
103 88
104#ifdef CONFIG_SMP 89#ifdef CONFIG_SMP
105/* 90/*
106 * For asym packing, by default the lower numbered cpu has higher priority. 91 * For asym packing, by default the lower numbered CPU has higher priority.
107 */ 92 */
108int __weak arch_asym_cpu_priority(int cpu) 93int __weak arch_asym_cpu_priority(int cpu)
109{ 94{
@@ -1181,7 +1166,7 @@ pid_t task_numa_group_id(struct task_struct *p)
1181} 1166}
1182 1167
1183/* 1168/*
1184 * The averaged statistics, shared & private, memory & cpu, 1169 * The averaged statistics, shared & private, memory & CPU,
1185 * occupy the first half of the array. The second half of the 1170 * occupy the first half of the array. The second half of the
1186 * array is for current counters, which are averaged into the 1171 * array is for current counters, which are averaged into the
1187 * first set by task_numa_placement. 1172 * first set by task_numa_placement.
@@ -1587,7 +1572,7 @@ static void task_numa_compare(struct task_numa_env *env,
1587 * be incurred if the tasks were swapped. 1572 * be incurred if the tasks were swapped.
1588 */ 1573 */
1589 if (cur) { 1574 if (cur) {
1590 /* Skip this swap candidate if it cannot move to the source cpu */ 1575 /* Skip this swap candidate if it cannot move to the source CPU: */
1591 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed)) 1576 if (!cpumask_test_cpu(env->src_cpu, &cur->cpus_allowed))
1592 goto unlock; 1577 goto unlock;
1593 1578
@@ -1631,7 +1616,7 @@ static void task_numa_compare(struct task_numa_env *env,
1631 goto balance; 1616 goto balance;
1632 } 1617 }
1633 1618
1634 /* Balance doesn't matter much if we're running a task per cpu */ 1619 /* Balance doesn't matter much if we're running a task per CPU: */
1635 if (imp > env->best_imp && src_rq->nr_running == 1 && 1620 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1636 dst_rq->nr_running == 1) 1621 dst_rq->nr_running == 1)
1637 goto assign; 1622 goto assign;
@@ -1676,7 +1661,7 @@ balance:
1676 */ 1661 */
1677 if (!cur) { 1662 if (!cur) {
1678 /* 1663 /*
1679 * select_idle_siblings() uses a per-cpu cpumask that 1664 * select_idle_siblings() uses a per-CPU cpumask that
1680 * can be used from IRQ context. 1665 * can be used from IRQ context.
1681 */ 1666 */
1682 local_irq_disable(); 1667 local_irq_disable();
@@ -1869,6 +1854,7 @@ static int task_numa_migrate(struct task_struct *p)
1869static void numa_migrate_preferred(struct task_struct *p) 1854static void numa_migrate_preferred(struct task_struct *p)
1870{ 1855{
1871 unsigned long interval = HZ; 1856 unsigned long interval = HZ;
1857 unsigned long numa_migrate_retry;
1872 1858
1873 /* This task has no NUMA fault statistics yet */ 1859 /* This task has no NUMA fault statistics yet */
1874 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults)) 1860 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
@@ -1876,7 +1862,18 @@ static void numa_migrate_preferred(struct task_struct *p)
1876 1862
1877 /* Periodically retry migrating the task to the preferred node */ 1863 /* Periodically retry migrating the task to the preferred node */
1878 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16); 1864 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1879 p->numa_migrate_retry = jiffies + interval; 1865 numa_migrate_retry = jiffies + interval;
1866
1867 /*
1868 * Check that the new retry threshold is after the current one. If
1869 * the retry is in the future, it implies that wake_affine has
1870 * temporarily asked NUMA balancing to backoff from placement.
1871 */
1872 if (numa_migrate_retry > p->numa_migrate_retry)
1873 return;
1874
1875 /* Safe to try placing the task on the preferred node */
1876 p->numa_migrate_retry = numa_migrate_retry;
1880 1877
1881 /* Success if task is already running on preferred CPU */ 1878 /* Success if task is already running on preferred CPU */
1882 if (task_node(p) == p->numa_preferred_nid) 1879 if (task_node(p) == p->numa_preferred_nid)
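
The retry interval computed at the top of this hunk is the smaller of one second and a sixteenth of the task's NUMA scan period, converted to jiffies. A toy illustration of that arithmetic, assuming HZ=250 and a simple round-up conversion (both are assumptions for the example, not taken from this patch):

#include <stdio.h>

#define HZ_SKETCH 250   /* assumed tick rate for this illustration */

static unsigned long msecs_to_jiffies_sketch(unsigned long ms)
{
        return (ms * HZ_SKETCH + 999) / 1000;   /* round up to a whole tick */
}

int main(void)
{
        unsigned long interval = HZ_SKETCH;     /* 1 second cap */
        unsigned long scan_period_ms = 4000;    /* stand-in for p->numa_scan_period */
        unsigned long candidate = msecs_to_jiffies_sketch(scan_period_ms) / 16;

        if (candidate < interval)
                interval = candidate;           /* min(interval, ...) */

        printf("retry in %lu jiffies\n", interval);  /* 62 with these numbers */
        return 0;
}
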
@@ -2823,7 +2820,7 @@ void reweight_task(struct task_struct *p, int prio)
2823} 2820}
2824 2821
2825#ifdef CONFIG_FAIR_GROUP_SCHED 2822#ifdef CONFIG_FAIR_GROUP_SCHED
2826# ifdef CONFIG_SMP 2823#ifdef CONFIG_SMP
2827/* 2824/*
2828 * All this does is approximate the hierarchical proportion which includes that 2825 * All this does is approximate the hierarchical proportion which includes that
2829 * global sum we all love to hate. 2826 * global sum we all love to hate.
@@ -2974,7 +2971,7 @@ static long calc_group_runnable(struct cfs_rq *cfs_rq, long shares)
2974 2971
2975 return clamp_t(long, runnable, MIN_SHARES, shares); 2972 return clamp_t(long, runnable, MIN_SHARES, shares);
2976} 2973}
2977# endif /* CONFIG_SMP */ 2974#endif /* CONFIG_SMP */
2978 2975
2979static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); 2976static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2980 2977
@@ -3350,7 +3347,7 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
3350} 3347}
3351 3348
3352/* 3349/*
3353 * Called within set_task_rq() right before setting a task's cpu. The 3350 * Called within set_task_rq() right before setting a task's CPU. The
3354 * caller only guarantees p->pi_lock is held; no other assumptions, 3351 * caller only guarantees p->pi_lock is held; no other assumptions,
3355 * including the state of rq->lock, should be made. 3352 * including the state of rq->lock, should be made.
3356 */ 3353 */
@@ -3529,7 +3526,7 @@ update_tg_cfs_runnable(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cf
3529 3526
3530 /* 3527 /*
3531 * runnable_sum can't be lower than running_sum 3528 * runnable_sum can't be lower than running_sum
3532 * As the running sum is scaled with cpu capacity whereas the runnable sum 3529 * As the running sum is scaled with CPU capacity whereas the runnable sum
3533 * is not, we rescale running_sum first 3530 * is not, we rescale running_sum first
3534 */ 3531 */
3535 running_sum = se->avg.util_sum / 3532 running_sum = se->avg.util_sum /
@@ -4676,7 +4673,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4676 if (!se) 4673 if (!se)
4677 add_nr_running(rq, task_delta); 4674 add_nr_running(rq, task_delta);
4678 4675
4679 /* determine whether we need to wake up potentially idle cpu */ 4676 /* Determine whether we need to wake up potentially idle CPU: */
4680 if (rq->curr == rq->idle && rq->cfs.nr_running) 4677 if (rq->curr == rq->idle && rq->cfs.nr_running)
4681 resched_curr(rq); 4678 resched_curr(rq);
4682} 4679}
@@ -5041,7 +5038,7 @@ static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
5041} 5038}
5042 5039
5043/* 5040/*
5044 * Both these cpu hotplug callbacks race against unregister_fair_sched_group() 5041 * Both these CPU hotplug callbacks race against unregister_fair_sched_group()
5045 * 5042 *
5046 * The race is harmless, since modifying bandwidth settings of unhooked group 5043 * The race is harmless, since modifying bandwidth settings of unhooked group
5047 * bits doesn't do much. 5044 * bits doesn't do much.
@@ -5086,7 +5083,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
5086 */ 5083 */
5087 cfs_rq->runtime_remaining = 1; 5084 cfs_rq->runtime_remaining = 1;
5088 /* 5085 /*
5089 * Offline rq is schedulable till cpu is completely disabled 5086 * Offline rq is schedulable till CPU is completely disabled
5090 * in take_cpu_down(), so we prevent new cfs throttling here. 5087 * in take_cpu_down(), so we prevent new cfs throttling here.
5091 */ 5088 */
5092 cfs_rq->runtime_enabled = 0; 5089 cfs_rq->runtime_enabled = 0;
@@ -5323,8 +5320,8 @@ DEFINE_PER_CPU(cpumask_var_t, select_idle_mask);
5323 * 5320 *
5324 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load 5321 * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load
5325 * 5322 *
5326 * If a cpu misses updates for n ticks (as it was idle) and update gets 5323 * If a CPU misses updates for n ticks (as it was idle) and update gets
5327 * called on the n+1-th tick when cpu may be busy, then we have: 5324 * called on the n+1-th tick when CPU may be busy, then we have:
5328 * 5325 *
5329 * load_n = (1 - 1/2^i)^n * load_0 5326 * load_n = (1 - 1/2^i)^n * load_0
5330 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load 5327 * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load
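
The comment above gives the decay rule load' = (1 - 1/2^i) * load + (1/2^i) * cur_load and its closed form for n missed idle ticks. A toy floating-point model of that decay follows; the kernel uses precomputed fixed-point tables instead, so this only illustrates the math.

#include <stdio.h>

/*
 * Apply load' = (1 - 1/2^idx) * load once per missed tick while the CPU
 * was idle (cur_load == 0), i.e. load_n = (1 - 1/2^idx)^n * load_0.
 */
static double decay_missed_ticks(double load, int idx, unsigned int n)
{
        double keep = 1.0 - 1.0 / (double)(1u << idx);

        while (n--)
                load *= keep;
        return load;
}

int main(void)
{
        /* Index 2 keeps 3/4 of the load per tick; after 8 idle ticks ~10% is left. */
        printf("decayed load: %.1f\n", decay_missed_ticks(1024.0, 2, 8));
        return 0;
}
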
@@ -5468,7 +5465,7 @@ static unsigned long weighted_cpuload(struct rq *rq)
5468#ifdef CONFIG_NO_HZ_COMMON 5465#ifdef CONFIG_NO_HZ_COMMON
5469/* 5466/*
5470 * There is no sane way to deal with nohz on smp when using jiffies because the 5467 * There is no sane way to deal with nohz on smp when using jiffies because the
5471 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading 5468 * CPU doing the jiffies update might drift wrt the CPU doing the jiffy reading
5472 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. 5469 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5473 * 5470 *
5474 * Therefore we need to avoid the delta approach from the regular tick when 5471 * Therefore we need to avoid the delta approach from the regular tick when
@@ -5579,7 +5576,7 @@ void cpu_load_update_active(struct rq *this_rq)
5579} 5576}
5580 5577
5581/* 5578/*
5582 * Return a low guess at the load of a migration-source cpu weighted 5579 * Return a low guess at the load of a migration-source CPU weighted
5583 * according to the scheduling class and "nice" value. 5580 * according to the scheduling class and "nice" value.
5584 * 5581 *
5585 * We want to under-estimate the load of migration sources, to 5582 * We want to under-estimate the load of migration sources, to
@@ -5597,7 +5594,7 @@ static unsigned long source_load(int cpu, int type)
5597} 5594}
5598 5595
5599/* 5596/*
5600 * Return a high guess at the load of a migration-target cpu weighted 5597 * Return a high guess at the load of a migration-target CPU weighted
5601 * according to the scheduling class and "nice" value. 5598 * according to the scheduling class and "nice" value.
5602 */ 5599 */
5603static unsigned long target_load(int cpu, int type) 5600static unsigned long target_load(int cpu, int type)
@@ -5724,7 +5721,6 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5724 unsigned long task_load; 5721 unsigned long task_load;
5725 5722
5726 this_eff_load = target_load(this_cpu, sd->wake_idx); 5723 this_eff_load = target_load(this_cpu, sd->wake_idx);
5727 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5728 5724
5729 if (sync) { 5725 if (sync) {
5730 unsigned long current_load = task_h_load(current); 5726 unsigned long current_load = task_h_load(current);
@@ -5742,18 +5738,69 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
5742 this_eff_load *= 100; 5738 this_eff_load *= 100;
5743 this_eff_load *= capacity_of(prev_cpu); 5739 this_eff_load *= capacity_of(prev_cpu);
5744 5740
5741 prev_eff_load = source_load(prev_cpu, sd->wake_idx);
5745 prev_eff_load -= task_load; 5742 prev_eff_load -= task_load;
5746 if (sched_feat(WA_BIAS)) 5743 if (sched_feat(WA_BIAS))
5747 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2; 5744 prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
5748 prev_eff_load *= capacity_of(this_cpu); 5745 prev_eff_load *= capacity_of(this_cpu);
5749 5746
5750 return this_eff_load <= prev_eff_load ? this_cpu : nr_cpumask_bits; 5747 /*
5748 * If sync, adjust the weight of prev_eff_load such that if
5749 * prev_eff == this_eff that select_idle_sibling() will consider
5750 * stacking the wakee on top of the waker if no other CPU is
5751 * idle.
5752 */
5753 if (sync)
5754 prev_eff_load += 1;
5755
5756 return this_eff_load < prev_eff_load ? this_cpu : nr_cpumask_bits;
5757}
5758
5759#ifdef CONFIG_NUMA_BALANCING
5760static void
5761update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5762{
5763 unsigned long interval;
5764
5765 if (!static_branch_likely(&sched_numa_balancing))
5766 return;
5767
5768 /* If balancing has no preference then continue gathering data */
5769 if (p->numa_preferred_nid == -1)
5770 return;
5771
5772 /*
5773 * If the wakeup is not affecting locality then it is neutral from
5774 * the perspective of NUMA balancing, so continue gathering data.
5775 */
5776 if (cpu_to_node(prev_cpu) == cpu_to_node(target))
5777 return;
5778
5779 /*
5780 * Temporarily prevent NUMA balancing from trying to place waker/wakee after
5781 * the wakee has been moved by wake_affine. This will potentially allow
5782 * related tasks to converge and update their data placement. The
5783 * 4 * numa_scan_period is to allow the two-pass filter to migrate
5784 * hot data to the waker's node.
5785 */
5786 interval = max(sysctl_numa_balancing_scan_delay,
5787 p->numa_scan_period << 2);
5788 p->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5789
5790 interval = max(sysctl_numa_balancing_scan_delay,
5791 current->numa_scan_period << 2);
5792 current->numa_migrate_retry = jiffies + msecs_to_jiffies(interval);
5793}
5794#else
5795static void
5796update_wa_numa_placement(struct task_struct *p, int prev_cpu, int target)
5797{
5751} 5798}
5799#endif
5752 5800
5753static int wake_affine(struct sched_domain *sd, struct task_struct *p, 5801static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5754 int prev_cpu, int sync) 5802 int this_cpu, int prev_cpu, int sync)
5755{ 5803{
5756 int this_cpu = smp_processor_id();
5757 int target = nr_cpumask_bits; 5804 int target = nr_cpumask_bits;
5758 5805
5759 if (sched_feat(WA_IDLE)) 5806 if (sched_feat(WA_IDLE))
@@ -5766,6 +5813,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5766 if (target == nr_cpumask_bits) 5813 if (target == nr_cpumask_bits)
5767 return prev_cpu; 5814 return prev_cpu;
5768 5815
5816 update_wa_numa_placement(p, prev_cpu, target);
5769 schedstat_inc(sd->ttwu_move_affine); 5817 schedstat_inc(sd->ttwu_move_affine);
5770 schedstat_inc(p->se.statistics.nr_wakeups_affine); 5818 schedstat_inc(p->se.statistics.nr_wakeups_affine);
5771 return target; 5819 return target;
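
The reworked wake_affine_weight() above cross-scales each side's load by the other CPU's capacity and, for sync wakeups, biases prev_eff_load up by one so that a tie now prefers the waking CPU. A compact sketch of that decision; the loads and capacities are made up, and the imbalance_pct and task_h_load() adjustments are deliberately omitted.

#include <stdbool.h>
#include <stdio.h>

/* true means "pull the wakee to this_cpu", false means "leave it on prev_cpu". */
static bool prefer_this_cpu(unsigned long this_load, unsigned long this_cap,
                            unsigned long prev_load, unsigned long prev_cap,
                            bool sync)
{
        unsigned long this_eff = this_load * prev_cap;  /* cross-scaled loads */
        unsigned long prev_eff = prev_load * this_cap;

        if (sync)
                prev_eff += 1;          /* sync breaks ties toward the waker */

        return this_eff < prev_eff;
}

int main(void)
{
        /* Equal loads and capacities: only a sync wakeup wins the tie. */
        printf("async: %d\n", prefer_this_cpu(512, 1024, 512, 1024, false));
        printf("sync : %d\n", prefer_this_cpu(512, 1024, 512, 1024, true));
        return 0;
}
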
@@ -5826,7 +5874,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5826 max_spare_cap = 0; 5874 max_spare_cap = 0;
5827 5875
5828 for_each_cpu(i, sched_group_span(group)) { 5876 for_each_cpu(i, sched_group_span(group)) {
5829 /* Bias balancing toward cpus of our domain */ 5877 /* Bias balancing toward CPUs of our domain */
5830 if (local_group) 5878 if (local_group)
5831 load = source_load(i, load_idx); 5879 load = source_load(i, load_idx);
5832 else 5880 else
@@ -5856,7 +5904,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5856 if (min_runnable_load > (runnable_load + imbalance)) { 5904 if (min_runnable_load > (runnable_load + imbalance)) {
5857 /* 5905 /*
5858 * The runnable load is significantly smaller 5906 * The runnable load is significantly smaller
5859 * so we can pick this new cpu 5907 * so we can pick this new CPU:
5860 */ 5908 */
5861 min_runnable_load = runnable_load; 5909 min_runnable_load = runnable_load;
5862 min_avg_load = avg_load; 5910 min_avg_load = avg_load;
@@ -5865,7 +5913,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5865 (100*min_avg_load > imbalance_scale*avg_load)) { 5913 (100*min_avg_load > imbalance_scale*avg_load)) {
5866 /* 5914 /*
5867 * The runnable loads are close so take the 5915 * The runnable loads are close so take the
5868 * blocked load into account through avg_load. 5916 * blocked load into account through avg_load:
5869 */ 5917 */
5870 min_avg_load = avg_load; 5918 min_avg_load = avg_load;
5871 idlest = group; 5919 idlest = group;
@@ -5903,6 +5951,18 @@ skip_spare:
5903 if (!idlest) 5951 if (!idlest)
5904 return NULL; 5952 return NULL;
5905 5953
5954 /*
5955 * When comparing groups across NUMA domains, it's possible for the
5956 * local domain to be very lightly loaded relative to the remote
5957 * domains but "imbalance" skews the comparison making remote CPUs
5958 * look much more favourable. When considering cross-domain, add
5959 * imbalance to the runnable load on the remote node and consider
5960 * staying local.
5961 */
5962 if ((sd->flags & SD_NUMA) &&
5963 min_runnable_load + imbalance >= this_runnable_load)
5964 return NULL;
5965
5906 if (min_runnable_load > (this_runnable_load + imbalance)) 5966 if (min_runnable_load > (this_runnable_load + imbalance))
5907 return NULL; 5967 return NULL;
5908 5968
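
The hunk above makes find_idlest_group() stay local when the comparison crosses a NUMA boundary and the remote group is not clearly lighter even after being charged the imbalance margin. A small sketch of both checks with illustrative load values:

#include <stdbool.h>
#include <stdio.h>

static bool stay_local(bool numa_domain, unsigned long local_load,
                       unsigned long remote_load, unsigned long imbalance)
{
        /* Cross-node: the remote group must beat local even after paying the margin. */
        if (numa_domain && remote_load + imbalance >= local_load)
                return true;

        /* Existing check: remote is not clearly less loaded than local. */
        return remote_load > local_load + imbalance;
}

int main(void)
{
        /* A lightly loaded local node keeps the task despite a quieter remote node. */
        printf("stay local: %d\n", stay_local(true, 100, 60, 50));
        return 0;
}
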
@@ -5914,7 +5974,7 @@ skip_spare:
5914} 5974}
5915 5975
5916/* 5976/*
5917 * find_idlest_group_cpu - find the idlest cpu among the cpus in group. 5977 * find_idlest_group_cpu - find the idlest CPU among the CPUs in the group.
5918 */ 5978 */
5919static int 5979static int
5920find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) 5980find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
@@ -5992,12 +6052,12 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
5992 6052
5993 new_cpu = find_idlest_group_cpu(group, p, cpu); 6053 new_cpu = find_idlest_group_cpu(group, p, cpu);
5994 if (new_cpu == cpu) { 6054 if (new_cpu == cpu) {
5995 /* Now try balancing at a lower domain level of cpu */ 6055 /* Now try balancing at a lower domain level of 'cpu': */
5996 sd = sd->child; 6056 sd = sd->child;
5997 continue; 6057 continue;
5998 } 6058 }
5999 6059
6000 /* Now try balancing at a lower domain level of new_cpu */ 6060 /* Now try balancing at a lower domain level of 'new_cpu': */
6001 cpu = new_cpu; 6061 cpu = new_cpu;
6002 weight = sd->span_weight; 6062 weight = sd->span_weight;
6003 sd = NULL; 6063 sd = NULL;
@@ -6007,7 +6067,6 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
6007 if (tmp->flags & sd_flag) 6067 if (tmp->flags & sd_flag)
6008 sd = tmp; 6068 sd = tmp;
6009 } 6069 }
6010 /* while loop will break here if sd == NULL */
6011 } 6070 }
6012 6071
6013 return new_cpu; 6072 return new_cpu;
@@ -6203,12 +6262,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6203 return target; 6262 return target;
6204 6263
6205 /* 6264 /*
6206 * If the previous cpu is cache affine and idle, don't be stupid. 6265 * If the previous CPU is cache affine and idle, don't be stupid:
6207 */ 6266 */
6208 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) 6267 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev))
6209 return prev; 6268 return prev;
6210 6269
6211 /* Check a recently used CPU as a potential idle candidate */ 6270 /* Check a recently used CPU as a potential idle candidate: */
6212 recent_used_cpu = p->recent_used_cpu; 6271 recent_used_cpu = p->recent_used_cpu;
6213 if (recent_used_cpu != prev && 6272 if (recent_used_cpu != prev &&
6214 recent_used_cpu != target && 6273 recent_used_cpu != target &&
@@ -6217,7 +6276,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
6217 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { 6276 cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) {
6218 /* 6277 /*
6219 * Replace recent_used_cpu with prev as it is a potential 6278 * Replace recent_used_cpu with prev as it is a potential
6220 * candidate for the next wake. 6279 * candidate for the next wake:
6221 */ 6280 */
6222 p->recent_used_cpu = prev; 6281 p->recent_used_cpu = prev;
6223 return recent_used_cpu; 6282 return recent_used_cpu;
@@ -6282,7 +6341,7 @@ static inline unsigned long task_util(struct task_struct *p)
6282} 6341}
6283 6342
6284/* 6343/*
6285 * cpu_util_wake: Compute cpu utilization with any contributions from 6344 * cpu_util_wake: Compute CPU utilization with any contributions from
6286 * the waking task p removed. 6345 * the waking task p removed.
6287 */ 6346 */
6288static unsigned long cpu_util_wake(int cpu, struct task_struct *p) 6347static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
@@ -6328,10 +6387,10 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6328 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE, 6387 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6329 * SD_BALANCE_FORK, or SD_BALANCE_EXEC. 6388 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6330 * 6389 *
6331 * Balances load by selecting the idlest cpu in the idlest group, or under 6390 * Balances load by selecting the idlest CPU in the idlest group, or under
6332 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set. 6391 * certain conditions an idle sibling CPU if the domain has SD_WAKE_AFFINE set.
6333 * 6392 *
6334 * Returns the target cpu number. 6393 * Returns the target CPU number.
6335 * 6394 *
6336 * preempt must be disabled. 6395 * preempt must be disabled.
6337 */ 6396 */
@@ -6342,7 +6401,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6342 int cpu = smp_processor_id(); 6401 int cpu = smp_processor_id();
6343 int new_cpu = prev_cpu; 6402 int new_cpu = prev_cpu;
6344 int want_affine = 0; 6403 int want_affine = 0;
6345 int sync = wake_flags & WF_SYNC; 6404 int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
6346 6405
6347 if (sd_flag & SD_BALANCE_WAKE) { 6406 if (sd_flag & SD_BALANCE_WAKE) {
6348 record_wakee(p); 6407 record_wakee(p);
@@ -6356,7 +6415,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6356 break; 6415 break;
6357 6416
6358 /* 6417 /*
6359 * If both cpu and prev_cpu are part of this domain, 6418 * If both 'cpu' and 'prev_cpu' are part of this domain,
6360 * cpu is a valid SD_WAKE_AFFINE target. 6419 * cpu is a valid SD_WAKE_AFFINE target.
6361 */ 6420 */
6362 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && 6421 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
@@ -6376,7 +6435,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
6376 if (cpu == prev_cpu) 6435 if (cpu == prev_cpu)
6377 goto pick_cpu; 6436 goto pick_cpu;
6378 6437
6379 new_cpu = wake_affine(affine_sd, p, prev_cpu, sync); 6438 new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync);
6380 } 6439 }
6381 6440
6382 if (sd && !(sd_flag & SD_BALANCE_FORK)) { 6441 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
@@ -6407,9 +6466,9 @@ pick_cpu:
6407static void detach_entity_cfs_rq(struct sched_entity *se); 6466static void detach_entity_cfs_rq(struct sched_entity *se);
6408 6467
6409/* 6468/*
6410 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and 6469 * Called immediately before a task is migrated to a new CPU; task_cpu(p) and
6411 * cfs_rq_of(p) references at time of call are still valid and identify the 6470 * cfs_rq_of(p) references at time of call are still valid and identify the
6412 * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. 6471 * previous CPU. The caller guarantees p->pi_lock or task_rq(p)->lock is held.
6413 */ 6472 */
6414static void migrate_task_rq_fair(struct task_struct *p) 6473static void migrate_task_rq_fair(struct task_struct *p)
6415{ 6474{
@@ -6843,17 +6902,17 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6843 * BASICS 6902 * BASICS
6844 * 6903 *
6845 * The purpose of load-balancing is to achieve the same basic fairness the 6904 * The purpose of load-balancing is to achieve the same basic fairness the
6846 * per-cpu scheduler provides, namely provide a proportional amount of compute 6905 * per-CPU scheduler provides, namely provide a proportional amount of compute
6847 * time to each task. This is expressed in the following equation: 6906 * time to each task. This is expressed in the following equation:
6848 * 6907 *
6849 * W_i,n/P_i == W_j,n/P_j for all i,j (1) 6908 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
6850 * 6909 *
6851 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight 6910 * Where W_i,n is the n-th weight average for CPU i. The instantaneous weight
6852 * W_i,0 is defined as: 6911 * W_i,0 is defined as:
6853 * 6912 *
6854 * W_i,0 = \Sum_j w_i,j (2) 6913 * W_i,0 = \Sum_j w_i,j (2)
6855 * 6914 *
6856 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight 6915 * Where w_i,j is the weight of the j-th runnable task on CPU i. This weight
6857 * is derived from the nice value as per sched_prio_to_weight[]. 6916 * is derived from the nice value as per sched_prio_to_weight[].
6858 * 6917 *
6859 * The weight average is an exponential decay average of the instantaneous 6918 * The weight average is an exponential decay average of the instantaneous
@@ -6861,7 +6920,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6861 * 6920 *
6862 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3) 6921 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
6863 * 6922 *
6864 * C_i is the compute capacity of cpu i, typically it is the 6923 * C_i is the compute capacity of CPU i, typically it is the
6865 * fraction of 'recent' time available for SCHED_OTHER task execution. But it 6924 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
6866 * can also include other factors [XXX]. 6925 * can also include other factors [XXX].
6867 * 6926 *
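
As a worked instance of equations (1) and (2) quoted above: with two CPUs of equal capacity, two nice-0 tasks on CPU 0 and one on CPU 1 give unequal W_i/P_i ratios, which is exactly the imbalance the balancer corrects. The weights below are illustrative; 1024 is the weight a nice-0 task gets from sched_prio_to_weight[].

#include <stdio.h>

int main(void)
{
        unsigned long cap[2]    = { 1024, 1024 };          /* P_0, P_1 */
        unsigned long weight[2] = { 2 * 1024, 1 * 1024 };  /* W_0,0 and W_1,0, per (2) */

        /* (1) wants these ratios equal; moving ~512 units of weight fixes it. */
        printf("W_0/P_0 = %.2f, W_1/P_1 = %.2f\n",
               (double)weight[0] / cap[0], (double)weight[1] / cap[1]);
        return 0;
}
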
@@ -6882,11 +6941,11 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6882 * SCHED DOMAINS 6941 * SCHED DOMAINS
6883 * 6942 *
6884 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2) 6943 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
6885 * for all i,j solution, we create a tree of cpus that follows the hardware 6944 * for all i,j solution, we create a tree of CPUs that follows the hardware
6886 * topology where each level pairs two lower groups (or better). This results 6945 * topology where each level pairs two lower groups (or better). This results
6887 * in O(log n) layers. Furthermore we reduce the number of cpus going up the 6946 * in O(log n) layers. Furthermore we reduce the number of CPUs going up the
6888 * tree to only the first of the previous level and we decrease the frequency 6947 * tree to only the first of the previous level and we decrease the frequency
6889 * of load-balance at each level inv. proportional to the number of cpus in 6948 * of load-balance at each level inv. proportional to the number of CPUs in
6890 * the groups. 6949 * the groups.
6891 * 6950 *
6892 * This yields: 6951 * This yields:
@@ -6895,7 +6954,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6895 * \Sum { --- * --- * 2^i } = O(n) (5) 6954 * \Sum { --- * --- * 2^i } = O(n) (5)
6896 * i = 0 2^i 2^i 6955 * i = 0 2^i 2^i
6897 * `- size of each group 6956 * `- size of each group
6898 * | | `- number of cpus doing load-balance 6957 * | | `- number of CPUs doing load-balance
6899 * | `- freq 6958 * | `- freq
6900 * `- sum over all levels 6959 * `- sum over all levels
6901 * 6960 *
@@ -6903,7 +6962,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6903 * this makes (5) the runtime complexity of the balancer. 6962 * this makes (5) the runtime complexity of the balancer.
6904 * 6963 *
6905 * An important property here is that each CPU is still (indirectly) connected 6964 * An important property here is that each CPU is still (indirectly) connected
6906 * to every other cpu in at most O(log n) steps: 6965 * to every other CPU in at most O(log n) steps:
6907 * 6966 *
6908 * The adjacency matrix of the resulting graph is given by: 6967 * The adjacency matrix of the resulting graph is given by:
6909 * 6968 *
@@ -6915,7 +6974,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6915 * 6974 *
6916 * A^(log_2 n)_i,j != 0 for all i,j (7) 6975 * A^(log_2 n)_i,j != 0 for all i,j (7)
6917 * 6976 *
6918 * Showing there's indeed a path between every cpu in at most O(log n) steps. 6977 * Showing there's indeed a path between every CPU in at most O(log n) steps.
6919 * The task movement gives a factor of O(m), giving a convergence complexity 6978 * The task movement gives a factor of O(m), giving a convergence complexity
6920 * of: 6979 * of:
6921 * 6980 *
@@ -6925,7 +6984,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6925 * WORK CONSERVING 6984 * WORK CONSERVING
6926 * 6985 *
6927 * In order to avoid CPUs going idle while there's still work to do, new idle 6986 * In order to avoid CPUs going idle while there's still work to do, new idle
6928 * balancing is more aggressive and has the newly idle cpu iterate up the domain 6987 * balancing is more aggressive and has the newly idle CPU iterate up the domain
6929 * tree itself instead of relying on other CPUs to bring it work. 6988 * tree itself instead of relying on other CPUs to bring it work.
6930 * 6989 *
6931 * This adds some complexity to both (5) and (8) but it reduces the total idle 6990 * This adds some complexity to both (5) and (8) but it reduces the total idle
@@ -6946,7 +7005,7 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
6946 * 7005 *
6947 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10) 7006 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
6948 * 7007 *
6949 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i. 7008 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on CPU i.
6950 * 7009 *
6951 * The big problem is S_k, it's a global sum needed to compute a local (W_i) 7010 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
6952 * property. 7011 * property.
@@ -7110,7 +7169,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7110 env->flags |= LBF_SOME_PINNED; 7169 env->flags |= LBF_SOME_PINNED;
7111 7170
7112 /* 7171 /*
7113 * Remember if this task can be migrated to any other cpu in 7172 * Remember if this task can be migrated to any other CPU in
7114 * our sched_group. We may want to revisit it if we couldn't 7173 * our sched_group. We may want to revisit it if we couldn't
7115 * meet load balance goals by pulling other tasks on src_cpu. 7174 * meet load balance goals by pulling other tasks on src_cpu.
7116 * 7175 *
@@ -7120,7 +7179,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env)
7120 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED)) 7179 if (env->idle == CPU_NEWLY_IDLE || (env->flags & LBF_DST_PINNED))
7121 return 0; 7180 return 0;
7122 7181
7123 /* Prevent re-selecting dst_cpu via env's cpus */ 7182 /* Prevent re-selecting dst_cpu via env's CPUs: */
7124 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { 7183 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7125 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) { 7184 if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
7126 env->flags |= LBF_DST_PINNED; 7185 env->flags |= LBF_DST_PINNED;
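
The loop above looks for a CPU that is simultaneously in the destination group, still eligible in env->cpus, and allowed by the pinned task's affinity mask. A stand-alone sketch of that intersection walk using plain 64-bit masks instead of struct cpumask:

#include <stdint.h>
#include <stdio.h>

/* Return the first CPU (bit) set in all three masks, or -1 if none. */
static int first_allowed_dst(uint64_t dst_group, uint64_t env_cpus, uint64_t task_allowed)
{
        uint64_t candidates = dst_group & env_cpus & task_allowed;

        for (int cpu = 0; cpu < 64; cpu++)
                if (candidates & (1ULL << cpu))
                        return cpu;
        return -1;
}

int main(void)
{
        /* Task pinned to CPUs {2,3}; destination group {2,3}; CPU 2 already excluded. */
        printf("alternate dst_cpu: %d\n", first_allowed_dst(0xC, 0x8, 0xC));
        return 0;
}
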
@@ -7694,8 +7753,8 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7694 * Group imbalance indicates (and tries to solve) the problem where balancing 7753 * Group imbalance indicates (and tries to solve) the problem where balancing
7695 * groups is inadequate due to ->cpus_allowed constraints. 7754 * groups is inadequate due to ->cpus_allowed constraints.
7696 * 7755 *
7697 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a 7756 * Imagine a situation of two groups of 4 CPUs each and 4 tasks each with a
7698 * cpumask covering 1 cpu of the first group and 3 cpus of the second group. 7757 * cpumask covering 1 CPU of the first group and 3 CPUs of the second group.
7699 * Something like: 7758 * Something like:
7700 * 7759 *
7701 * { 0 1 2 3 } { 4 5 6 7 } 7760 * { 0 1 2 3 } { 4 5 6 7 }
@@ -7703,7 +7762,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
7703 * 7762 *
7704 * If we were to balance group-wise we'd place two tasks in the first group and 7763 * If we were to balance group-wise we'd place two tasks in the first group and
7705 * two tasks in the second group. Clearly this is undesired as it will overload 7764 * two tasks in the second group. Clearly this is undesired as it will overload
7706 * cpu 3 and leave one of the cpus in the second group unused. 7765 * cpu 3 and leave one of the CPUs in the second group unused.
7707 * 7766 *
7708 * The current solution to this issue is detecting the skew in the first group 7767 * The current solution to this issue is detecting the skew in the first group
7709 * by noticing the lower domain failed to reach balance and had difficulty 7768 * by noticing the lower domain failed to reach balance and had difficulty
@@ -7816,7 +7875,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
7816 for_each_cpu_and(i, sched_group_span(group), env->cpus) { 7875 for_each_cpu_and(i, sched_group_span(group), env->cpus) {
7817 struct rq *rq = cpu_rq(i); 7876 struct rq *rq = cpu_rq(i);
7818 7877
7819 /* Bias balancing toward cpus of our domain */ 7878 /* Bias balancing toward CPUs of our domain: */
7820 if (local_group) 7879 if (local_group)
7821 load = target_load(i, load_idx); 7880 load = target_load(i, load_idx);
7822 else 7881 else
@@ -7902,7 +7961,7 @@ asym_packing:
7902 if (!(env->sd->flags & SD_ASYM_PACKING)) 7961 if (!(env->sd->flags & SD_ASYM_PACKING))
7903 return true; 7962 return true;
7904 7963
7905 /* No ASYM_PACKING if target cpu is already busy */ 7964 /* No ASYM_PACKING if target CPU is already busy */
7906 if (env->idle == CPU_NOT_IDLE) 7965 if (env->idle == CPU_NOT_IDLE)
7907 return true; 7966 return true;
7908 /* 7967 /*
@@ -7915,7 +7974,7 @@ asym_packing:
7915 if (!sds->busiest) 7974 if (!sds->busiest)
7916 return true; 7975 return true;
7917 7976
7918 /* Prefer to move from lowest priority cpu's work */ 7977 /* Prefer to move from lowest priority CPU's work */
7919 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu, 7978 if (sched_asym_prefer(sds->busiest->asym_prefer_cpu,
7920 sg->asym_prefer_cpu)) 7979 sg->asym_prefer_cpu))
7921 return true; 7980 return true;
@@ -8168,7 +8227,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8168 if (busiest->group_type == group_imbalanced) { 8227 if (busiest->group_type == group_imbalanced) {
8169 /* 8228 /*
8170 * In the group_imb case we cannot rely on group-wide averages 8229 * In the group_imb case we cannot rely on group-wide averages
8171 * to ensure cpu-load equilibrium, look at wider averages. XXX 8230 * to ensure CPU-load equilibrium, look at wider averages. XXX
8172 */ 8231 */
8173 busiest->load_per_task = 8232 busiest->load_per_task =
8174 min(busiest->load_per_task, sds->avg_load); 8233 min(busiest->load_per_task, sds->avg_load);
@@ -8187,7 +8246,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8187 } 8246 }
8188 8247
8189 /* 8248 /*
8190 * If there aren't any idle cpus, avoid creating some. 8249 * If there aren't any idle CPUs, avoid creating some.
8191 */ 8250 */
8192 if (busiest->group_type == group_overloaded && 8251 if (busiest->group_type == group_overloaded &&
8193 local->group_type == group_overloaded) { 8252 local->group_type == group_overloaded) {
@@ -8201,9 +8260,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
8201 } 8260 }
8202 8261
8203 /* 8262 /*
8204 * We're trying to get all the cpus to the average_load, so we don't 8263 * We're trying to get all the CPUs to the average_load, so we don't
8205 * want to push ourselves above the average load, nor do we wish to 8264 * want to push ourselves above the average load, nor do we wish to
8206 * reduce the max loaded cpu below the average load. At the same time, 8265 * reduce the max loaded CPU below the average load. At the same time,
8207 * we also don't want to reduce the group load below the group 8266 * we also don't want to reduce the group load below the group
8208 * capacity. Thus we look for the minimum possible imbalance. 8267 * capacity. Thus we look for the minimum possible imbalance.
8209 */ 8268 */
@@ -8297,9 +8356,9 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
8297 8356
8298 if (env->idle == CPU_IDLE) { 8357 if (env->idle == CPU_IDLE) {
8299 /* 8358 /*
8300 * This cpu is idle. If the busiest group is not overloaded 8359 * This CPU is idle. If the busiest group is not overloaded
8301 * and there is no imbalance between this and busiest group 8360 * and there is no imbalance between this and busiest group
8302 * wrt idle cpus, it is balanced. The imbalance becomes 8361 * wrt idle CPUs, it is balanced. The imbalance becomes
8303 * significant if the diff is greater than 1 otherwise we 8362 * significant if the diff is greater than 1 otherwise we
8304 * might end up to just move the imbalance on another group 8363 * might end up to just move the imbalance on another group
8305 */ 8364 */
@@ -8327,7 +8386,7 @@ out_balanced:
8327} 8386}
8328 8387
8329/* 8388/*
8330 * find_busiest_queue - find the busiest runqueue among the cpus in group. 8389 * find_busiest_queue - find the busiest runqueue among the CPUs in the group.
8331 */ 8390 */
8332static struct rq *find_busiest_queue(struct lb_env *env, 8391static struct rq *find_busiest_queue(struct lb_env *env,
8333 struct sched_group *group) 8392 struct sched_group *group)
@@ -8371,7 +8430,7 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8371 8430
8372 /* 8431 /*
8373 * When comparing with imbalance, use weighted_cpuload() 8432 * When comparing with imbalance, use weighted_cpuload()
8374 * which is not scaled with the cpu capacity. 8433 * which is not scaled with the CPU capacity.
8375 */ 8434 */
8376 8435
8377 if (rq->nr_running == 1 && wl > env->imbalance && 8436 if (rq->nr_running == 1 && wl > env->imbalance &&
@@ -8379,9 +8438,9 @@ static struct rq *find_busiest_queue(struct lb_env *env,
8379 continue; 8438 continue;
8380 8439
8381 /* 8440 /*
8382 * For the load comparisons with the other cpus, consider 8441 * For the load comparisons with the other CPUs, consider
8383 * the weighted_cpuload() scaled with the cpu capacity, so 8442 * the weighted_cpuload() scaled with the CPU capacity, so
8384 * that the load can be moved away from the cpu that is 8443 * that the load can be moved away from the CPU that is
8385 * potentially running at a lower capacity. 8444 * potentially running at a lower capacity.
8386 * 8445 *
8387 * Thus we're looking for max(wl_i / capacity_i), crosswise 8446 * Thus we're looking for max(wl_i / capacity_i), crosswise
@@ -8452,13 +8511,13 @@ static int should_we_balance(struct lb_env *env)
8452 return 0; 8511 return 0;
8453 8512
8454 /* 8513 /*
8455 * In the newly idle case, we will allow all the cpu's 8514 * In the newly idle case, we will allow all the CPUs
8456 * to do the newly idle load balance. 8515 * to do the newly idle load balance.
8457 */ 8516 */
8458 if (env->idle == CPU_NEWLY_IDLE) 8517 if (env->idle == CPU_NEWLY_IDLE)
8459 return 1; 8518 return 1;
8460 8519
8461 /* Try to find first idle cpu */ 8520 /* Try to find first idle CPU */
8462 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { 8521 for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
8463 if (!idle_cpu(cpu)) 8522 if (!idle_cpu(cpu))
8464 continue; 8523 continue;
@@ -8471,7 +8530,7 @@ static int should_we_balance(struct lb_env *env)
8471 balance_cpu = group_balance_cpu(sg); 8530 balance_cpu = group_balance_cpu(sg);
8472 8531
8473 /* 8532 /*
8474 * First idle cpu or the first cpu(busiest) in this sched group 8533 * First idle CPU or the first CPU(busiest) in this sched group
8475 * is eligible for doing load balancing at this and above domains. 8534 * is eligible for doing load balancing at this and above domains.
8476 */ 8535 */
8477 return balance_cpu == env->dst_cpu; 8536 return balance_cpu == env->dst_cpu;
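
should_we_balance() above elects one CPU per group to run the balance pass: the first idle CPU in the balance mask if there is one, otherwise the group's designated balance CPU. A compact sketch of that election; the masks are illustrative and the fallback here simply takes the group's first CPU.

#include <stdint.h>
#include <stdio.h>

/* Pick the balancing CPU: first idle CPU in the group, else the group's first CPU. */
static int elect_balance_cpu(uint64_t group_mask, uint64_t idle_mask)
{
        uint64_t idle_in_group = group_mask & idle_mask;
        uint64_t pick = idle_in_group ? idle_in_group : group_mask;

        for (int cpu = 0; cpu < 64; cpu++)
                if (pick & (1ULL << cpu))
                        return cpu;
        return -1;
}

int main(void)
{
        /* Group {4..7}, CPU 6 is idle: only CPU 6 proceeds with this balance pass. */
        printf("balance_cpu: %d\n", elect_balance_cpu(0xF0, 0x40));
        return 0;
}
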
@@ -8580,7 +8639,7 @@ more_balance:
8580 * Revisit (affine) tasks on src_cpu that couldn't be moved to 8639 * Revisit (affine) tasks on src_cpu that couldn't be moved to
8581 * us and move them to an alternate dst_cpu in our sched_group 8640 * us and move them to an alternate dst_cpu in our sched_group
8582 * where they can run. The upper limit on how many times we 8641 * where they can run. The upper limit on how many times we
8583 * iterate on same src_cpu is dependent on number of cpus in our 8642 * iterate on same src_cpu is dependent on number of CPUs in our
8584 * sched_group. 8643 * sched_group.
8585 * 8644 *
8586 * This changes load balance semantics a bit on who can move 8645 * This changes load balance semantics a bit on who can move
@@ -8597,7 +8656,7 @@ more_balance:
8597 */ 8656 */
8598 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) { 8657 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
8599 8658
8600 /* Prevent re-selecting dst_cpu via env's cpus */ 8659 /* Prevent re-selecting dst_cpu via env's CPUs */
8601 cpumask_clear_cpu(env.dst_cpu, env.cpus); 8660 cpumask_clear_cpu(env.dst_cpu, env.cpus);
8602 8661
8603 env.dst_rq = cpu_rq(env.new_dst_cpu); 8662 env.dst_rq = cpu_rq(env.new_dst_cpu);
@@ -8659,9 +8718,10 @@ more_balance:
8659 8718
8660 raw_spin_lock_irqsave(&busiest->lock, flags); 8719 raw_spin_lock_irqsave(&busiest->lock, flags);
8661 8720
8662 /* don't kick the active_load_balance_cpu_stop, 8721 /*
8663 * if the curr task on busiest cpu can't be 8722 * Don't kick the active_load_balance_cpu_stop,
8664 * moved to this_cpu 8723 * if the curr task on busiest CPU can't be
8724 * moved to this_cpu:
8665 */ 8725 */
8666 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) { 8726 if (!cpumask_test_cpu(this_cpu, &busiest->curr->cpus_allowed)) {
8667 raw_spin_unlock_irqrestore(&busiest->lock, 8727 raw_spin_unlock_irqrestore(&busiest->lock,
@@ -8887,7 +8947,7 @@ out:
8887} 8947}
8888 8948
8889/* 8949/*
8890 * active_load_balance_cpu_stop is run by cpu stopper. It pushes 8950 * active_load_balance_cpu_stop is run by the CPU stopper. It pushes
8891 * running tasks off the busiest CPU onto idle CPUs. It requires at 8951 * running tasks off the busiest CPU onto idle CPUs. It requires at
8892 * least 1 task to be running on each physical CPU where possible, and 8952 * least 1 task to be running on each physical CPU where possible, and
8893 * avoids physical / logical imbalances. 8953 * avoids physical / logical imbalances.
@@ -8911,7 +8971,7 @@ static int active_load_balance_cpu_stop(void *data)
8911 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu)) 8971 if (!cpu_active(busiest_cpu) || !cpu_active(target_cpu))
8912 goto out_unlock; 8972 goto out_unlock;
8913 8973
8914 /* make sure the requested cpu hasn't gone down in the meantime */ 8974 /* Make sure the requested CPU hasn't gone down in the meantime: */
8915 if (unlikely(busiest_cpu != smp_processor_id() || 8975 if (unlikely(busiest_cpu != smp_processor_id() ||
8916 !busiest_rq->active_balance)) 8976 !busiest_rq->active_balance))
8917 goto out_unlock; 8977 goto out_unlock;
@@ -8923,7 +8983,7 @@ static int active_load_balance_cpu_stop(void *data)
8923 /* 8983 /*
8924 * This condition is "impossible", if it occurs 8984 * This condition is "impossible", if it occurs
8925 * we need to fix it. Originally reported by 8985 * we need to fix it. Originally reported by
8926 * Bjorn Helgaas on a 128-cpu setup. 8986 * Bjorn Helgaas on a 128-CPU setup.
8927 */ 8987 */
8928 BUG_ON(busiest_rq == target_rq); 8988 BUG_ON(busiest_rq == target_rq);
8929 8989
@@ -9025,7 +9085,7 @@ static void nohz_balancer_kick(void)
9025 return; 9085 return;
9026 /* 9086 /*
9027 * Use smp_send_reschedule() instead of resched_cpu(). 9087 * Use smp_send_reschedule() instead of resched_cpu().
9028 * This way we generate a sched IPI on the target cpu which 9088 * This way we generate a sched IPI on the target CPU which
9029 * is idle. And the softirq performing nohz idle load balance 9089 * is idle. And the softirq performing nohz idle load balance
9030 * will be run before returning from the IPI. 9090 * will be run before returning from the IPI.
9031 */ 9091 */
@@ -9082,14 +9142,12 @@ unlock:
9082} 9142}
9083 9143
9084/* 9144/*
9085 * This routine will record that the cpu is going idle with tick stopped. 9145 * This routine will record that the CPU is going idle with tick stopped.
9086 * This info will be used in performing idle load balancing in the future. 9146 * This info will be used in performing idle load balancing in the future.
9087 */ 9147 */
9088void nohz_balance_enter_idle(int cpu) 9148void nohz_balance_enter_idle(int cpu)
9089{ 9149{
9090 /* 9150 /* If this CPU is going down, then nothing needs to be done: */
9091 * If this cpu is going down, then nothing needs to be done.
9092 */
9093 if (!cpu_active(cpu)) 9151 if (!cpu_active(cpu))
9094 return; 9152 return;
9095 9153
@@ -9100,9 +9158,7 @@ void nohz_balance_enter_idle(int cpu)
9100 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu))) 9158 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9101 return; 9159 return;
9102 9160
9103 /* 9161 /* If we're a completely isolated CPU, we don't play: */
9104 * If we're a completely isolated CPU, we don't play.
9105 */
9106 if (on_null_domain(cpu_rq(cpu))) 9162 if (on_null_domain(cpu_rq(cpu)))
9107 return; 9163 return;
9108 9164
@@ -9211,7 +9267,7 @@ out:
9211 9267
9212 /* 9268 /*
9213 * next_balance will be updated only when there is a need. 9269 * next_balance will be updated only when there is a need.
9214 * When the cpu is attached to null domain for ex, it will not be 9270 * When the CPU is attached to null domain for ex, it will not be
9215 * updated. 9271 * updated.
9216 */ 9272 */
9217 if (likely(update_next_balance)) { 9273 if (likely(update_next_balance)) {
@@ -9235,7 +9291,7 @@ out:
9235#ifdef CONFIG_NO_HZ_COMMON 9291#ifdef CONFIG_NO_HZ_COMMON
9236/* 9292/*
9237 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the 9293 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9238 * rebalancing for all the cpus for whom scheduler ticks are stopped. 9294 * rebalancing for all the CPUs for whom scheduler ticks are stopped.
9239 */ 9295 */
9240static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) 9296static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9241{ 9297{
@@ -9255,8 +9311,8 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9255 continue; 9311 continue;
9256 9312
9257 /* 9313 /*
9258 * If this cpu gets work to do, stop the load balancing 9314 * If this CPU gets work to do, stop the load balancing
9259 * work being done for other cpus. Next load 9315 * work being done for other CPUs. Next load
9260 * balancing owner will pick it up. 9316 * balancing owner will pick it up.
9261 */ 9317 */
9262 if (need_resched()) 9318 if (need_resched())
@@ -9298,13 +9354,13 @@ end:
9298 9354
9299/* 9355/*
9300 * Current heuristic for kicking the idle load balancer in the presence 9356 * Current heuristic for kicking the idle load balancer in the presence
9301 * of an idle cpu in the system. 9357 * of an idle CPU in the system.
9302 * - This rq has more than one task. 9358 * - This rq has more than one task.
9303 * - This rq has at least one CFS task and the capacity of the CPU is 9359 * - This rq has at least one CFS task and the capacity of the CPU is
9304 * significantly reduced because of RT tasks or IRQs. 9360 * significantly reduced because of RT tasks or IRQs.
9305 * - At parent of LLC scheduler domain level, this cpu's scheduler group has 9361 * - At parent of LLC scheduler domain level, this CPU's scheduler group has
9306 * multiple busy cpu. 9362 * multiple busy CPUs.
9307 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler 9363 * - For SD_ASYM_PACKING, if the lower numbered CPU's in the scheduler
9308 * domain span are idle. 9364 * domain span are idle.
9309 */ 9365 */
9310static inline bool nohz_kick_needed(struct rq *rq) 9366static inline bool nohz_kick_needed(struct rq *rq)
@@ -9394,10 +9450,10 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
9394 CPU_IDLE : CPU_NOT_IDLE; 9450 CPU_IDLE : CPU_NOT_IDLE;
9395 9451
9396 /* 9452 /*
9397 * If this cpu has a pending nohz_balance_kick, then do the 9453 * If this CPU has a pending nohz_balance_kick, then do the
9398 * balancing on behalf of the other idle cpus whose ticks are 9454 * balancing on behalf of the other idle CPUs whose ticks are
9399 * stopped. Do nohz_idle_balance *before* rebalance_domains to 9455 * stopped. Do nohz_idle_balance *before* rebalance_domains to
9400 * give the idle cpus a chance to load balance. Else we may 9456 * give the idle CPUs a chance to load balance. Else we may
9401 * load balance only within the local sched_domain hierarchy 9457 * load balance only within the local sched_domain hierarchy
9402 * and abort nohz_idle_balance altogether if we pull some load. 9458 * and abort nohz_idle_balance altogether if we pull some load.
9403 */ 9459 */
@@ -9440,7 +9496,12 @@ static void rq_offline_fair(struct rq *rq)
9440#endif /* CONFIG_SMP */ 9496#endif /* CONFIG_SMP */
9441 9497
9442/* 9498/*
9443 * scheduler tick hitting a task of our scheduling class: 9499 * scheduler tick hitting a task of our scheduling class.
9500 *
9501 * NOTE: This function can be called remotely by the tick offload that
9502 * goes along full dynticks. Therefore no local assumption can be made
9503 * and everything must be accessed through the @rq and @curr passed in
9504 * parameters.
9444 */ 9505 */
9445static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) 9506static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9446{ 9507{
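
The NOTE added above (and repeated for the idle and RT classes further down) is the contract that matters once the residual tick can be executed remotely: task_tick() may run on a CPU other than rq->cpu, so the handler has to take everything from the @rq and @curr arguments rather than from smp_processor_id() or per-CPU state. A standalone toy model of that calling convention (plain userspace C, not kernel code; the struct names here are invented for illustration only):

	#include <assert.h>
	#include <stdio.h>

	struct rq   { int cpu; unsigned long clock; };
	struct task { const char *comm; };

	/* Stand-in for "the CPU actually executing the handler". */
	static int executing_cpu;

	/* Correct shape: everything is derived from the arguments. */
	static void task_tick(struct rq *rq, struct task *curr)
	{
		rq->clock++;
		printf("tick for CPU%d (%s), executed on CPU%d\n",
		       rq->cpu, curr->comm, executing_cpu);
	}

	int main(void)
	{
		struct rq rq3 = { .cpu = 3 };		/* a tickless CPU */
		struct task loop = { .comm = "loop" };

		executing_cpu = 0;			/* a housekeeping CPU does the work */
		task_tick(&rq3, &loop);			/* rq->cpu != executing CPU, and that is fine */
		assert(rq3.clock == 1);
		return 0;
	}

The sketch only illustrates why the handlers below are written against @rq/@curr; the real remote invocation path is in the tick offload machinery, not shown here.
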
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 7dae9eb8c042..2975f195e1c4 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -1,23 +1,14 @@
1/* 1/*
2 * Generic entry point for the idle threads 2 * Generic entry points for the idle threads and
3 * implementation of the idle task scheduling class.
4 *
5 * (NOTE: these are not related to SCHED_IDLE batch scheduled
6 * tasks which are handled in sched/fair.c )
3 */ 7 */
4#include <linux/sched.h> 8#include "sched.h"
5#include <linux/sched/idle.h>
6#include <linux/cpu.h>
7#include <linux/cpuidle.h>
8#include <linux/cpuhotplug.h>
9#include <linux/tick.h>
10#include <linux/mm.h>
11#include <linux/stackprotector.h>
12#include <linux/suspend.h>
13#include <linux/livepatch.h>
14
15#include <asm/tlb.h>
16 9
17#include <trace/events/power.h> 10#include <trace/events/power.h>
18 11
19#include "sched.h"
20
21/* Linker adds these: start and end of __cpuidle functions */ 12/* Linker adds these: start and end of __cpuidle functions */
22extern char __cpuidle_text_start[], __cpuidle_text_end[]; 13extern char __cpuidle_text_start[], __cpuidle_text_end[];
23 14
@@ -46,6 +37,7 @@ void cpu_idle_poll_ctrl(bool enable)
46static int __init cpu_idle_poll_setup(char *__unused) 37static int __init cpu_idle_poll_setup(char *__unused)
47{ 38{
48 cpu_idle_force_poll = 1; 39 cpu_idle_force_poll = 1;
40
49 return 1; 41 return 1;
50} 42}
51__setup("nohlt", cpu_idle_poll_setup); 43__setup("nohlt", cpu_idle_poll_setup);
@@ -53,6 +45,7 @@ __setup("nohlt", cpu_idle_poll_setup);
53static int __init cpu_idle_nopoll_setup(char *__unused) 45static int __init cpu_idle_nopoll_setup(char *__unused)
54{ 46{
55 cpu_idle_force_poll = 0; 47 cpu_idle_force_poll = 0;
48
56 return 1; 49 return 1;
57} 50}
58__setup("hlt", cpu_idle_nopoll_setup); 51__setup("hlt", cpu_idle_nopoll_setup);
@@ -64,12 +57,14 @@ static noinline int __cpuidle cpu_idle_poll(void)
64 trace_cpu_idle_rcuidle(0, smp_processor_id()); 57 trace_cpu_idle_rcuidle(0, smp_processor_id());
65 local_irq_enable(); 58 local_irq_enable();
66 stop_critical_timings(); 59 stop_critical_timings();
60
67 while (!tif_need_resched() && 61 while (!tif_need_resched() &&
68 (cpu_idle_force_poll || tick_check_broadcast_expired())) 62 (cpu_idle_force_poll || tick_check_broadcast_expired()))
69 cpu_relax(); 63 cpu_relax();
70 start_critical_timings(); 64 start_critical_timings();
71 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); 65 trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
72 rcu_idle_exit(); 66 rcu_idle_exit();
67
73 return 1; 68 return 1;
74} 69}
75 70
@@ -332,8 +327,8 @@ void cpu_startup_entry(enum cpuhp_state state)
332{ 327{
333 /* 328 /*
334 * This #ifdef needs to die, but it's too late in the cycle to 329 * This #ifdef needs to die, but it's too late in the cycle to
335 * make this generic (arm and sh have never invoked the canary 330 * make this generic (ARM and SH have never invoked the canary
336 * init for the non boot cpus!). Will be fixed in 3.11 331 * init for the non boot CPUs!). Will be fixed in 3.11
337 */ 332 */
338#ifdef CONFIG_X86 333#ifdef CONFIG_X86
339 /* 334 /*
@@ -350,3 +345,116 @@ void cpu_startup_entry(enum cpuhp_state state)
350 while (1) 345 while (1)
351 do_idle(); 346 do_idle();
352} 347}
348
349/*
350 * idle-task scheduling class.
351 */
352
353#ifdef CONFIG_SMP
354static int
355select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
356{
 357 return task_cpu(p); /* IDLE tasks are never migrated */
358}
359#endif
360
361/*
362 * Idle tasks are unconditionally rescheduled:
363 */
364static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
365{
366 resched_curr(rq);
367}
368
369static struct task_struct *
370pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
371{
372 put_prev_task(rq, prev);
373 update_idle_core(rq);
374 schedstat_inc(rq->sched_goidle);
375
376 return rq->idle;
377}
378
379/*
380 * It is not legal to sleep in the idle task - print a warning
381 * message if some code attempts to do it:
382 */
383static void
384dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
385{
386 raw_spin_unlock_irq(&rq->lock);
387 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
388 dump_stack();
389 raw_spin_lock_irq(&rq->lock);
390}
391
392static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
393{
394}
395
396/*
397 * scheduler tick hitting a task of our scheduling class.
398 *
399 * NOTE: This function can be called remotely by the tick offload that
400 * goes along full dynticks. Therefore no local assumption can be made
401 * and everything must be accessed through the @rq and @curr passed in
402 * parameters.
403 */
404static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
405{
406}
407
408static void set_curr_task_idle(struct rq *rq)
409{
410}
411
412static void switched_to_idle(struct rq *rq, struct task_struct *p)
413{
414 BUG();
415}
416
417static void
418prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
419{
420 BUG();
421}
422
423static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
424{
425 return 0;
426}
427
428static void update_curr_idle(struct rq *rq)
429{
430}
431
432/*
433 * Simple, special scheduling class for the per-CPU idle tasks:
434 */
435const struct sched_class idle_sched_class = {
436 /* .next is NULL */
437 /* no enqueue/yield_task for idle tasks */
438
439 /* dequeue is not valid, we print a debug message there: */
440 .dequeue_task = dequeue_task_idle,
441
442 .check_preempt_curr = check_preempt_curr_idle,
443
444 .pick_next_task = pick_next_task_idle,
445 .put_prev_task = put_prev_task_idle,
446
447#ifdef CONFIG_SMP
448 .select_task_rq = select_task_rq_idle,
449 .set_cpus_allowed = set_cpus_allowed_common,
450#endif
451
452 .set_curr_task = set_curr_task_idle,
453 .task_tick = task_tick_idle,
454
455 .get_rr_interval = get_rr_interval_idle,
456
457 .prio_changed = prio_changed_idle,
458 .switched_to = switched_to_idle,
459 .update_curr = update_curr_idle,
460};
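
The rewritten header comment draws the line between this class, which only ever runs the per-CPU idle (swapper) threads, and the SCHED_IDLE policy that ordinary tasks can request and that fair.c services. A small userspace illustration of the latter, assuming glibc exposing SCHED_IDLE under _GNU_SOURCE (the priority must be 0 for this policy):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 0 };

		if (sched_setscheduler(0, SCHED_IDLE, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		printf("now running under policy %d (SCHED_IDLE)\n",
		       sched_getscheduler(0));
		return 0;
	}

Nothing outside the kernel can ever be placed into idle_sched_class itself; it exists purely for the swapper threads picked when a runqueue is empty.
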
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
deleted file mode 100644
index d518664cce4f..000000000000
--- a/kernel/sched/idle_task.c
+++ /dev/null
@@ -1,110 +0,0 @@
1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/*
5 * idle-task scheduling class.
6 *
7 * (NOTE: these are not related to SCHED_IDLE tasks which are
8 * handled in sched/fair.c)
9 */
10
11#ifdef CONFIG_SMP
12static int
13select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
14{
15 return task_cpu(p); /* IDLE tasks as never migrated */
16}
17#endif /* CONFIG_SMP */
18
19/*
20 * Idle tasks are unconditionally rescheduled:
21 */
22static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
23{
24 resched_curr(rq);
25}
26
27static struct task_struct *
28pick_next_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
29{
30 put_prev_task(rq, prev);
31 update_idle_core(rq);
32 schedstat_inc(rq->sched_goidle);
33 return rq->idle;
34}
35
36/*
37 * It is not legal to sleep in the idle task - print a warning
38 * message if some code attempts to do it:
39 */
40static void
41dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
42{
43 raw_spin_unlock_irq(&rq->lock);
44 printk(KERN_ERR "bad: scheduling from the idle thread!\n");
45 dump_stack();
46 raw_spin_lock_irq(&rq->lock);
47}
48
49static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
50{
51 rq_last_tick_reset(rq);
52}
53
54static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
55{
56}
57
58static void set_curr_task_idle(struct rq *rq)
59{
60}
61
62static void switched_to_idle(struct rq *rq, struct task_struct *p)
63{
64 BUG();
65}
66
67static void
68prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
69{
70 BUG();
71}
72
73static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
74{
75 return 0;
76}
77
78static void update_curr_idle(struct rq *rq)
79{
80}
81
82/*
83 * Simple, special scheduling class for the per-CPU idle tasks:
84 */
85const struct sched_class idle_sched_class = {
86 /* .next is NULL */
87 /* no enqueue/yield_task for idle tasks */
88
89 /* dequeue is not valid, we print a debug message there: */
90 .dequeue_task = dequeue_task_idle,
91
92 .check_preempt_curr = check_preempt_curr_idle,
93
94 .pick_next_task = pick_next_task_idle,
95 .put_prev_task = put_prev_task_idle,
96
97#ifdef CONFIG_SMP
98 .select_task_rq = select_task_rq_idle,
99 .set_cpus_allowed = set_cpus_allowed_common,
100#endif
101
102 .set_curr_task = set_curr_task_idle,
103 .task_tick = task_tick_idle,
104
105 .get_rr_interval = get_rr_interval_idle,
106
107 .prio_changed = prio_changed_idle,
108 .switched_to = switched_to_idle,
109 .update_curr = update_curr_idle,
110};
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index b71b436f59f2..e6802181900f 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c
@@ -3,15 +3,10 @@
3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work. 3 * any CPU: unbound workqueues, timers, kthreads and any offloadable work.
4 * 4 *
5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker 5 * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker
6 * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker
6 * 7 *
7 */ 8 */
8 9#include "sched.h"
9#include <linux/sched/isolation.h>
10#include <linux/tick.h>
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/static_key.h>
14#include <linux/ctype.h>
15 10
16DEFINE_STATIC_KEY_FALSE(housekeeping_overriden); 11DEFINE_STATIC_KEY_FALSE(housekeeping_overriden);
17EXPORT_SYMBOL_GPL(housekeeping_overriden); 12EXPORT_SYMBOL_GPL(housekeeping_overriden);
@@ -60,6 +55,9 @@ void __init housekeeping_init(void)
60 55
61 static_branch_enable(&housekeeping_overriden); 56 static_branch_enable(&housekeeping_overriden);
62 57
58 if (housekeeping_flags & HK_FLAG_TICK)
59 sched_tick_offload_init();
60
63 /* We need at least one CPU to handle housekeeping work */ 61 /* We need at least one CPU to handle housekeeping work */
64 WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); 62 WARN_ON_ONCE(cpumask_empty(housekeeping_mask));
65} 63}
@@ -119,7 +117,7 @@ static int __init housekeeping_nohz_full_setup(char *str)
119{ 117{
120 unsigned int flags; 118 unsigned int flags;
121 119
122 flags = HK_FLAG_TICK | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC; 120 flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | HK_FLAG_MISC;
123 121
124 return housekeeping_setup(str, flags); 122 return housekeeping_setup(str, flags);
125} 123}
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 89a989e4d758..a171c1258109 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -6,10 +6,6 @@
6 * figure. Its a silly number but people think its important. We go through 6 * figure. Its a silly number but people think its important. We go through
7 * great pains to make it work on big machines and tickless kernels. 7 * great pains to make it work on big machines and tickless kernels.
8 */ 8 */
9
10#include <linux/export.h>
11#include <linux/sched/loadavg.h>
12
13#include "sched.h" 9#include "sched.h"
14 10
15/* 11/*
@@ -32,29 +28,29 @@
32 * Due to a number of reasons the above turns in the mess below: 28 * Due to a number of reasons the above turns in the mess below:
33 * 29 *
34 * - for_each_possible_cpu() is prohibitively expensive on machines with 30 * - for_each_possible_cpu() is prohibitively expensive on machines with
35 * serious number of cpus, therefore we need to take a distributed approach 31 * serious number of CPUs, therefore we need to take a distributed approach
36 * to calculating nr_active. 32 * to calculating nr_active.
37 * 33 *
38 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0 34 * \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
39 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) } 35 * = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
40 * 36 *
41 * So assuming nr_active := 0 when we start out -- true per definition, we 37 * So assuming nr_active := 0 when we start out -- true per definition, we
42 * can simply take per-cpu deltas and fold those into a global accumulate 38 * can simply take per-CPU deltas and fold those into a global accumulate
43 * to obtain the same result. See calc_load_fold_active(). 39 * to obtain the same result. See calc_load_fold_active().
44 * 40 *
45 * Furthermore, in order to avoid synchronizing all per-cpu delta folding 41 * Furthermore, in order to avoid synchronizing all per-CPU delta folding
46 * across the machine, we assume 10 ticks is sufficient time for every 42 * across the machine, we assume 10 ticks is sufficient time for every
47 * cpu to have completed this task. 43 * CPU to have completed this task.
48 * 44 *
49 * This places an upper-bound on the IRQ-off latency of the machine. Then 45 * This places an upper-bound on the IRQ-off latency of the machine. Then
50 * again, being late doesn't loose the delta, just wrecks the sample. 46 * again, being late doesn't loose the delta, just wrecks the sample.
51 * 47 *
52 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-cpu because 48 * - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
53 * this would add another cross-cpu cacheline miss and atomic operation 49 * this would add another cross-CPU cacheline miss and atomic operation
54 * to the wakeup path. Instead we increment on whatever cpu the task ran 50 * to the wakeup path. Instead we increment on whatever CPU the task ran
55 * when it went into uninterruptible state and decrement on whatever cpu 51 * when it went into uninterruptible state and decrement on whatever CPU
56 * did the wakeup. This means that only the sum of nr_uninterruptible over 52 * did the wakeup. This means that only the sum of nr_uninterruptible over
57 * all cpus yields the correct result. 53 * all CPUs yields the correct result.
58 * 54 *
59 * This covers the NO_HZ=n code, for extra head-aches, see the comment below. 55 * This covers the NO_HZ=n code, for extra head-aches, see the comment below.
60 */ 56 */
@@ -115,11 +111,11 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
115 * Handle NO_HZ for the global load-average. 111 * Handle NO_HZ for the global load-average.
116 * 112 *
117 * Since the above described distributed algorithm to compute the global 113 * Since the above described distributed algorithm to compute the global
118 * load-average relies on per-cpu sampling from the tick, it is affected by 114 * load-average relies on per-CPU sampling from the tick, it is affected by
119 * NO_HZ. 115 * NO_HZ.
120 * 116 *
121 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon 117 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
122 * entering NO_HZ state such that we can include this as an 'extra' cpu delta 118 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
123 * when we read the global state. 119 * when we read the global state.
124 * 120 *
125 * Obviously reality has to ruin such a delightfully simple scheme: 121 * Obviously reality has to ruin such a delightfully simple scheme:
@@ -146,9 +142,9 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
146 * busy state. 142 * busy state.
147 * 143 *
148 * This is solved by pushing the window forward, and thus skipping the 144 * This is solved by pushing the window forward, and thus skipping the
149 * sample, for this cpu (effectively using the NO_HZ-delta for this cpu which 145 * sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
150 * was in effect at the time the window opened). This also solves the issue 146 * was in effect at the time the window opened). This also solves the issue
151 * of having to deal with a cpu having been in NO_HZ for multiple LOAD_FREQ 147 * of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
152 * intervals. 148 * intervals.
153 * 149 *
154 * When making the ILB scale, we should try to pull this in as well. 150 * When making the ILB scale, we should try to pull this in as well.
@@ -299,7 +295,7 @@ calc_load_n(unsigned long load, unsigned long exp,
299} 295}
300 296
301/* 297/*
302 * NO_HZ can leave us missing all per-cpu ticks calling 298 * NO_HZ can leave us missing all per-CPU ticks calling
303 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into 299 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
304 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold 300 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
305 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary. 301 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
@@ -363,7 +359,7 @@ void calc_global_load(unsigned long ticks)
363 return; 359 return;
364 360
365 /* 361 /*
366 * Fold the 'old' NO_HZ-delta to include all NO_HZ cpus. 362 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
367 */ 363 */
368 delta = calc_load_nohz_fold(); 364 delta = calc_load_nohz_fold();
369 if (delta) 365 if (delta)
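
The comments above describe two stages: folding per-CPU nr_active deltas into one global count, and feeding that count into fixed-point exponential averages. The averaging stage can be sketched in isolation; the constants below are assumed to match include/linux/sched/loadavg.h (FSHIFT of 11 and the 1884/2014/2037 decay factors for the 1/5/15-minute averages sampled every 5 seconds):

	#include <stdio.h>

	#define FSHIFT	11
	#define FIXED_1	(1 << FSHIFT)
	#define EXP_1	1884	/* assumed: 1/exp(5s/1min) in fixed point */
	#define EXP_5	2014
	#define EXP_15	2037

	static unsigned long calc_load(unsigned long load, unsigned long exp,
				       unsigned long active)
	{
		unsigned long newload = load * exp + active * (FIXED_1 - exp);

		if (active >= load)
			newload += FIXED_1 - 1;	/* round up while load is rising */
		return newload / FIXED_1;
	}

	int main(void)
	{
		unsigned long avg1 = 0, n;

		/* Pretend 3 tasks stayed runnable for two minutes (24 samples of 5s). */
		for (n = 0; n < 24; n++)
			avg1 = calc_load(avg1, EXP_1, 3 * FIXED_1);

		printf("1-min load after 2 minutes at nr_active=3: %lu.%02lu\n",
		       avg1 >> FSHIFT, ((avg1 & (FIXED_1 - 1)) * 100) >> FSHIFT);
		return 0;
	}

This is only the arithmetic; the distributed folding and the NO_HZ window handling discussed above are what the rest of the file is about.
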
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 5d0762633639..76e0eaf4654e 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -13,32 +13,25 @@
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 */ 15 */
16 16#include "sched.h"
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21#include <linux/atomic.h>
22
23#include "sched.h" /* for cpu_rq(). */
24 17
25/* 18/*
26 * Bitmask made from a "or" of all commands within enum membarrier_cmd, 19 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
27 * except MEMBARRIER_CMD_QUERY. 20 * except MEMBARRIER_CMD_QUERY.
28 */ 21 */
29#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE 22#ifdef CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE
30#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \ 23#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK \
31 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \ 24 (MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE \
32 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE) 25 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED_SYNC_CORE)
33#else 26#else
34#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0 27#define MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK 0
35#endif 28#endif
36 29
37#define MEMBARRIER_CMD_BITMASK \ 30#define MEMBARRIER_CMD_BITMASK \
38 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \ 31 (MEMBARRIER_CMD_GLOBAL | MEMBARRIER_CMD_GLOBAL_EXPEDITED \
39 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \ 32 | MEMBARRIER_CMD_REGISTER_GLOBAL_EXPEDITED \
40 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \ 33 | MEMBARRIER_CMD_PRIVATE_EXPEDITED \
41 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \ 34 | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
42 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK) 35 | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
43 36
44static void ipi_mb(void *info) 37static void ipi_mb(void *info)
@@ -85,6 +78,7 @@ static int membarrier_global_expedited(void)
85 */ 78 */
86 if (cpu == raw_smp_processor_id()) 79 if (cpu == raw_smp_processor_id())
87 continue; 80 continue;
81
88 rcu_read_lock(); 82 rcu_read_lock();
89 p = task_rcu_dereference(&cpu_rq(cpu)->curr); 83 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
90 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & 84 if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
@@ -188,6 +182,7 @@ static int membarrier_private_expedited(int flags)
188 * rq->curr modification in scheduler. 182 * rq->curr modification in scheduler.
189 */ 183 */
190 smp_mb(); /* exit from system call is not a mb */ 184 smp_mb(); /* exit from system call is not a mb */
185
191 return 0; 186 return 0;
192} 187}
193 188
@@ -219,6 +214,7 @@ static int membarrier_register_global_expedited(void)
219 } 214 }
220 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, 215 atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
221 &mm->membarrier_state); 216 &mm->membarrier_state);
217
222 return 0; 218 return 0;
223} 219}
224 220
@@ -253,6 +249,7 @@ static int membarrier_register_private_expedited(int flags)
253 synchronize_sched(); 249 synchronize_sched();
254 } 250 }
255 atomic_or(state, &mm->membarrier_state); 251 atomic_or(state, &mm->membarrier_state);
252
256 return 0; 253 return 0;
257} 254}
258 255
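
Beyond the include consolidation, the command bitmask shown above is the set a caller can pass to membarrier(2). A hedged usage sketch of the private-expedited pair, which must be registered before it can be issued; there is no glibc wrapper, so the raw syscall is used, and the command constants are assumed to come from reasonably recent kernel headers:

	#define _GNU_SOURCE
	#include <linux/membarrier.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		if (membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
			perror("register");
			return 1;
		}
		/* Acts as a full barrier on every CPU currently running this process. */
		if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
			perror("membarrier");
			return 1;
		}
		puts("expedited membarrier completed");
		return 0;
	}
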
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index aad49451584e..4f4fd3b157f1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -3,12 +3,8 @@
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR 3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies) 4 * policies)
5 */ 5 */
6
7#include "sched.h" 6#include "sched.h"
8 7
9#include <linux/slab.h>
10#include <linux/irq_work.h>
11
12int sched_rr_timeslice = RR_TIMESLICE; 8int sched_rr_timeslice = RR_TIMESLICE;
13int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; 9int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
14 10
@@ -359,7 +355,7 @@ static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
359static void push_rt_tasks(struct rq *); 355static void push_rt_tasks(struct rq *);
360static void pull_rt_task(struct rq *); 356static void pull_rt_task(struct rq *);
361 357
362static inline void queue_push_tasks(struct rq *rq) 358static inline void rt_queue_push_tasks(struct rq *rq)
363{ 359{
364 if (!has_pushable_tasks(rq)) 360 if (!has_pushable_tasks(rq))
365 return; 361 return;
@@ -367,7 +363,7 @@ static inline void queue_push_tasks(struct rq *rq)
367 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks); 363 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
368} 364}
369 365
370static inline void queue_pull_task(struct rq *rq) 366static inline void rt_queue_pull_task(struct rq *rq)
371{ 367{
372 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task); 368 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
373} 369}
@@ -425,7 +421,7 @@ static inline void pull_rt_task(struct rq *this_rq)
425{ 421{
426} 422}
427 423
428static inline void queue_push_tasks(struct rq *rq) 424static inline void rt_queue_push_tasks(struct rq *rq)
429{ 425{
430} 426}
431#endif /* CONFIG_SMP */ 427#endif /* CONFIG_SMP */
@@ -1453,9 +1449,9 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1453 return; 1449 return;
1454 1450
1455 /* 1451 /*
1456 * There appears to be other cpus that can accept 1452 * There appear to be other CPUs that can accept
1457 * current and none to run 'p', so lets reschedule 1453 * the current task but none can run 'p', so lets reschedule
1458 * to try and push current away: 1454 * to try and push the current task away:
1459 */ 1455 */
1460 requeue_task_rt(rq, p, 1); 1456 requeue_task_rt(rq, p, 1);
1461 resched_curr(rq); 1457 resched_curr(rq);
@@ -1569,7 +1565,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1569 /* The running task is never eligible for pushing */ 1565 /* The running task is never eligible for pushing */
1570 dequeue_pushable_task(rq, p); 1566 dequeue_pushable_task(rq, p);
1571 1567
1572 queue_push_tasks(rq); 1568 rt_queue_push_tasks(rq);
1573 1569
1574 return p; 1570 return p;
1575} 1571}
@@ -1596,12 +1592,13 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1596 if (!task_running(rq, p) && 1592 if (!task_running(rq, p) &&
1597 cpumask_test_cpu(cpu, &p->cpus_allowed)) 1593 cpumask_test_cpu(cpu, &p->cpus_allowed))
1598 return 1; 1594 return 1;
1595
1599 return 0; 1596 return 0;
1600} 1597}
1601 1598
1602/* 1599/*
1603 * Return the highest pushable rq's task, which is suitable to be executed 1600 * Return the highest pushable rq's task, which is suitable to be executed
1604 * on the cpu, NULL otherwise 1601 * on the CPU, NULL otherwise
1605 */ 1602 */
1606static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu) 1603static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1607{ 1604{
@@ -1639,11 +1636,11 @@ static int find_lowest_rq(struct task_struct *task)
1639 return -1; /* No targets found */ 1636 return -1; /* No targets found */
1640 1637
1641 /* 1638 /*
1642 * At this point we have built a mask of cpus representing the 1639 * At this point we have built a mask of CPUs representing the
1643 * lowest priority tasks in the system. Now we want to elect 1640 * lowest priority tasks in the system. Now we want to elect
1644 * the best one based on our affinity and topology. 1641 * the best one based on our affinity and topology.
1645 * 1642 *
1646 * We prioritize the last cpu that the task executed on since 1643 * We prioritize the last CPU that the task executed on since
1647 * it is most likely cache-hot in that location. 1644 * it is most likely cache-hot in that location.
1648 */ 1645 */
1649 if (cpumask_test_cpu(cpu, lowest_mask)) 1646 if (cpumask_test_cpu(cpu, lowest_mask))
@@ -1651,7 +1648,7 @@ static int find_lowest_rq(struct task_struct *task)
1651 1648
1652 /* 1649 /*
1653 * Otherwise, we consult the sched_domains span maps to figure 1650 * Otherwise, we consult the sched_domains span maps to figure
1654 * out which cpu is logically closest to our hot cache data. 1651 * out which CPU is logically closest to our hot cache data.
1655 */ 1652 */
1656 if (!cpumask_test_cpu(this_cpu, lowest_mask)) 1653 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1657 this_cpu = -1; /* Skip this_cpu opt if not among lowest */ 1654 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
@@ -1692,6 +1689,7 @@ static int find_lowest_rq(struct task_struct *task)
1692 cpu = cpumask_any(lowest_mask); 1689 cpu = cpumask_any(lowest_mask);
1693 if (cpu < nr_cpu_ids) 1690 if (cpu < nr_cpu_ids)
1694 return cpu; 1691 return cpu;
1692
1695 return -1; 1693 return -1;
1696} 1694}
1697 1695
@@ -1827,7 +1825,7 @@ retry:
1827 * The task hasn't migrated, and is still the next 1825 * The task hasn't migrated, and is still the next
1828 * eligible task, but we failed to find a run-queue 1826 * eligible task, but we failed to find a run-queue
1829 * to push it to. Do not retry in this case, since 1827 * to push it to. Do not retry in this case, since
1830 * other cpus will pull from us when ready. 1828 * other CPUs will pull from us when ready.
1831 */ 1829 */
1832 goto out; 1830 goto out;
1833 } 1831 }
@@ -1919,7 +1917,7 @@ static int rto_next_cpu(struct root_domain *rd)
1919 * rt_next_cpu() will simply return the first CPU found in 1917 * rt_next_cpu() will simply return the first CPU found in
1920 * the rto_mask. 1918 * the rto_mask.
1921 * 1919 *
1922 * If rto_next_cpu() is called with rto_cpu is a valid cpu, it 1920 * If rto_next_cpu() is called with rto_cpu is a valid CPU, it
1923 * will return the next CPU found in the rto_mask. 1921 * will return the next CPU found in the rto_mask.
1924 * 1922 *
1925 * If there are no more CPUs left in the rto_mask, then a check is made 1923 * If there are no more CPUs left in the rto_mask, then a check is made
@@ -1980,7 +1978,7 @@ static void tell_cpu_to_push(struct rq *rq)
1980 raw_spin_lock(&rq->rd->rto_lock); 1978 raw_spin_lock(&rq->rd->rto_lock);
1981 1979
1982 /* 1980 /*
1983 * The rto_cpu is updated under the lock, if it has a valid cpu 1981 * The rto_cpu is updated under the lock, if it has a valid CPU
1984 * then the IPI is still running and will continue due to the 1982 * then the IPI is still running and will continue due to the
1985 * update to loop_next, and nothing needs to be done here. 1983 * update to loop_next, and nothing needs to be done here.
1986 * Otherwise it is finishing up and an ipi needs to be sent. 1984 * Otherwise it is finishing up and an ipi needs to be sent.
@@ -2105,7 +2103,7 @@ static void pull_rt_task(struct rq *this_rq)
2105 2103
2106 /* 2104 /*
2107 * There's a chance that p is higher in priority 2105 * There's a chance that p is higher in priority
2108 * than what's currently running on its cpu. 2106 * than what's currently running on its CPU.
2109 * This is just that p is wakeing up and hasn't 2107 * This is just that p is wakeing up and hasn't
2110 * had a chance to schedule. We only pull 2108 * had a chance to schedule. We only pull
2111 * p if it is lower in priority than the 2109 * p if it is lower in priority than the
@@ -2187,7 +2185,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p)
2187 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running) 2185 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2188 return; 2186 return;
2189 2187
2190 queue_pull_task(rq); 2188 rt_queue_pull_task(rq);
2191} 2189}
2192 2190
2193void __init init_sched_rt_class(void) 2191void __init init_sched_rt_class(void)
@@ -2218,7 +2216,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
2218 if (task_on_rq_queued(p) && rq->curr != p) { 2216 if (task_on_rq_queued(p) && rq->curr != p) {
2219#ifdef CONFIG_SMP 2217#ifdef CONFIG_SMP
2220 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded) 2218 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2221 queue_push_tasks(rq); 2219 rt_queue_push_tasks(rq);
2222#endif /* CONFIG_SMP */ 2220#endif /* CONFIG_SMP */
2223 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq))) 2221 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2224 resched_curr(rq); 2222 resched_curr(rq);
@@ -2242,7 +2240,7 @@ prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2242 * may need to pull tasks to this runqueue. 2240 * may need to pull tasks to this runqueue.
2243 */ 2241 */
2244 if (oldprio < p->prio) 2242 if (oldprio < p->prio)
2245 queue_pull_task(rq); 2243 rt_queue_pull_task(rq);
2246 2244
2247 /* 2245 /*
2248 * If there's a higher priority task waiting to run 2246 * If there's a higher priority task waiting to run
@@ -2292,6 +2290,14 @@ static void watchdog(struct rq *rq, struct task_struct *p)
2292static inline void watchdog(struct rq *rq, struct task_struct *p) { } 2290static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2293#endif 2291#endif
2294 2292
2293/*
2294 * scheduler tick hitting a task of our scheduling class.
2295 *
2296 * NOTE: This function can be called remotely by the tick offload that
2297 * goes along full dynticks. Therefore no local assumption can be made
2298 * and everything must be accessed through the @rq and @curr passed in
2299 * parameters.
2300 */
2295static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) 2301static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2296{ 2302{
2297 struct sched_rt_entity *rt_se = &p->rt; 2303 struct sched_rt_entity *rt_se = &p->rt;
@@ -2685,6 +2691,7 @@ int sched_rr_handler(struct ctl_table *table, int write,
2685 msecs_to_jiffies(sysctl_sched_rr_timeslice); 2691 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2686 } 2692 }
2687 mutex_unlock(&mutex); 2693 mutex_unlock(&mutex);
2694
2688 return ret; 2695 return ret;
2689} 2696}
2690 2697
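
sched_rr_timeslice / sysctl_sched_rr_timeslice near the top of this file control the SCHED_RR quantum (exposed, assuming the usual sysctl name, as kernel.sched_rr_timeslice_ms). From userspace the effective value can be read back with sched_rr_get_interval(); a short sketch, which needs CAP_SYS_NICE to switch to SCHED_RR:

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>
	#include <time.h>

	int main(void)
	{
		struct sched_param sp = { .sched_priority = 10 };
		struct timespec ts;

		if (sched_setscheduler(0, SCHED_RR, &sp) == -1) {
			perror("sched_setscheduler");
			return 1;
		}
		if (sched_rr_get_interval(0, &ts) == -1) {
			perror("sched_rr_get_interval");
			return 1;
		}
		printf("RR timeslice: %ld.%09ld s\n", (long)ts.tv_sec, ts.tv_nsec);
		return 0;
	}
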
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index fb5fc458547f..23ba4dd76ac4 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1,39 +1,73 @@
1/* SPDX-License-Identifier: GPL-2.0 */ 1/* SPDX-License-Identifier: GPL-2.0 */
2 2/*
3 * Scheduler internal types and methods:
4 */
3#include <linux/sched.h> 5#include <linux/sched.h>
6
4#include <linux/sched/autogroup.h> 7#include <linux/sched/autogroup.h>
5#include <linux/sched/sysctl.h>
6#include <linux/sched/topology.h>
7#include <linux/sched/rt.h>
8#include <linux/sched/deadline.h>
9#include <linux/sched/clock.h> 8#include <linux/sched/clock.h>
10#include <linux/sched/wake_q.h> 9#include <linux/sched/coredump.h>
11#include <linux/sched/signal.h>
12#include <linux/sched/numa_balancing.h>
13#include <linux/sched/mm.h>
14#include <linux/sched/cpufreq.h> 10#include <linux/sched/cpufreq.h>
15#include <linux/sched/stat.h> 11#include <linux/sched/cputime.h>
16#include <linux/sched/nohz.h> 12#include <linux/sched/deadline.h>
17#include <linux/sched/debug.h> 13#include <linux/sched/debug.h>
18#include <linux/sched/hotplug.h> 14#include <linux/sched/hotplug.h>
15#include <linux/sched/idle.h>
16#include <linux/sched/init.h>
17#include <linux/sched/isolation.h>
18#include <linux/sched/jobctl.h>
19#include <linux/sched/loadavg.h>
20#include <linux/sched/mm.h>
21#include <linux/sched/nohz.h>
22#include <linux/sched/numa_balancing.h>
23#include <linux/sched/prio.h>
24#include <linux/sched/rt.h>
25#include <linux/sched/signal.h>
26#include <linux/sched/stat.h>
27#include <linux/sched/sysctl.h>
19#include <linux/sched/task.h> 28#include <linux/sched/task.h>
20#include <linux/sched/task_stack.h> 29#include <linux/sched/task_stack.h>
21#include <linux/sched/cputime.h> 30#include <linux/sched/topology.h>
22#include <linux/sched/init.h> 31#include <linux/sched/user.h>
32#include <linux/sched/wake_q.h>
33#include <linux/sched/xacct.h>
34
35#include <uapi/linux/sched/types.h>
23 36
24#include <linux/u64_stats_sync.h>
25#include <linux/kernel_stat.h>
26#include <linux/binfmts.h> 37#include <linux/binfmts.h>
27#include <linux/mutex.h> 38#include <linux/blkdev.h>
28#include <linux/spinlock.h> 39#include <linux/compat.h>
40#include <linux/context_tracking.h>
41#include <linux/cpufreq.h>
42#include <linux/cpuidle.h>
43#include <linux/cpuset.h>
44#include <linux/ctype.h>
45#include <linux/debugfs.h>
46#include <linux/delayacct.h>
47#include <linux/init_task.h>
48#include <linux/kprobes.h>
49#include <linux/kthread.h>
50#include <linux/membarrier.h>
51#include <linux/migrate.h>
52#include <linux/mmu_context.h>
53#include <linux/nmi.h>
54#include <linux/proc_fs.h>
55#include <linux/prefetch.h>
56#include <linux/profile.h>
57#include <linux/rcupdate_wait.h>
58#include <linux/security.h>
59#include <linux/stackprotector.h>
29#include <linux/stop_machine.h> 60#include <linux/stop_machine.h>
30#include <linux/irq_work.h> 61#include <linux/suspend.h>
31#include <linux/tick.h> 62#include <linux/swait.h>
32#include <linux/slab.h> 63#include <linux/syscalls.h>
33#include <linux/cgroup.h> 64#include <linux/task_work.h>
65#include <linux/tsacct_kern.h>
66
67#include <asm/tlb.h>
34 68
35#ifdef CONFIG_PARAVIRT 69#ifdef CONFIG_PARAVIRT
36#include <asm/paravirt.h> 70# include <asm/paravirt.h>
37#endif 71#endif
38 72
39#include "cpupri.h" 73#include "cpupri.h"
@@ -79,11 +113,11 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
79 * and does not change the user-interface for setting shares/weights. 113 * and does not change the user-interface for setting shares/weights.
80 * 114 *
81 * We increase resolution only if we have enough bits to allow this increased 115 * We increase resolution only if we have enough bits to allow this increased
82 * resolution (i.e. 64bit). The costs for increasing resolution when 32bit are 116 * resolution (i.e. 64-bit). The costs for increasing resolution when 32-bit
83 * pretty high and the returns do not justify the increased costs. 117 * are pretty high and the returns do not justify the increased costs.
84 * 118 *
85 * Really only required when CONFIG_FAIR_GROUP_SCHED is also set, but to 119 * Really only required when CONFIG_FAIR_GROUP_SCHED=y is also set, but to
86 * increase coverage and consistency always enable it on 64bit platforms. 120 * increase coverage and consistency always enable it on 64-bit platforms.
87 */ 121 */
88#ifdef CONFIG_64BIT 122#ifdef CONFIG_64BIT
89# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT) 123# define NICE_0_LOAD_SHIFT (SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)
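
The comment above explains why task weights carry an extra fixed-point shift on 64-bit. A quick worked sketch, assuming SCHED_FIXEDPOINT_SHIFT is 10 as in include/linux/sched.h, of what that scaling does to the user-visible nice-0 weight of 1024:

	#include <stdio.h>

	#define SCHED_FIXEDPOINT_SHIFT	10
	#define NICE_0_LOAD_SHIFT	(SCHED_FIXEDPOINT_SHIFT + SCHED_FIXEDPOINT_SHIFT)	/* 64-bit case */
	#define scale_load(w)		((unsigned long)(w) << SCHED_FIXEDPOINT_SHIFT)
	#define scale_load_down(w)	((unsigned long)(w) >> SCHED_FIXEDPOINT_SHIFT)

	int main(void)
	{
		unsigned long nice0 = 1024;	/* user-visible nice-0 weight */

		printf("internal weight:   %lu (1 << %d)\n",
		       scale_load(nice0), NICE_0_LOAD_SHIFT);
		printf("back to user view: %lu\n",
		       scale_load_down(scale_load(nice0)));
		return 0;
	}
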
@@ -111,16 +145,12 @@ static inline void cpu_load_update_active(struct rq *this_rq) { }
111 * 10 -> just above 1us 145 * 10 -> just above 1us
112 * 9 -> just above 0.5us 146 * 9 -> just above 0.5us
113 */ 147 */
114#define DL_SCALE (10) 148#define DL_SCALE 10
115
116/*
117 * These are the 'tuning knobs' of the scheduler:
118 */
119 149
120/* 150/*
121 * single value that denotes runtime == period, ie unlimited time. 151 * Single value that denotes runtime == period, ie unlimited time.
122 */ 152 */
123#define RUNTIME_INF ((u64)~0ULL) 153#define RUNTIME_INF ((u64)~0ULL)
124 154
125static inline int idle_policy(int policy) 155static inline int idle_policy(int policy)
126{ 156{
@@ -235,9 +265,9 @@ void __dl_clear_params(struct task_struct *p);
235 * control. 265 * control.
236 */ 266 */
237struct dl_bandwidth { 267struct dl_bandwidth {
238 raw_spinlock_t dl_runtime_lock; 268 raw_spinlock_t dl_runtime_lock;
239 u64 dl_runtime; 269 u64 dl_runtime;
240 u64 dl_period; 270 u64 dl_period;
241}; 271};
242 272
243static inline int dl_bandwidth_enabled(void) 273static inline int dl_bandwidth_enabled(void)
@@ -246,8 +276,9 @@ static inline int dl_bandwidth_enabled(void)
246} 276}
247 277
248struct dl_bw { 278struct dl_bw {
249 raw_spinlock_t lock; 279 raw_spinlock_t lock;
250 u64 bw, total_bw; 280 u64 bw;
281 u64 total_bw;
251}; 282};
252 283
253static inline void __dl_update(struct dl_bw *dl_b, s64 bw); 284static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
@@ -273,20 +304,17 @@ bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
273 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw; 304 dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
274} 305}
275 306
276void dl_change_utilization(struct task_struct *p, u64 new_bw); 307extern void dl_change_utilization(struct task_struct *p, u64 new_bw);
277extern void init_dl_bw(struct dl_bw *dl_b); 308extern void init_dl_bw(struct dl_bw *dl_b);
278extern int sched_dl_global_validate(void); 309extern int sched_dl_global_validate(void);
279extern void sched_dl_do_global(void); 310extern void sched_dl_do_global(void);
280extern int sched_dl_overflow(struct task_struct *p, int policy, 311extern int sched_dl_overflow(struct task_struct *p, int policy, const struct sched_attr *attr);
281 const struct sched_attr *attr);
282extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr); 312extern void __setparam_dl(struct task_struct *p, const struct sched_attr *attr);
283extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr); 313extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
284extern bool __checkparam_dl(const struct sched_attr *attr); 314extern bool __checkparam_dl(const struct sched_attr *attr);
285extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); 315extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
286extern int dl_task_can_attach(struct task_struct *p, 316extern int dl_task_can_attach(struct task_struct *p, const struct cpumask *cs_cpus_allowed);
287 const struct cpumask *cs_cpus_allowed); 317extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
288extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
289 const struct cpumask *trial);
290extern bool dl_cpu_busy(unsigned int cpu); 318extern bool dl_cpu_busy(unsigned int cpu);
291 319
292#ifdef CONFIG_CGROUP_SCHED 320#ifdef CONFIG_CGROUP_SCHED
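
The dl_bw fields track admitted SCHED_DEADLINE bandwidth, and the __dl_overflow() check visible above is the admission test new reservations must pass. From userspace such a reservation is requested with sched_setattr(); a hedged sketch follows, with struct sched_attr declared locally because glibc ships no wrapper (the field layout is assumed to match the UAPI definition):

	#define _GNU_SOURCE
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	#ifndef SCHED_DEADLINE
	#define SCHED_DEADLINE	6
	#endif

	struct sched_attr {
		uint32_t size;
		uint32_t sched_policy;
		uint64_t sched_flags;
		int32_t  sched_nice;
		uint32_t sched_priority;
		uint64_t sched_runtime;
		uint64_t sched_deadline;
		uint64_t sched_period;
	};

	int main(void)
	{
		struct sched_attr attr = {
			.size		= sizeof(attr),
			.sched_policy	= SCHED_DEADLINE,
			.sched_runtime	= 10 * 1000 * 1000,	/* 10ms of every... */
			.sched_deadline	= 100 * 1000 * 1000,	/* ...100ms window */
			.sched_period	= 100 * 1000 * 1000,
		};

		if (syscall(SYS_sched_setattr, 0, &attr, 0)) {	/* needs CAP_SYS_NICE */
			perror("sched_setattr");	/* EBUSY: admission test failed */
			return 1;
		}
		puts("deadline reservation granted");
		return 0;
	}

The runtime/period pair requested here is what ends up accounted into total_bw and compared against the per-root-domain cap.
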
@@ -300,32 +328,36 @@ extern struct list_head task_groups;
300 328
301struct cfs_bandwidth { 329struct cfs_bandwidth {
302#ifdef CONFIG_CFS_BANDWIDTH 330#ifdef CONFIG_CFS_BANDWIDTH
303 raw_spinlock_t lock; 331 raw_spinlock_t lock;
304 ktime_t period; 332 ktime_t period;
305 u64 quota, runtime; 333 u64 quota;
306 s64 hierarchical_quota; 334 u64 runtime;
307 u64 runtime_expires; 335 s64 hierarchical_quota;
308 336 u64 runtime_expires;
309 int idle, period_active; 337
310 struct hrtimer period_timer, slack_timer; 338 int idle;
311 struct list_head throttled_cfs_rq; 339 int period_active;
312 340 struct hrtimer period_timer;
313 /* statistics */ 341 struct hrtimer slack_timer;
314 int nr_periods, nr_throttled; 342 struct list_head throttled_cfs_rq;
315 u64 throttled_time; 343
344 /* Statistics: */
345 int nr_periods;
346 int nr_throttled;
347 u64 throttled_time;
316#endif 348#endif
317}; 349};
318 350
319/* task group related information */ 351/* Task group related information */
320struct task_group { 352struct task_group {
321 struct cgroup_subsys_state css; 353 struct cgroup_subsys_state css;
322 354
323#ifdef CONFIG_FAIR_GROUP_SCHED 355#ifdef CONFIG_FAIR_GROUP_SCHED
324 /* schedulable entities of this group on each cpu */ 356 /* schedulable entities of this group on each CPU */
325 struct sched_entity **se; 357 struct sched_entity **se;
326 /* runqueue "owned" by this group on each cpu */ 358 /* runqueue "owned" by this group on each CPU */
327 struct cfs_rq **cfs_rq; 359 struct cfs_rq **cfs_rq;
328 unsigned long shares; 360 unsigned long shares;
329 361
330#ifdef CONFIG_SMP 362#ifdef CONFIG_SMP
331 /* 363 /*
@@ -333,29 +365,29 @@ struct task_group {
333 * it in its own cacheline separated from the fields above which 365 * it in its own cacheline separated from the fields above which
334 * will also be accessed at each tick. 366 * will also be accessed at each tick.
335 */ 367 */
336 atomic_long_t load_avg ____cacheline_aligned; 368 atomic_long_t load_avg ____cacheline_aligned;
337#endif 369#endif
338#endif 370#endif
339 371
340#ifdef CONFIG_RT_GROUP_SCHED 372#ifdef CONFIG_RT_GROUP_SCHED
341 struct sched_rt_entity **rt_se; 373 struct sched_rt_entity **rt_se;
342 struct rt_rq **rt_rq; 374 struct rt_rq **rt_rq;
343 375
344 struct rt_bandwidth rt_bandwidth; 376 struct rt_bandwidth rt_bandwidth;
345#endif 377#endif
346 378
347 struct rcu_head rcu; 379 struct rcu_head rcu;
348 struct list_head list; 380 struct list_head list;
349 381
350 struct task_group *parent; 382 struct task_group *parent;
351 struct list_head siblings; 383 struct list_head siblings;
352 struct list_head children; 384 struct list_head children;
353 385
354#ifdef CONFIG_SCHED_AUTOGROUP 386#ifdef CONFIG_SCHED_AUTOGROUP
355 struct autogroup *autogroup; 387 struct autogroup *autogroup;
356#endif 388#endif
357 389
358 struct cfs_bandwidth cfs_bandwidth; 390 struct cfs_bandwidth cfs_bandwidth;
359}; 391};
360 392
361#ifdef CONFIG_FAIR_GROUP_SCHED 393#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -369,8 +401,8 @@ struct task_group {
369 * (The default weight is 1024 - so there's no practical 401 * (The default weight is 1024 - so there's no practical
370 * limitation from this.) 402 * limitation from this.)
371 */ 403 */
372#define MIN_SHARES (1UL << 1) 404#define MIN_SHARES (1UL << 1)
373#define MAX_SHARES (1UL << 18) 405#define MAX_SHARES (1UL << 18)
374#endif 406#endif
375 407
376typedef int (*tg_visitor)(struct task_group *, void *); 408typedef int (*tg_visitor)(struct task_group *, void *);
@@ -443,35 +475,39 @@ struct cfs_bandwidth { };
443 475
444/* CFS-related fields in a runqueue */ 476/* CFS-related fields in a runqueue */
445struct cfs_rq { 477struct cfs_rq {
446 struct load_weight load; 478 struct load_weight load;
447 unsigned long runnable_weight; 479 unsigned long runnable_weight;
448 unsigned int nr_running, h_nr_running; 480 unsigned int nr_running;
481 unsigned int h_nr_running;
449 482
450 u64 exec_clock; 483 u64 exec_clock;
451 u64 min_vruntime; 484 u64 min_vruntime;
452#ifndef CONFIG_64BIT 485#ifndef CONFIG_64BIT
453 u64 min_vruntime_copy; 486 u64 min_vruntime_copy;
454#endif 487#endif
455 488
456 struct rb_root_cached tasks_timeline; 489 struct rb_root_cached tasks_timeline;
457 490
458 /* 491 /*
459 * 'curr' points to currently running entity on this cfs_rq. 492 * 'curr' points to currently running entity on this cfs_rq.
460 * It is set to NULL otherwise (i.e when none are currently running). 493 * It is set to NULL otherwise (i.e when none are currently running).
461 */ 494 */
462 struct sched_entity *curr, *next, *last, *skip; 495 struct sched_entity *curr;
496 struct sched_entity *next;
497 struct sched_entity *last;
498 struct sched_entity *skip;
463 499
464#ifdef CONFIG_SCHED_DEBUG 500#ifdef CONFIG_SCHED_DEBUG
465 unsigned int nr_spread_over; 501 unsigned int nr_spread_over;
466#endif 502#endif
467 503
468#ifdef CONFIG_SMP 504#ifdef CONFIG_SMP
469 /* 505 /*
470 * CFS load tracking 506 * CFS load tracking
471 */ 507 */
472 struct sched_avg avg; 508 struct sched_avg avg;
473#ifndef CONFIG_64BIT 509#ifndef CONFIG_64BIT
474 u64 load_last_update_time_copy; 510 u64 load_last_update_time_copy;
475#endif 511#endif
476 struct { 512 struct {
477 raw_spinlock_t lock ____cacheline_aligned; 513 raw_spinlock_t lock ____cacheline_aligned;
@@ -482,9 +518,9 @@ struct cfs_rq {
482 } removed; 518 } removed;
483 519
484#ifdef CONFIG_FAIR_GROUP_SCHED 520#ifdef CONFIG_FAIR_GROUP_SCHED
485 unsigned long tg_load_avg_contrib; 521 unsigned long tg_load_avg_contrib;
486 long propagate; 522 long propagate;
487 long prop_runnable_sum; 523 long prop_runnable_sum;
488 524
489 /* 525 /*
490 * h_load = weight * f(tg) 526 * h_load = weight * f(tg)
@@ -492,36 +528,38 @@ struct cfs_rq {
492 * Where f(tg) is the recursive weight fraction assigned to 528 * Where f(tg) is the recursive weight fraction assigned to
493 * this group. 529 * this group.
494 */ 530 */
495 unsigned long h_load; 531 unsigned long h_load;
496 u64 last_h_load_update; 532 u64 last_h_load_update;
497 struct sched_entity *h_load_next; 533 struct sched_entity *h_load_next;
498#endif /* CONFIG_FAIR_GROUP_SCHED */ 534#endif /* CONFIG_FAIR_GROUP_SCHED */
499#endif /* CONFIG_SMP */ 535#endif /* CONFIG_SMP */
500 536
501#ifdef CONFIG_FAIR_GROUP_SCHED 537#ifdef CONFIG_FAIR_GROUP_SCHED
502 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 538 struct rq *rq; /* CPU runqueue to which this cfs_rq is attached */
503 539
504 /* 540 /*
505 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 541 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
506 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 542 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
507 * (like users, containers etc.) 543 * (like users, containers etc.)
508 * 544 *
509 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 545 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.
510 * list is used during load balance. 546 * This list is used during load balance.
511 */ 547 */
512 int on_list; 548 int on_list;
513 struct list_head leaf_cfs_rq_list; 549 struct list_head leaf_cfs_rq_list;
514 struct task_group *tg; /* group that "owns" this runqueue */ 550 struct task_group *tg; /* group that "owns" this runqueue */
515 551
516#ifdef CONFIG_CFS_BANDWIDTH 552#ifdef CONFIG_CFS_BANDWIDTH
517 int runtime_enabled; 553 int runtime_enabled;
518 u64 runtime_expires; 554 u64 runtime_expires;
519 s64 runtime_remaining; 555 s64 runtime_remaining;
520 556
521 u64 throttled_clock, throttled_clock_task; 557 u64 throttled_clock;
522 u64 throttled_clock_task_time; 558 u64 throttled_clock_task;
523 int throttled, throttle_count; 559 u64 throttled_clock_task_time;
524 struct list_head throttled_list; 560 int throttled;
561 int throttle_count;
562 struct list_head throttled_list;
525#endif /* CONFIG_CFS_BANDWIDTH */ 563#endif /* CONFIG_CFS_BANDWIDTH */
526#endif /* CONFIG_FAIR_GROUP_SCHED */ 564#endif /* CONFIG_FAIR_GROUP_SCHED */
527}; 565};
@@ -538,45 +576,45 @@ static inline int rt_bandwidth_enabled(void)
538 576
539/* Real-Time classes' related field in a runqueue: */ 577/* Real-Time classes' related field in a runqueue: */
540struct rt_rq { 578struct rt_rq {
541 struct rt_prio_array active; 579 struct rt_prio_array active;
542 unsigned int rt_nr_running; 580 unsigned int rt_nr_running;
543 unsigned int rr_nr_running; 581 unsigned int rr_nr_running;
544#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 582#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
545 struct { 583 struct {
546 int curr; /* highest queued rt task prio */ 584 int curr; /* highest queued rt task prio */
547#ifdef CONFIG_SMP 585#ifdef CONFIG_SMP
548 int next; /* next highest */ 586 int next; /* next highest */
549#endif 587#endif
550 } highest_prio; 588 } highest_prio;
551#endif 589#endif
552#ifdef CONFIG_SMP 590#ifdef CONFIG_SMP
553 unsigned long rt_nr_migratory; 591 unsigned long rt_nr_migratory;
554 unsigned long rt_nr_total; 592 unsigned long rt_nr_total;
555 int overloaded; 593 int overloaded;
556 struct plist_head pushable_tasks; 594 struct plist_head pushable_tasks;
557#endif /* CONFIG_SMP */ 595#endif /* CONFIG_SMP */
558 int rt_queued; 596 int rt_queued;
559 597
560 int rt_throttled; 598 int rt_throttled;
561 u64 rt_time; 599 u64 rt_time;
562 u64 rt_runtime; 600 u64 rt_runtime;
563 /* Nests inside the rq lock: */ 601 /* Nests inside the rq lock: */
564 raw_spinlock_t rt_runtime_lock; 602 raw_spinlock_t rt_runtime_lock;
565 603
566#ifdef CONFIG_RT_GROUP_SCHED 604#ifdef CONFIG_RT_GROUP_SCHED
567 unsigned long rt_nr_boosted; 605 unsigned long rt_nr_boosted;
568 606
569 struct rq *rq; 607 struct rq *rq;
570 struct task_group *tg; 608 struct task_group *tg;
571#endif 609#endif
572}; 610};
573 611
574/* Deadline class' related fields in a runqueue */ 612/* Deadline class' related fields in a runqueue */
575struct dl_rq { 613struct dl_rq {
576 /* runqueue is an rbtree, ordered by deadline */ 614 /* runqueue is an rbtree, ordered by deadline */
577 struct rb_root_cached root; 615 struct rb_root_cached root;
578 616
579 unsigned long dl_nr_running; 617 unsigned long dl_nr_running;
580 618
581#ifdef CONFIG_SMP 619#ifdef CONFIG_SMP
582 /* 620 /*
@@ -586,28 +624,28 @@ struct dl_rq {
586 * should migrate somewhere else. 624 * should migrate somewhere else.
587 */ 625 */
588 struct { 626 struct {
589 u64 curr; 627 u64 curr;
590 u64 next; 628 u64 next;
591 } earliest_dl; 629 } earliest_dl;
592 630
593 unsigned long dl_nr_migratory; 631 unsigned long dl_nr_migratory;
594 int overloaded; 632 int overloaded;
595 633
596 /* 634 /*
597 * Tasks on this rq that can be pushed away. They are kept in 635 * Tasks on this rq that can be pushed away. They are kept in
598 * an rb-tree, ordered by tasks' deadlines, with caching 636 * an rb-tree, ordered by tasks' deadlines, with caching
599 * of the leftmost (earliest deadline) element. 637 * of the leftmost (earliest deadline) element.
600 */ 638 */
601 struct rb_root_cached pushable_dl_tasks_root; 639 struct rb_root_cached pushable_dl_tasks_root;
602#else 640#else
603 struct dl_bw dl_bw; 641 struct dl_bw dl_bw;
604#endif 642#endif
605 /* 643 /*
606 * "Active utilization" for this runqueue: increased when a 644 * "Active utilization" for this runqueue: increased when a
607 * task wakes up (becomes TASK_RUNNING) and decreased when a 645 * task wakes up (becomes TASK_RUNNING) and decreased when a
608 * task blocks 646 * task blocks
609 */ 647 */
610 u64 running_bw; 648 u64 running_bw;
611 649
612 /* 650 /*
613 * Utilization of the tasks "assigned" to this runqueue (including 651 * Utilization of the tasks "assigned" to this runqueue (including
@@ -618,14 +656,14 @@ struct dl_rq {
618 * This is needed to compute the "inactive utilization" for the 656 * This is needed to compute the "inactive utilization" for the
619 * runqueue (inactive utilization = this_bw - running_bw). 657 * runqueue (inactive utilization = this_bw - running_bw).
620 */ 658 */
621 u64 this_bw; 659 u64 this_bw;
622 u64 extra_bw; 660 u64 extra_bw;
623 661
624 /* 662 /*
625 * Inverse of the fraction of CPU utilization that can be reclaimed 663 * Inverse of the fraction of CPU utilization that can be reclaimed
626 * by the GRUB algorithm. 664 * by the GRUB algorithm.
627 */ 665 */
628 u64 bw_ratio; 666 u64 bw_ratio;
629}; 667};
630 668
631#ifdef CONFIG_SMP 669#ifdef CONFIG_SMP
@@ -638,51 +676,51 @@ static inline bool sched_asym_prefer(int a, int b)
638/* 676/*
639 * We add the notion of a root-domain which will be used to define per-domain 677 * We add the notion of a root-domain which will be used to define per-domain
640 * variables. Each exclusive cpuset essentially defines an island domain by 678 * variables. Each exclusive cpuset essentially defines an island domain by
641 * fully partitioning the member cpus from any other cpuset. Whenever a new 679 * fully partitioning the member CPUs from any other cpuset. Whenever a new
642 * exclusive cpuset is created, we also create and attach a new root-domain 680 * exclusive cpuset is created, we also create and attach a new root-domain
643 * object. 681 * object.
644 * 682 *
645 */ 683 */
646struct root_domain { 684struct root_domain {
647 atomic_t refcount; 685 atomic_t refcount;
648 atomic_t rto_count; 686 atomic_t rto_count;
649 struct rcu_head rcu; 687 struct rcu_head rcu;
650 cpumask_var_t span; 688 cpumask_var_t span;
651 cpumask_var_t online; 689 cpumask_var_t online;
652 690
653 /* Indicate more than one runnable task for any CPU */ 691 /* Indicate more than one runnable task for any CPU */
654 bool overload; 692 bool overload;
655 693
656 /* 694 /*
657 * The bit corresponding to a CPU gets set here if such CPU has more 695 * The bit corresponding to a CPU gets set here if such CPU has more
658 * than one runnable -deadline task (as it is below for RT tasks). 696 * than one runnable -deadline task (as it is below for RT tasks).
659 */ 697 */
660 cpumask_var_t dlo_mask; 698 cpumask_var_t dlo_mask;
661 atomic_t dlo_count; 699 atomic_t dlo_count;
662 struct dl_bw dl_bw; 700 struct dl_bw dl_bw;
663 struct cpudl cpudl; 701 struct cpudl cpudl;
664 702
665#ifdef HAVE_RT_PUSH_IPI 703#ifdef HAVE_RT_PUSH_IPI
666 /* 704 /*
667 * For IPI pull requests, loop across the rto_mask. 705 * For IPI pull requests, loop across the rto_mask.
668 */ 706 */
669 struct irq_work rto_push_work; 707 struct irq_work rto_push_work;
670 raw_spinlock_t rto_lock; 708 raw_spinlock_t rto_lock;
671 /* These are only updated and read within rto_lock */ 709 /* These are only updated and read within rto_lock */
672 int rto_loop; 710 int rto_loop;
673 int rto_cpu; 711 int rto_cpu;
674 /* These atomics are updated outside of a lock */ 712 /* These atomics are updated outside of a lock */
675 atomic_t rto_loop_next; 713 atomic_t rto_loop_next;
676 atomic_t rto_loop_start; 714 atomic_t rto_loop_start;
677#endif 715#endif
678 /* 716 /*
679 * The "RT overload" flag: it gets set if a CPU has more than 717 * The "RT overload" flag: it gets set if a CPU has more than
680 * one runnable RT task. 718 * one runnable RT task.
681 */ 719 */
682 cpumask_var_t rto_mask; 720 cpumask_var_t rto_mask;
683 struct cpupri cpupri; 721 struct cpupri cpupri;
684 722
685 unsigned long max_cpu_capacity; 723 unsigned long max_cpu_capacity;
686}; 724};
687 725
688extern struct root_domain def_root_domain; 726extern struct root_domain def_root_domain;
@@ -708,41 +746,39 @@ extern void rto_push_irq_work_func(struct irq_work *work);
708 */ 746 */
709struct rq { 747struct rq {
710 /* runqueue lock: */ 748 /* runqueue lock: */
711 raw_spinlock_t lock; 749 raw_spinlock_t lock;
712 750
713 /* 751 /*
714 * nr_running and cpu_load should be in the same cacheline because 752 * nr_running and cpu_load should be in the same cacheline because
715 * remote CPUs use both these fields when doing load calculation. 753 * remote CPUs use both these fields when doing load calculation.
716 */ 754 */
717 unsigned int nr_running; 755 unsigned int nr_running;
718#ifdef CONFIG_NUMA_BALANCING 756#ifdef CONFIG_NUMA_BALANCING
719 unsigned int nr_numa_running; 757 unsigned int nr_numa_running;
720 unsigned int nr_preferred_running; 758 unsigned int nr_preferred_running;
721#endif 759#endif
722 #define CPU_LOAD_IDX_MAX 5 760 #define CPU_LOAD_IDX_MAX 5
723 unsigned long cpu_load[CPU_LOAD_IDX_MAX]; 761 unsigned long cpu_load[CPU_LOAD_IDX_MAX];
724#ifdef CONFIG_NO_HZ_COMMON 762#ifdef CONFIG_NO_HZ_COMMON
725#ifdef CONFIG_SMP 763#ifdef CONFIG_SMP
726 unsigned long last_load_update_tick; 764 unsigned long last_load_update_tick;
727#endif /* CONFIG_SMP */ 765#endif /* CONFIG_SMP */
728 unsigned long nohz_flags; 766 unsigned long nohz_flags;
729#endif /* CONFIG_NO_HZ_COMMON */ 767#endif /* CONFIG_NO_HZ_COMMON */
730#ifdef CONFIG_NO_HZ_FULL
731 unsigned long last_sched_tick;
732#endif
733 /* capture load from *all* tasks on this cpu: */
734 struct load_weight load;
735 unsigned long nr_load_updates;
736 u64 nr_switches;
737 768
738 struct cfs_rq cfs; 769 /* capture load from *all* tasks on this CPU: */
739 struct rt_rq rt; 770 struct load_weight load;
740 struct dl_rq dl; 771 unsigned long nr_load_updates;
772 u64 nr_switches;
773
774 struct cfs_rq cfs;
775 struct rt_rq rt;
776 struct dl_rq dl;
741 777
742#ifdef CONFIG_FAIR_GROUP_SCHED 778#ifdef CONFIG_FAIR_GROUP_SCHED
743 /* list of leaf cfs_rq on this cpu: */ 779 /* list of leaf cfs_rq on this CPU: */
744 struct list_head leaf_cfs_rq_list; 780 struct list_head leaf_cfs_rq_list;
745 struct list_head *tmp_alone_branch; 781 struct list_head *tmp_alone_branch;
746#endif /* CONFIG_FAIR_GROUP_SCHED */ 782#endif /* CONFIG_FAIR_GROUP_SCHED */
747 783
748 /* 784 /*
@@ -751,94 +787,98 @@ struct rq {
751 * one CPU and if it got migrated afterwards it may decrease 787 * one CPU and if it got migrated afterwards it may decrease
752 * it on another CPU. Always updated under the runqueue lock: 788 * it on another CPU. Always updated under the runqueue lock:
753 */ 789 */
754 unsigned long nr_uninterruptible; 790 unsigned long nr_uninterruptible;
755 791
756 struct task_struct *curr, *idle, *stop; 792 struct task_struct *curr;
757 unsigned long next_balance; 793 struct task_struct *idle;
758 struct mm_struct *prev_mm; 794 struct task_struct *stop;
795 unsigned long next_balance;
796 struct mm_struct *prev_mm;
759 797
760 unsigned int clock_update_flags; 798 unsigned int clock_update_flags;
761 u64 clock; 799 u64 clock;
762 u64 clock_task; 800 u64 clock_task;
763 801
764 atomic_t nr_iowait; 802 atomic_t nr_iowait;
765 803
766#ifdef CONFIG_SMP 804#ifdef CONFIG_SMP
767 struct root_domain *rd; 805 struct root_domain *rd;
768 struct sched_domain *sd; 806 struct sched_domain *sd;
807
808 unsigned long cpu_capacity;
809 unsigned long cpu_capacity_orig;
769 810
770 unsigned long cpu_capacity; 811 struct callback_head *balance_callback;
771 unsigned long cpu_capacity_orig;
772 812
773 struct callback_head *balance_callback; 813 unsigned char idle_balance;
774 814
775 unsigned char idle_balance;
776 /* For active balancing */ 815 /* For active balancing */
777 int active_balance; 816 int active_balance;
778 int push_cpu; 817 int push_cpu;
779 struct cpu_stop_work active_balance_work; 818 struct cpu_stop_work active_balance_work;
780 /* cpu of this runqueue: */ 819
781 int cpu; 820 /* CPU of this runqueue: */
782 int online; 821 int cpu;
822 int online;
783 823
784 struct list_head cfs_tasks; 824 struct list_head cfs_tasks;
785 825
786 u64 rt_avg; 826 u64 rt_avg;
787 u64 age_stamp; 827 u64 age_stamp;
788 u64 idle_stamp; 828 u64 idle_stamp;
789 u64 avg_idle; 829 u64 avg_idle;
790 830
791 /* This is used to determine avg_idle's max value */ 831 /* This is used to determine avg_idle's max value */
792 u64 max_idle_balance_cost; 832 u64 max_idle_balance_cost;
793#endif 833#endif
794 834
795#ifdef CONFIG_IRQ_TIME_ACCOUNTING 835#ifdef CONFIG_IRQ_TIME_ACCOUNTING
796 u64 prev_irq_time; 836 u64 prev_irq_time;
797#endif 837#endif
798#ifdef CONFIG_PARAVIRT 838#ifdef CONFIG_PARAVIRT
799 u64 prev_steal_time; 839 u64 prev_steal_time;
800#endif 840#endif
801#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING 841#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
802 u64 prev_steal_time_rq; 842 u64 prev_steal_time_rq;
803#endif 843#endif
804 844
805 /* calc_load related fields */ 845 /* calc_load related fields */
806 unsigned long calc_load_update; 846 unsigned long calc_load_update;
807 long calc_load_active; 847 long calc_load_active;
808 848
809#ifdef CONFIG_SCHED_HRTICK 849#ifdef CONFIG_SCHED_HRTICK
810#ifdef CONFIG_SMP 850#ifdef CONFIG_SMP
811 int hrtick_csd_pending; 851 int hrtick_csd_pending;
812 call_single_data_t hrtick_csd; 852 call_single_data_t hrtick_csd;
813#endif 853#endif
814 struct hrtimer hrtick_timer; 854 struct hrtimer hrtick_timer;
815#endif 855#endif
816 856
817#ifdef CONFIG_SCHEDSTATS 857#ifdef CONFIG_SCHEDSTATS
818 /* latency stats */ 858 /* latency stats */
819 struct sched_info rq_sched_info; 859 struct sched_info rq_sched_info;
820 unsigned long long rq_cpu_time; 860 unsigned long long rq_cpu_time;
821 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ 861 /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
822 862
823 /* sys_sched_yield() stats */ 863 /* sys_sched_yield() stats */
824 unsigned int yld_count; 864 unsigned int yld_count;
825 865
826 /* schedule() stats */ 866 /* schedule() stats */
827 unsigned int sched_count; 867 unsigned int sched_count;
828 unsigned int sched_goidle; 868 unsigned int sched_goidle;
829 869
830 /* try_to_wake_up() stats */ 870 /* try_to_wake_up() stats */
831 unsigned int ttwu_count; 871 unsigned int ttwu_count;
832 unsigned int ttwu_local; 872 unsigned int ttwu_local;
833#endif 873#endif
834 874
835#ifdef CONFIG_SMP 875#ifdef CONFIG_SMP
836 struct llist_head wake_list; 876 struct llist_head wake_list;
837#endif 877#endif
838 878
839#ifdef CONFIG_CPU_IDLE 879#ifdef CONFIG_CPU_IDLE
840 /* Must be inspected within a rcu lock section */ 880 /* Must be inspected within a rcu lock section */
841 struct cpuidle_state *idle_state; 881 struct cpuidle_state *idle_state;
842#endif 882#endif
843}; 883};
844 884
@@ -904,9 +944,9 @@ static inline u64 __rq_clock_broken(struct rq *rq)
904 * one position though, because the next rq_unpin_lock() will shift it 944 * one position though, because the next rq_unpin_lock() will shift it
905 * back. 945 * back.
906 */ 946 */
907#define RQCF_REQ_SKIP 0x01 947#define RQCF_REQ_SKIP 0x01
908#define RQCF_ACT_SKIP 0x02 948#define RQCF_ACT_SKIP 0x02
909#define RQCF_UPDATED 0x04 949#define RQCF_UPDATED 0x04
910 950
911static inline void assert_clock_updated(struct rq *rq) 951static inline void assert_clock_updated(struct rq *rq)
912{ 952{
@@ -1059,12 +1099,12 @@ extern void sched_ttwu_pending(void);
1059 1099
1060/** 1100/**
1061 * highest_flag_domain - Return highest sched_domain containing flag. 1101 * highest_flag_domain - Return highest sched_domain containing flag.
1062 * @cpu: The cpu whose highest level of sched domain is to 1102 * @cpu: The CPU whose highest level of sched domain is to
1063 * be returned. 1103 * be returned.
1064 * @flag: The flag to check for the highest sched_domain 1104 * @flag: The flag to check for the highest sched_domain
1065 * for the given cpu. 1105 * for the given CPU.
1066 * 1106 *
1067 * Returns the highest sched_domain of a cpu which contains the given flag. 1107 * Returns the highest sched_domain of a CPU which contains the given flag.
1068 */ 1108 */
1069static inline struct sched_domain *highest_flag_domain(int cpu, int flag) 1109static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
1070{ 1110{
@@ -1099,30 +1139,30 @@ DECLARE_PER_CPU(struct sched_domain *, sd_numa);
1099DECLARE_PER_CPU(struct sched_domain *, sd_asym); 1139DECLARE_PER_CPU(struct sched_domain *, sd_asym);
1100 1140
1101struct sched_group_capacity { 1141struct sched_group_capacity {
1102 atomic_t ref; 1142 atomic_t ref;
1103 /* 1143 /*
1104 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity 1144 * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity
1105 * for a single CPU. 1145 * for a single CPU.
1106 */ 1146 */
1107 unsigned long capacity; 1147 unsigned long capacity;
1108 unsigned long min_capacity; /* Min per-CPU capacity in group */ 1148 unsigned long min_capacity; /* Min per-CPU capacity in group */
1109 unsigned long next_update; 1149 unsigned long next_update;
1110 int imbalance; /* XXX unrelated to capacity but shared group state */ 1150 int imbalance; /* XXX unrelated to capacity but shared group state */
1111 1151
1112#ifdef CONFIG_SCHED_DEBUG 1152#ifdef CONFIG_SCHED_DEBUG
1113 int id; 1153 int id;
1114#endif 1154#endif
1115 1155
1116 unsigned long cpumask[0]; /* balance mask */ 1156 unsigned long cpumask[0]; /* Balance mask */
1117}; 1157};
1118 1158
1119struct sched_group { 1159struct sched_group {
1120 struct sched_group *next; /* Must be a circular list */ 1160 struct sched_group *next; /* Must be a circular list */
1121 atomic_t ref; 1161 atomic_t ref;
1122 1162
1123 unsigned int group_weight; 1163 unsigned int group_weight;
1124 struct sched_group_capacity *sgc; 1164 struct sched_group_capacity *sgc;
1125 int asym_prefer_cpu; /* cpu of highest priority in group */ 1165 int asym_prefer_cpu; /* CPU of highest priority in group */
1126 1166
1127 /* 1167 /*
1128 * The CPUs this group covers. 1168 * The CPUs this group covers.
@@ -1131,7 +1171,7 @@ struct sched_group {
1131 * by attaching extra space to the end of the structure, 1171 * by attaching extra space to the end of the structure,
1132 * depending on how many CPUs the kernel has booted up with) 1172 * depending on how many CPUs the kernel has booted up with)
1133 */ 1173 */
1134 unsigned long cpumask[0]; 1174 unsigned long cpumask[0];
1135}; 1175};
1136 1176
1137static inline struct cpumask *sched_group_span(struct sched_group *sg) 1177static inline struct cpumask *sched_group_span(struct sched_group *sg)
@@ -1148,8 +1188,8 @@ static inline struct cpumask *group_balance_mask(struct sched_group *sg)
1148} 1188}
1149 1189
1150/** 1190/**
1151 * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. 1191 * group_first_cpu - Returns the first CPU in the cpumask of a sched_group.
1152 * @group: The group whose first cpu is to be returned. 1192 * @group: The group whose first CPU is to be returned.
1153 */ 1193 */
1154static inline unsigned int group_first_cpu(struct sched_group *group) 1194static inline unsigned int group_first_cpu(struct sched_group *group)
1155{ 1195{
@@ -1349,19 +1389,12 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1349 return p->on_rq == TASK_ON_RQ_MIGRATING; 1389 return p->on_rq == TASK_ON_RQ_MIGRATING;
1350} 1390}
1351 1391
1352#ifndef prepare_arch_switch
1353# define prepare_arch_switch(next) do { } while (0)
1354#endif
1355#ifndef finish_arch_post_lock_switch
1356# define finish_arch_post_lock_switch() do { } while (0)
1357#endif
1358
1359/* 1392/*
1360 * wake flags 1393 * wake flags
1361 */ 1394 */
1362#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ 1395#define WF_SYNC 0x01 /* Waker goes to sleep after wakeup */
1363#define WF_FORK 0x02 /* child wakeup after fork */ 1396#define WF_FORK 0x02 /* Child wakeup after fork */
1364#define WF_MIGRATED 0x4 /* internal use, task got migrated */ 1397#define WF_MIGRATED 0x4 /* Internal use, task got migrated */
1365 1398
1366/* 1399/*
1367 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1400 * To aid in avoiding the subversion of "niceness" due to uneven distribution
@@ -1372,11 +1405,11 @@ static inline int task_on_rq_migrating(struct task_struct *p)
1372 * slice expiry etc. 1405 * slice expiry etc.
1373 */ 1406 */
1374 1407
1375#define WEIGHT_IDLEPRIO 3 1408#define WEIGHT_IDLEPRIO 3
1376#define WMULT_IDLEPRIO 1431655765 1409#define WMULT_IDLEPRIO 1431655765
1377 1410
1378extern const int sched_prio_to_weight[40]; 1411extern const int sched_prio_to_weight[40];
1379extern const u32 sched_prio_to_wmult[40]; 1412extern const u32 sched_prio_to_wmult[40];
1380 1413
1381/* 1414/*
1382 * {de,en}queue flags: 1415 * {de,en}queue flags:
@@ -1398,9 +1431,9 @@ extern const u32 sched_prio_to_wmult[40];
1398 */ 1431 */
1399 1432
1400#define DEQUEUE_SLEEP 0x01 1433#define DEQUEUE_SLEEP 0x01
1401#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ 1434#define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */
1402#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ 1435#define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */
1403#define DEQUEUE_NOCLOCK 0x08 /* matches ENQUEUE_NOCLOCK */ 1436#define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */
1404 1437
1405#define ENQUEUE_WAKEUP 0x01 1438#define ENQUEUE_WAKEUP 0x01
1406#define ENQUEUE_RESTORE 0x02 1439#define ENQUEUE_RESTORE 0x02
@@ -1422,10 +1455,10 @@ struct sched_class {
1422 1455
1423 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); 1456 void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
1424 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); 1457 void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
1425 void (*yield_task) (struct rq *rq); 1458 void (*yield_task) (struct rq *rq);
1426 bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt); 1459 bool (*yield_to_task)(struct rq *rq, struct task_struct *p, bool preempt);
1427 1460
1428 void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags); 1461 void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags);
1429 1462
1430 /* 1463 /*
1431 * It is the responsibility of the pick_next_task() method that will 1464 * It is the responsibility of the pick_next_task() method that will
@@ -1435,16 +1468,16 @@ struct sched_class {
1435 * May return RETRY_TASK when it finds a higher prio class has runnable 1468 * May return RETRY_TASK when it finds a higher prio class has runnable
1436 * tasks. 1469 * tasks.
1437 */ 1470 */
1438 struct task_struct * (*pick_next_task) (struct rq *rq, 1471 struct task_struct * (*pick_next_task)(struct rq *rq,
1439 struct task_struct *prev, 1472 struct task_struct *prev,
1440 struct rq_flags *rf); 1473 struct rq_flags *rf);
1441 void (*put_prev_task) (struct rq *rq, struct task_struct *p); 1474 void (*put_prev_task)(struct rq *rq, struct task_struct *p);
1442 1475
1443#ifdef CONFIG_SMP 1476#ifdef CONFIG_SMP
1444 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags); 1477 int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
1445 void (*migrate_task_rq)(struct task_struct *p); 1478 void (*migrate_task_rq)(struct task_struct *p);
1446 1479
1447 void (*task_woken) (struct rq *this_rq, struct task_struct *task); 1480 void (*task_woken)(struct rq *this_rq, struct task_struct *task);
1448 1481
1449 void (*set_cpus_allowed)(struct task_struct *p, 1482 void (*set_cpus_allowed)(struct task_struct *p,
1450 const struct cpumask *newmask); 1483 const struct cpumask *newmask);
@@ -1453,31 +1486,31 @@ struct sched_class {
1453 void (*rq_offline)(struct rq *rq); 1486 void (*rq_offline)(struct rq *rq);
1454#endif 1487#endif
1455 1488
1456 void (*set_curr_task) (struct rq *rq); 1489 void (*set_curr_task)(struct rq *rq);
1457 void (*task_tick) (struct rq *rq, struct task_struct *p, int queued); 1490 void (*task_tick)(struct rq *rq, struct task_struct *p, int queued);
1458 void (*task_fork) (struct task_struct *p); 1491 void (*task_fork)(struct task_struct *p);
1459 void (*task_dead) (struct task_struct *p); 1492 void (*task_dead)(struct task_struct *p);
1460 1493
1461 /* 1494 /*
1462 * The switched_from() call is allowed to drop rq->lock, therefore we 1495 * The switched_from() call is allowed to drop rq->lock, therefore we
1463 * cannot assume the switched_from/switched_to pair is serialized by 1496
1464 * rq->lock. They are however serialized by p->pi_lock. 1497 * rq->lock. They are however serialized by p->pi_lock.
1465 */ 1498 */
1466 void (*switched_from) (struct rq *this_rq, struct task_struct *task); 1499 void (*switched_from)(struct rq *this_rq, struct task_struct *task);
1467 void (*switched_to) (struct rq *this_rq, struct task_struct *task); 1500 void (*switched_to) (struct rq *this_rq, struct task_struct *task);
1468 void (*prio_changed) (struct rq *this_rq, struct task_struct *task, 1501 void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
1469 int oldprio); 1502 int oldprio);
1470 1503
1471 unsigned int (*get_rr_interval) (struct rq *rq, 1504 unsigned int (*get_rr_interval)(struct rq *rq,
1472 struct task_struct *task); 1505 struct task_struct *task);
1473 1506
1474 void (*update_curr) (struct rq *rq); 1507 void (*update_curr)(struct rq *rq);
1475 1508
1476#define TASK_SET_GROUP 0 1509#define TASK_SET_GROUP 0
1477#define TASK_MOVE_GROUP 1 1510#define TASK_MOVE_GROUP 1
1478 1511
1479#ifdef CONFIG_FAIR_GROUP_SCHED 1512#ifdef CONFIG_FAIR_GROUP_SCHED
1480 void (*task_change_group) (struct task_struct *p, int type); 1513 void (*task_change_group)(struct task_struct *p, int type);
1481#endif 1514#endif
1482}; 1515};
1483 1516
@@ -1526,6 +1559,7 @@ static inline void idle_set_state(struct rq *rq,
1526static inline struct cpuidle_state *idle_get_state(struct rq *rq) 1559static inline struct cpuidle_state *idle_get_state(struct rq *rq)
1527{ 1560{
1528 SCHED_WARN_ON(!rcu_read_lock_held()); 1561 SCHED_WARN_ON(!rcu_read_lock_held());
1562
1529 return rq->idle_state; 1563 return rq->idle_state;
1530} 1564}
1531#else 1565#else
@@ -1564,9 +1598,9 @@ extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
1564extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se); 1598extern void init_dl_inactive_task_timer(struct sched_dl_entity *dl_se);
1565extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq); 1599extern void init_dl_rq_bw_ratio(struct dl_rq *dl_rq);
1566 1600
1567#define BW_SHIFT 20 1601#define BW_SHIFT 20
1568#define BW_UNIT (1 << BW_SHIFT) 1602#define BW_UNIT (1 << BW_SHIFT)
1569#define RATIO_SHIFT 8 1603#define RATIO_SHIFT 8
1570unsigned long to_ratio(u64 period, u64 runtime); 1604unsigned long to_ratio(u64 period, u64 runtime);
1571 1605
1572extern void init_entity_runnable_average(struct sched_entity *se); 1606extern void init_entity_runnable_average(struct sched_entity *se);
@@ -1574,6 +1608,7 @@ extern void post_init_entity_util_avg(struct sched_entity *se);
1574 1608
1575#ifdef CONFIG_NO_HZ_FULL 1609#ifdef CONFIG_NO_HZ_FULL
1576extern bool sched_can_stop_tick(struct rq *rq); 1610extern bool sched_can_stop_tick(struct rq *rq);
1611extern int __init sched_tick_offload_init(void);
1577 1612
1578/* 1613/*
1579 * Tick may be needed by tasks in the runqueue depending on their policy and 1614 * Tick may be needed by tasks in the runqueue depending on their policy and
@@ -1598,6 +1633,7 @@ static inline void sched_update_tick_dependency(struct rq *rq)
1598 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); 1633 tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED);
1599} 1634}
1600#else 1635#else
1636static inline int sched_tick_offload_init(void) { return 0; }
1601static inline void sched_update_tick_dependency(struct rq *rq) { } 1637static inline void sched_update_tick_dependency(struct rq *rq) { }
1602#endif 1638#endif
1603 1639
@@ -1624,13 +1660,6 @@ static inline void sub_nr_running(struct rq *rq, unsigned count)
1624 sched_update_tick_dependency(rq); 1660 sched_update_tick_dependency(rq);
1625} 1661}
1626 1662
1627static inline void rq_last_tick_reset(struct rq *rq)
1628{
1629#ifdef CONFIG_NO_HZ_FULL
1630 rq->last_sched_tick = jiffies;
1631#endif
1632}
1633
1634extern void update_rq_clock(struct rq *rq); 1663extern void update_rq_clock(struct rq *rq);
1635 1664
1636extern void activate_task(struct rq *rq, struct task_struct *p, int flags); 1665extern void activate_task(struct rq *rq, struct task_struct *p, int flags);
@@ -1821,8 +1850,8 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1821/* 1850/*
1822 * Unfair double_lock_balance: Optimizes throughput at the expense of 1851 * Unfair double_lock_balance: Optimizes throughput at the expense of
1823 * latency by eliminating extra atomic operations when the locks are 1852 * latency by eliminating extra atomic operations when the locks are
1824 * already in proper order on entry. This favors lower cpu-ids and will 1853 * already in proper order on entry. This favors lower CPU-ids and will
1825 * grant the double lock to lower cpus over higher ids under contention, 1854 * grant the double lock to lower CPUs over higher ids under contention,
1826 * regardless of entry order into the function. 1855 * regardless of entry order into the function.
1827 */ 1856 */
1828static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest) 1857static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
@@ -1854,7 +1883,7 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
1854static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest) 1883static inline int double_lock_balance(struct rq *this_rq, struct rq *busiest)
1855{ 1884{
1856 if (unlikely(!irqs_disabled())) { 1885 if (unlikely(!irqs_disabled())) {
1857 /* printk() doesn't work good under rq->lock */ 1886 /* printk() doesn't work well under rq->lock */
1858 raw_spin_unlock(&this_rq->lock); 1887 raw_spin_unlock(&this_rq->lock);
1859 BUG_ON(1); 1888 BUG_ON(1);
1860 } 1889 }
@@ -2113,15 +2142,14 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
2113#endif /* CONFIG_CPU_FREQ */ 2142#endif /* CONFIG_CPU_FREQ */
2114 2143
2115#ifdef arch_scale_freq_capacity 2144#ifdef arch_scale_freq_capacity
2116#ifndef arch_scale_freq_invariant 2145# ifndef arch_scale_freq_invariant
2117#define arch_scale_freq_invariant() (true) 2146# define arch_scale_freq_invariant() true
2118#endif 2147# endif
2119#else /* arch_scale_freq_capacity */ 2148#else
2120#define arch_scale_freq_invariant() (false) 2149# define arch_scale_freq_invariant() false
2121#endif 2150#endif
2122 2151
2123#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL 2152#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
2124
2125static inline unsigned long cpu_util_dl(struct rq *rq) 2153static inline unsigned long cpu_util_dl(struct rq *rq)
2126{ 2154{
2127 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT; 2155 return (rq->dl.running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
@@ -2131,5 +2159,4 @@ static inline unsigned long cpu_util_cfs(struct rq *rq)
2131{ 2159{
2132 return rq->cfs.avg.util_avg; 2160 return rq->cfs.avg.util_avg;
2133} 2161}
2134
2135#endif 2162#endif
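The BW_SHIFT/BW_UNIT definitions and the cpu_util_dl() helper in the kernel/sched/sched.h hunk above are plain fixed-point arithmetic: deadline bandwidth (running_bw, this_bw, dl_bw) is tracked in 1/2^20 units and converted to the scheduler's capacity scale for schedutil. A minimal standalone sketch of that conversion follows; it assumes SCHED_CAPACITY_SCALE is 1024 (defined elsewhere in the kernel, not in this diff), simplifies to_ratio() by ignoring the RUNTIME_INF case, and uses made-up runtime/period values.

/*
 * Standalone sketch of the deadline-bandwidth fixed-point math above.
 * BW_SHIFT/BW_UNIT and the cpu_util_dl() conversion mirror the hunk;
 * the example runtime/period values are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT		20
#define BW_UNIT			(1 << BW_SHIFT)
#define SCHED_CAPACITY_SCALE	1024	/* assumed: 1 << SCHED_CAPACITY_SHIFT */

/* simplified to_ratio(): runtime/period expressed in BW_UNIT fixed point */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
	if (!period)
		return 0;
	return (runtime << BW_SHIFT) / period;	/* div64_u64() in the kernel */
}

/* cpu_util_dl(): scale the accumulated running_bw to capacity units */
static unsigned long cpu_util_dl(uint64_t running_bw)
{
	return (running_bw * SCHED_CAPACITY_SCALE) >> BW_SHIFT;
}

int main(void)
{
	/* hypothetical task: 3ms runtime every 10ms period => 30% bandwidth */
	uint64_t bw = to_ratio(10000000ULL, 3000000ULL);

	printf("dl_bw       = %llu / %d\n", (unsigned long long)bw, BW_UNIT);
	printf("cpu_util_dl = %lu / %d\n", cpu_util_dl(bw), SCHED_CAPACITY_SCALE);
	return 0;
}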
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index 940b1fa1d2ce..ab112cbfd7c8 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -1,14 +1,13 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2 2/*
3#include <linux/slab.h> 3 * /proc/schedstat implementation
4#include <linux/fs.h> 4 */
5#include <linux/seq_file.h>
6#include <linux/proc_fs.h>
7
8#include "sched.h" 5#include "sched.h"
9 6
10/* 7/*
11 * bump this up when changing the output format or the meaning of an existing 8 * Current schedstat API version.
9 *
10 * Bump this up when changing the output format or the meaning of an existing
12 * format, so that tools can adapt (or abort) 11 * format, so that tools can adapt (or abort)
13 */ 12 */
14#define SCHEDSTAT_VERSION 15 13#define SCHEDSTAT_VERSION 15
@@ -78,8 +77,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
78 * This iterator needs some explanation. 77
79 * It returns 1 for the header position. 78 * It returns 1 for the header position.
80 * This means 2 is cpu 0. 79 * This means 2 is cpu 0.
81 * In a hotplugged system some cpus, including cpu 0, may be missing so we have 80 * In a hotplugged system some CPUs, including cpu 0, may be missing so we have
82 * to use cpumask_* to iterate over the cpus. 81 * to use cpumask_* to iterate over the CPUs.
83 */ 82 */
84static void *schedstat_start(struct seq_file *file, loff_t *offset) 83static void *schedstat_start(struct seq_file *file, loff_t *offset)
85{ 84{
@@ -99,12 +98,14 @@ static void *schedstat_start(struct seq_file *file, loff_t *offset)
99 98
100 if (n < nr_cpu_ids) 99 if (n < nr_cpu_ids)
101 return (void *)(unsigned long)(n + 2); 100 return (void *)(unsigned long)(n + 2);
101
102 return NULL; 102 return NULL;
103} 103}
104 104
105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset) 105static void *schedstat_next(struct seq_file *file, void *data, loff_t *offset)
106{ 106{
107 (*offset)++; 107 (*offset)++;
108
108 return schedstat_start(file, offset); 109 return schedstat_start(file, offset);
109} 110}
110 111
@@ -134,6 +135,7 @@ static const struct file_operations proc_schedstat_operations = {
134static int __init proc_schedstat_init(void) 135static int __init proc_schedstat_init(void)
135{ 136{
136 proc_create("schedstat", 0, NULL, &proc_schedstat_operations); 137 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
138
137 return 0; 139 return 0;
138} 140}
139subsys_initcall(proc_schedstat_init); 141subsys_initcall(proc_schedstat_init);
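The SCHEDSTAT_VERSION comment above asks consumers of /proc/schedstat to adapt or abort when the version is bumped. A minimal userspace sketch of that check, assuming the header line emitted by show_schedstat() reads "version <N>":

/* Consumer-side sketch for the "adapt (or abort)" rule above. */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/schedstat", "r");
	int version = 0;

	if (!f) {
		perror("/proc/schedstat");
		return 1;
	}
	if (fscanf(f, "version %d", &version) != 1 || version != 15) {
		fprintf(stderr, "unsupported schedstat version %d, aborting\n",
			version);
		fclose(f);
		return 1;
	}
	/* ...parse the per-CPU and per-domain lines here... */
	fclose(f);
	return 0;
}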
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8e7b58de61e7..8aea199a39b4 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -30,35 +30,29 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
30 if (rq) 30 if (rq)
31 rq->rq_sched_info.run_delay += delta; 31 rq->rq_sched_info.run_delay += delta;
32} 32}
33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats) 33#define schedstat_enabled() static_branch_unlikely(&sched_schedstats)
34#define __schedstat_inc(var) do { var++; } while (0) 34#define __schedstat_inc(var) do { var++; } while (0)
35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0) 35#define schedstat_inc(var) do { if (schedstat_enabled()) { var++; } } while (0)
36#define __schedstat_add(var, amt) do { var += (amt); } while (0) 36#define __schedstat_add(var, amt) do { var += (amt); } while (0)
37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0) 37#define schedstat_add(var, amt) do { if (schedstat_enabled()) { var += (amt); } } while (0)
38#define __schedstat_set(var, val) do { var = (val); } while (0) 38#define __schedstat_set(var, val) do { var = (val); } while (0)
39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) 39#define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0)
40#define schedstat_val(var) (var) 40#define schedstat_val(var) (var)
41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0) 41#define schedstat_val_or_zero(var) ((schedstat_enabled()) ? (var) : 0)
42 42
43#else /* !CONFIG_SCHEDSTATS */ 43#else /* !CONFIG_SCHEDSTATS: */
44static inline void 44static inline void rq_sched_info_arrive (struct rq *rq, unsigned long long delta) { }
45rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 45static inline void rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) { }
46{} 46static inline void rq_sched_info_depart (struct rq *rq, unsigned long long delta) { }
47static inline void 47# define schedstat_enabled() 0
48rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) 48# define __schedstat_inc(var) do { } while (0)
49{} 49# define schedstat_inc(var) do { } while (0)
50static inline void 50# define __schedstat_add(var, amt) do { } while (0)
51rq_sched_info_depart(struct rq *rq, unsigned long long delta) 51# define schedstat_add(var, amt) do { } while (0)
52{} 52# define __schedstat_set(var, val) do { } while (0)
53#define schedstat_enabled() 0 53# define schedstat_set(var, val) do { } while (0)
54#define __schedstat_inc(var) do { } while (0) 54# define schedstat_val(var) 0
55#define schedstat_inc(var) do { } while (0) 55# define schedstat_val_or_zero(var) 0
56#define __schedstat_add(var, amt) do { } while (0)
57#define schedstat_add(var, amt) do { } while (0)
58#define __schedstat_set(var, val) do { } while (0)
59#define schedstat_set(var, val) do { } while (0)
60#define schedstat_val(var) 0
61#define schedstat_val_or_zero(var) 0
62#endif /* CONFIG_SCHEDSTATS */ 56#endif /* CONFIG_SCHEDSTATS */
63 57
64#ifdef CONFIG_SCHED_INFO 58#ifdef CONFIG_SCHED_INFO
@@ -69,9 +63,9 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
69 63
70/* 64/*
71 * We are interested in knowing how long it was from the *first* time a 65 * We are interested in knowing how long it was from the *first* time a
72 * task was queued to the time that it finally hit a cpu, we call this routine 66 * task was queued to the time that it finally hit a CPU, we call this routine
73 * from dequeue_task() to account for possible rq->clock skew across cpus. The 67 * from dequeue_task() to account for possible rq->clock skew across CPUs. The
74 * delta taken on each cpu would annul the skew. 68 * delta taken on each CPU would annul the skew.
75 */ 69 */
76static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t) 70static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
77{ 71{
@@ -87,7 +81,7 @@ static inline void sched_info_dequeued(struct rq *rq, struct task_struct *t)
87} 81}
88 82
89/* 83/*
90 * Called when a task finally hits the cpu. We can now calculate how 84 * Called when a task finally hits the CPU. We can now calculate how
91 * long it was waiting to run. We also note when it began so that we 85 * long it was waiting to run. We also note when it began so that we
92 * can keep stats on how long its timeslice is. 86 * can keep stats on how long its timeslice is.
93 */ 87 */
@@ -112,9 +106,10 @@ static void sched_info_arrive(struct rq *rq, struct task_struct *t)
112 */ 106 */
113static inline void sched_info_queued(struct rq *rq, struct task_struct *t) 107static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
114{ 108{
115 if (unlikely(sched_info_on())) 109 if (unlikely(sched_info_on())) {
116 if (!t->sched_info.last_queued) 110 if (!t->sched_info.last_queued)
117 t->sched_info.last_queued = rq_clock(rq); 111 t->sched_info.last_queued = rq_clock(rq);
112 }
118} 113}
119 114
120/* 115/*
@@ -127,8 +122,7 @@ static inline void sched_info_queued(struct rq *rq, struct task_struct *t)
127 */ 122 */
128static inline void sched_info_depart(struct rq *rq, struct task_struct *t) 123static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
129{ 124{
130 unsigned long long delta = rq_clock(rq) - 125 unsigned long long delta = rq_clock(rq) - t->sched_info.last_arrival;
131 t->sched_info.last_arrival;
132 126
133 rq_sched_info_depart(rq, delta); 127 rq_sched_info_depart(rq, delta);
134 128
@@ -142,11 +136,10 @@ static inline void sched_info_depart(struct rq *rq, struct task_struct *t)
142 * the idle task.) We are only called when prev != next. 136 * the idle task.) We are only called when prev != next.
143 */ 137 */
144static inline void 138static inline void
145__sched_info_switch(struct rq *rq, 139__sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
146 struct task_struct *prev, struct task_struct *next)
147{ 140{
148 /* 141 /*
149 * prev now departs the cpu. It's not interesting to record 142 * prev now departs the CPU. It's not interesting to record
150 * stats about how efficient we were at scheduling the idle 143 * stats about how efficient we were at scheduling the idle
151 * process, however. 144 * process, however.
152 */ 145 */
@@ -156,18 +149,19 @@ __sched_info_switch(struct rq *rq,
156 if (next != rq->idle) 149 if (next != rq->idle)
157 sched_info_arrive(rq, next); 150 sched_info_arrive(rq, next);
158} 151}
152
159static inline void 153static inline void
160sched_info_switch(struct rq *rq, 154sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
161 struct task_struct *prev, struct task_struct *next)
162{ 155{
163 if (unlikely(sched_info_on())) 156 if (unlikely(sched_info_on()))
164 __sched_info_switch(rq, prev, next); 157 __sched_info_switch(rq, prev, next);
165} 158}
166#else 159
167#define sched_info_queued(rq, t) do { } while (0) 160#else /* !CONFIG_SCHED_INFO: */
168#define sched_info_reset_dequeued(t) do { } while (0) 161# define sched_info_queued(rq, t) do { } while (0)
169#define sched_info_dequeued(rq, t) do { } while (0) 162# define sched_info_reset_dequeued(t) do { } while (0)
170#define sched_info_depart(rq, t) do { } while (0) 163# define sched_info_dequeued(rq, t) do { } while (0)
171#define sched_info_arrive(rq, next) do { } while (0) 164# define sched_info_depart(rq, t) do { } while (0)
172#define sched_info_switch(rq, t, next) do { } while (0) 165# define sched_info_arrive(rq, next) do { } while (0)
166# define sched_info_switch(rq, t, next) do { } while (0)
173#endif /* CONFIG_SCHED_INFO */ 167#endif /* CONFIG_SCHED_INFO */
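The schedstat_*() macros rewritten above all follow one shape: each update is either compiled out entirely (!CONFIG_SCHEDSTATS) or guarded by the sched_schedstats static branch, with the __schedstat_*() forms reserved for paths that have already tested schedstat_enabled() once. A runnable userspace model of that shape follows; the names are invented and a plain bool stands in for the static key and for the kernel.sched_schedstats sysctl.

/*
 * Userspace model of the schedstat_*() pattern: one global switch and
 * macros that either update a counter or do nothing.  Only the shape
 * matches the kernel macros; none of this is kernel code.
 */
#include <stdio.h>
#include <stdbool.h>

static bool stats_enabled = true;	/* stand-in for the static branch */

#define stat_enabled()		(stats_enabled)
#define __stat_inc(var)		do { (var)++; } while (0)
#define stat_inc(var)		do { if (stat_enabled()) (var)++; } while (0)
#define stat_set(var, val)	do { if (stat_enabled()) (var) = (val); } while (0)

struct rq_stats {
	unsigned int		yld_count;
	unsigned int		ttwu_count;
	unsigned long long	last_arrival;
};

int main(void)
{
	struct rq_stats rs = { 0 };

	stat_inc(rs.ttwu_count);		/* guarded update */
	stat_set(rs.last_arrival, 12345ULL);

	stats_enabled = false;			/* like disabling schedstats */
	stat_inc(rs.yld_count);			/* now a no-op */

	printf("ttwu=%u yld=%u arrival=%llu\n",
	       rs.ttwu_count, rs.yld_count, rs.last_arrival);
	return 0;
}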
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 210b1f2146ff..c183b790ca54 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,6 +1,4 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include "sched.h"
3
4/* 2/*
5 * stop-task scheduling class. 3 * stop-task scheduling class.
6 * 4 *
@@ -9,6 +7,7 @@
9 * 7 *
10 * See kernel/stop_machine.c 8 * See kernel/stop_machine.c
11 */ 9 */
10#include "sched.h"
12 11
13#ifdef CONFIG_SMP 12#ifdef CONFIG_SMP
14static int 13static int
@@ -75,6 +74,14 @@ static void put_prev_task_stop(struct rq *rq, struct task_struct *prev)
75 cgroup_account_cputime(curr, delta_exec); 74 cgroup_account_cputime(curr, delta_exec);
76} 75}
77 76
77/*
78 * scheduler tick hitting a task of our scheduling class.
79 *
80 * NOTE: This function can be called remotely by the tick offload that
81 * goes along full dynticks. Therefore no local assumption can be made
82 * and everything must be accessed through the @rq and @curr passed in
83 * parameters.
84 */
78static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued) 85static void task_tick_stop(struct rq *rq, struct task_struct *curr, int queued)
79{ 86{
80} 87}
diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c
index 9ff1555341ed..b6fb2c3b3ff7 100644
--- a/kernel/sched/swait.c
+++ b/kernel/sched/swait.c
@@ -1,6 +1,8 @@
1// SPDX-License-Identifier: GPL-2.0 1// SPDX-License-Identifier: GPL-2.0
2#include <linux/sched/signal.h> 2/*
3#include <linux/swait.h> 3 * <linux/swait.h> (simple wait queues) implementation:
4 */
5#include "sched.h"
4 6
5void __init_swait_queue_head(struct swait_queue_head *q, const char *name, 7void __init_swait_queue_head(struct swait_queue_head *q, const char *name,
6 struct lock_class_key *key) 8 struct lock_class_key *key)
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 519b024f4e94..64cc564f5255 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2,10 +2,6 @@
2/* 2/*
3 * Scheduler topology setup/handling methods 3 * Scheduler topology setup/handling methods
4 */ 4 */
5#include <linux/sched.h>
6#include <linux/mutex.h>
7#include <linux/sched/isolation.h>
8
9#include "sched.h" 5#include "sched.h"
10 6
11DEFINE_MUTEX(sched_domains_mutex); 7DEFINE_MUTEX(sched_domains_mutex);
@@ -41,8 +37,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
41 if (!(sd->flags & SD_LOAD_BALANCE)) { 37 if (!(sd->flags & SD_LOAD_BALANCE)) {
42 printk("does not load-balance\n"); 38 printk("does not load-balance\n");
43 if (sd->parent) 39 if (sd->parent)
44 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" 40 printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent");
45 " has parent");
46 return -1; 41 return -1;
47 } 42 }
48 43
@@ -50,12 +45,10 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
50 cpumask_pr_args(sched_domain_span(sd)), sd->name); 45 cpumask_pr_args(sched_domain_span(sd)), sd->name);
51 46
52 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) { 47 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
53 printk(KERN_ERR "ERROR: domain->span does not contain " 48 printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
54 "CPU%d\n", cpu);
55 } 49 }
56 if (!cpumask_test_cpu(cpu, sched_group_span(group))) { 50 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
57 printk(KERN_ERR "ERROR: domain->groups does not contain" 51 printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
58 " CPU%d\n", cpu);
59 } 52 }
60 53
61 printk(KERN_DEBUG "%*s groups:", level + 1, ""); 54 printk(KERN_DEBUG "%*s groups:", level + 1, "");
@@ -115,8 +108,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
115 108
116 if (sd->parent && 109 if (sd->parent &&
117 !cpumask_subset(groupmask, sched_domain_span(sd->parent))) 110 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
118 printk(KERN_ERR "ERROR: parent span is not a superset " 111 printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n");
119 "of domain->span\n");
120 return 0; 112 return 0;
121} 113}
122 114
@@ -595,7 +587,7 @@ int group_balance_cpu(struct sched_group *sg)
595 * are not. 587 * are not.
596 * 588 *
597 * This leads to a few particularly weird cases where the sched_domain's are 589 * This leads to a few particularly weird cases where the sched_domain's are
598 * not of the same number for each cpu. Consider: 590 * not of the same number for each CPU. Consider:
599 * 591 *
600 * NUMA-2 0-3 0-3 592 * NUMA-2 0-3 0-3
601 * groups: {0-2},{1-3} {1-3},{0-2} 593 * groups: {0-2},{1-3} {1-3},{0-2}
@@ -780,7 +772,7 @@ fail:
780 * ^ ^ ^ ^ 772 * ^ ^ ^ ^
781 * `-' `-' 773 * `-' `-'
782 * 774 *
783 * The sched_domains are per-cpu and have a two way link (parent & child) and 775 * The sched_domains are per-CPU and have a two way link (parent & child) and
784 * denote the ever growing mask of CPUs belonging to that level of topology. 776 * denote the ever growing mask of CPUs belonging to that level of topology.
785 * 777 *
786 * Each sched_domain has a circular (double) linked list of sched_group's, each 778 * Each sched_domain has a circular (double) linked list of sched_group's, each
@@ -1021,6 +1013,7 @@ __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1021 d->rd = alloc_rootdomain(); 1013 d->rd = alloc_rootdomain();
1022 if (!d->rd) 1014 if (!d->rd)
1023 return sa_sd; 1015 return sa_sd;
1016
1024 return sa_rootdomain; 1017 return sa_rootdomain;
1025} 1018}
1026 1019
@@ -1047,12 +1040,14 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
1047} 1040}
1048 1041
1049#ifdef CONFIG_NUMA 1042#ifdef CONFIG_NUMA
1050static int sched_domains_numa_levels;
1051enum numa_topology_type sched_numa_topology_type; 1043enum numa_topology_type sched_numa_topology_type;
1052static int *sched_domains_numa_distance; 1044
1053int sched_max_numa_distance; 1045static int sched_domains_numa_levels;
1054static struct cpumask ***sched_domains_numa_masks; 1046static int sched_domains_curr_level;
1055static int sched_domains_curr_level; 1047
1048int sched_max_numa_distance;
1049static int *sched_domains_numa_distance;
1050static struct cpumask ***sched_domains_numa_masks;
1056#endif 1051#endif
1057 1052
1058/* 1053/*
@@ -1074,11 +1069,11 @@ static int sched_domains_curr_level;
1074 * SD_ASYM_PACKING - describes SMT quirks 1069 * SD_ASYM_PACKING - describes SMT quirks
1075 */ 1070 */
1076#define TOPOLOGY_SD_FLAGS \ 1071#define TOPOLOGY_SD_FLAGS \
1077 (SD_SHARE_CPUCAPACITY | \ 1072 (SD_SHARE_CPUCAPACITY | \
1078 SD_SHARE_PKG_RESOURCES | \ 1073 SD_SHARE_PKG_RESOURCES | \
1079 SD_NUMA | \ 1074 SD_NUMA | \
1080 SD_ASYM_PACKING | \ 1075 SD_ASYM_PACKING | \
1081 SD_ASYM_CPUCAPACITY | \ 1076 SD_ASYM_CPUCAPACITY | \
1082 SD_SHARE_POWERDOMAIN) 1077 SD_SHARE_POWERDOMAIN)
1083 1078
1084static struct sched_domain * 1079static struct sched_domain *
@@ -1628,7 +1623,7 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
1628 pr_err(" the %s domain not a subset of the %s domain\n", 1623 pr_err(" the %s domain not a subset of the %s domain\n",
1629 child->name, sd->name); 1624 child->name, sd->name);
1630#endif 1625#endif
1631 /* Fixup, ensure @sd has at least @child cpus. */ 1626 /* Fixup, ensure @sd has at least @child CPUs. */
1632 cpumask_or(sched_domain_span(sd), 1627 cpumask_or(sched_domain_span(sd),
1633 sched_domain_span(sd), 1628 sched_domain_span(sd),
1634 sched_domain_span(child)); 1629 sched_domain_span(child));
@@ -1720,6 +1715,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
1720 ret = 0; 1715 ret = 0;
1721error: 1716error:
1722 __free_domain_allocs(&d, alloc_state, cpu_map); 1717 __free_domain_allocs(&d, alloc_state, cpu_map);
1718
1723 return ret; 1719 return ret;
1724} 1720}
1725 1721
@@ -1824,6 +1820,7 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1824 return 1; 1820 return 1;
1825 1821
1826 tmp = SD_ATTR_INIT; 1822 tmp = SD_ATTR_INIT;
1823
1827 return !memcmp(cur ? (cur + idx_cur) : &tmp, 1824 return !memcmp(cur ? (cur + idx_cur) : &tmp,
1828 new ? (new + idx_new) : &tmp, 1825 new ? (new + idx_new) : &tmp,
1829 sizeof(struct sched_domain_attr)); 1826 sizeof(struct sched_domain_attr));
@@ -1929,4 +1926,3 @@ match2:
1929 1926
1930 mutex_unlock(&sched_domains_mutex); 1927 mutex_unlock(&sched_domains_mutex);
1931} 1928}
1932
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 929ecb7d6b78..928be527477e 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -3,14 +3,7 @@
3 * 3 *
4 * (C) 2004 Nadia Yvette Chambers, Oracle 4 * (C) 2004 Nadia Yvette Chambers, Oracle
5 */ 5 */
6#include <linux/init.h> 6#include "sched.h"
7#include <linux/export.h>
8#include <linux/sched/signal.h>
9#include <linux/sched/debug.h>
10#include <linux/mm.h>
11#include <linux/wait.h>
12#include <linux/hash.h>
13#include <linux/kthread.h>
14 7
15void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key) 8void __init_waitqueue_head(struct wait_queue_head *wq_head, const char *name, struct lock_class_key *key)
16{ 9{
@@ -107,6 +100,7 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
107 break; 100 break;
108 } 101 }
109 } 102 }
103
110 return nr_exclusive; 104 return nr_exclusive;
111} 105}
112 106
@@ -317,6 +311,7 @@ int do_wait_intr(wait_queue_head_t *wq, wait_queue_entry_t *wait)
317 spin_unlock(&wq->lock); 311 spin_unlock(&wq->lock);
318 schedule(); 312 schedule();
319 spin_lock(&wq->lock); 313 spin_lock(&wq->lock);
314
320 return 0; 315 return 0;
321} 316}
322EXPORT_SYMBOL(do_wait_intr); 317EXPORT_SYMBOL(do_wait_intr);
@@ -333,6 +328,7 @@ int do_wait_intr_irq(wait_queue_head_t *wq, wait_queue_entry_t *wait)
333 spin_unlock_irq(&wq->lock); 328 spin_unlock_irq(&wq->lock);
334 schedule(); 329 schedule();
335 spin_lock_irq(&wq->lock); 330 spin_lock_irq(&wq->lock);
331
336 return 0; 332 return 0;
337} 333}
338EXPORT_SYMBOL(do_wait_intr_irq); 334EXPORT_SYMBOL(do_wait_intr_irq);
@@ -378,6 +374,7 @@ int autoremove_wake_function(struct wait_queue_entry *wq_entry, unsigned mode, i
378 374
379 if (ret) 375 if (ret)
380 list_del_init(&wq_entry->entry); 376 list_del_init(&wq_entry->entry);
377
381 return ret; 378 return ret;
382} 379}
383EXPORT_SYMBOL(autoremove_wake_function); 380EXPORT_SYMBOL(autoremove_wake_function);
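__wake_up_common(), do_wait_intr*() and autoremove_wake_function() in the kernel/sched/wait.c hunk above are the machinery behind the standard open-coded wait loop. The following out-of-tree module is a hedged sketch of that loop, not anything from this series: the module name, flag and kthread are invented; DEFINE_WAIT() installs autoremove_wake_function() as the wake callback, and wake_up() ends up in __wake_up_common().

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/delay.h>

static DECLARE_WAIT_QUEUE_HEAD(demo_wq);
static bool demo_ready;
static struct task_struct *demo_waker;

static int demo_waker_fn(void *unused)
{
	msleep(100);			/* pretend some work happens here */
	demo_ready = true;
	wake_up(&demo_wq);		/* scans demo_wq via __wake_up_common() */
	while (!kthread_should_stop())
		msleep(50);
	return 0;
}

static int __init wait_demo_init(void)
{
	DEFINE_WAIT(wait);		/* entry using autoremove_wake_function() */

	demo_waker = kthread_run(demo_waker_fn, NULL, "wait_demo");
	if (IS_ERR(demo_waker))
		return PTR_ERR(demo_waker);

	for (;;) {
		prepare_to_wait(&demo_wq, &wait, TASK_UNINTERRUPTIBLE);
		if (demo_ready)
			break;
		schedule();
	}
	finish_wait(&demo_wq, &wait);
	pr_info("wait_demo: condition observed\n");
	return 0;
}

static void __exit wait_demo_exit(void)
{
	kthread_stop(demo_waker);
}

module_init(wait_demo_init);
module_exit(wait_demo_exit);
MODULE_LICENSE("GPL");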
diff --git a/kernel/sched/wait_bit.c b/kernel/sched/wait_bit.c
index 84cb3acd9260..ed84ab245a05 100644
--- a/kernel/sched/wait_bit.c
+++ b/kernel/sched/wait_bit.c
@@ -1,10 +1,7 @@
1/* 1/*
2 * The implementation of the wait_bit*() and related waiting APIs: 2 * The implementation of the wait_bit*() and related waiting APIs:
3 */ 3 */
4#include <linux/wait_bit.h> 4#include "sched.h"
5#include <linux/sched/signal.h>
6#include <linux/sched/debug.h>
7#include <linux/hash.h>
8 5
9#define WAIT_TABLE_BITS 8 6#define WAIT_TABLE_BITS 8
10#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS) 7#define WAIT_TABLE_SIZE (1 << WAIT_TABLE_BITS)
@@ -29,8 +26,8 @@ int wake_bit_function(struct wait_queue_entry *wq_entry, unsigned mode, int sync
29 wait_bit->key.bit_nr != key->bit_nr || 26 wait_bit->key.bit_nr != key->bit_nr ||
30 test_bit(key->bit_nr, key->flags)) 27 test_bit(key->bit_nr, key->flags))
31 return 0; 28 return 0;
32 else 29
33 return autoremove_wake_function(wq_entry, mode, sync, key); 30 return autoremove_wake_function(wq_entry, mode, sync, key);
34} 31}
35EXPORT_SYMBOL(wake_bit_function); 32EXPORT_SYMBOL(wake_bit_function);
36 33
@@ -50,7 +47,9 @@ __wait_on_bit(struct wait_queue_head *wq_head, struct wait_bit_queue_entry *wbq_
50 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags)) 47 if (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags))
51 ret = (*action)(&wbq_entry->key, mode); 48 ret = (*action)(&wbq_entry->key, mode);
52 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret); 49 } while (test_bit(wbq_entry->key.bit_nr, wbq_entry->key.flags) && !ret);
50
53 finish_wait(wq_head, &wbq_entry->wq_entry); 51 finish_wait(wq_head, &wbq_entry->wq_entry);
52
54 return ret; 53 return ret;
55} 54}
56EXPORT_SYMBOL(__wait_on_bit); 55EXPORT_SYMBOL(__wait_on_bit);
@@ -73,6 +72,7 @@ int __sched out_of_line_wait_on_bit_timeout(
73 DEFINE_WAIT_BIT(wq_entry, word, bit); 72 DEFINE_WAIT_BIT(wq_entry, word, bit);
74 73
75 wq_entry.key.timeout = jiffies + timeout; 74 wq_entry.key.timeout = jiffies + timeout;
75
76 return __wait_on_bit(wq_head, &wq_entry, action, mode); 76 return __wait_on_bit(wq_head, &wq_entry, action, mode);
77} 77}
78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout); 78EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
@@ -120,6 +120,7 @@ EXPORT_SYMBOL(out_of_line_wait_on_bit_lock);
120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit) 120void __wake_up_bit(struct wait_queue_head *wq_head, void *word, int bit)
121{ 121{
122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 122 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
123
123 if (waitqueue_active(wq_head)) 124 if (waitqueue_active(wq_head))
124 __wake_up(wq_head, TASK_NORMAL, 1, &key); 125 __wake_up(wq_head, TASK_NORMAL, 1, &key);
125} 126}
@@ -148,6 +149,54 @@ void wake_up_bit(void *word, int bit)
148} 149}
149EXPORT_SYMBOL(wake_up_bit); 150EXPORT_SYMBOL(wake_up_bit);
150 151
152wait_queue_head_t *__var_waitqueue(void *p)
153{
154 if (BITS_PER_LONG == 64) {
155 unsigned long q = (unsigned long)p;
156
157 return bit_waitqueue((void *)(q & ~1), q & 1);
158 }
159 return bit_waitqueue(p, 0);
160}
161EXPORT_SYMBOL(__var_waitqueue);
162
163static int
164var_wake_function(struct wait_queue_entry *wq_entry, unsigned int mode,
165 int sync, void *arg)
166{
167 struct wait_bit_key *key = arg;
168 struct wait_bit_queue_entry *wbq_entry =
169 container_of(wq_entry, struct wait_bit_queue_entry, wq_entry);
170
171 if (wbq_entry->key.flags != key->flags ||
172 wbq_entry->key.bit_nr != key->bit_nr)
173 return 0;
174
175 return autoremove_wake_function(wq_entry, mode, sync, key);
176}
177
178void init_wait_var_entry(struct wait_bit_queue_entry *wbq_entry, void *var, int flags)
179{
180 *wbq_entry = (struct wait_bit_queue_entry){
181 .key = {
182 .flags = (var),
183 .bit_nr = -1,
184 },
185 .wq_entry = {
186 .private = current,
187 .func = var_wake_function,
188 .entry = LIST_HEAD_INIT(wbq_entry->wq_entry.entry),
189 },
190 };
191}
192EXPORT_SYMBOL(init_wait_var_entry);
193
194void wake_up_var(void *var)
195{
196 __wake_up_bit(__var_waitqueue(var), var, -1);
197}
198EXPORT_SYMBOL(wake_up_var);
199
151/* 200/*
152 * Manipulate the atomic_t address to produce a better bit waitqueue table hash 201 * Manipulate the atomic_t address to produce a better bit waitqueue table hash
153 * index (we're keying off bit -1, but that would produce a horrible hash 202 * index (we're keying off bit -1, but that would produce a horrible hash
@@ -157,6 +206,7 @@ static inline wait_queue_head_t *atomic_t_waitqueue(atomic_t *p)
157{ 206{
158 if (BITS_PER_LONG == 64) { 207 if (BITS_PER_LONG == 64) {
159 unsigned long q = (unsigned long)p; 208 unsigned long q = (unsigned long)p;
209
160 return bit_waitqueue((void *)(q & ~1), q & 1); 210 return bit_waitqueue((void *)(q & ~1), q & 1);
161 } 211 }
162 return bit_waitqueue(p, 0); 212 return bit_waitqueue(p, 0);
@@ -173,6 +223,7 @@ static int wake_atomic_t_function(struct wait_queue_entry *wq_entry, unsigned mo
173 wait_bit->key.bit_nr != key->bit_nr || 223 wait_bit->key.bit_nr != key->bit_nr ||
174 atomic_read(val) != 0) 224 atomic_read(val) != 0)
175 return 0; 225 return 0;
226
176 return autoremove_wake_function(wq_entry, mode, sync, key); 227 return autoremove_wake_function(wq_entry, mode, sync, key);
177} 228}
178 229
@@ -196,6 +247,7 @@ int __wait_on_atomic_t(struct wait_queue_head *wq_head, struct wait_bit_queue_en
196 ret = (*action)(val, mode); 247 ret = (*action)(val, mode);
197 } while (!ret && atomic_read(val) != 0); 248 } while (!ret && atomic_read(val) != 0);
198 finish_wait(wq_head, &wbq_entry->wq_entry); 249 finish_wait(wq_head, &wbq_entry->wq_entry);
250
199 return ret; 251 return ret;
200} 252}
201 253
@@ -226,6 +278,7 @@ __sched int atomic_t_wait(atomic_t *counter, unsigned int mode)
226 schedule(); 278 schedule();
227 if (signal_pending_state(mode, current)) 279 if (signal_pending_state(mode, current))
228 return -EINTR; 280 return -EINTR;
281
229 return 0; 282 return 0;
230} 283}
231EXPORT_SYMBOL(atomic_t_wait); 284EXPORT_SYMBOL(atomic_t_wait);
@@ -250,6 +303,7 @@ __sched int bit_wait(struct wait_bit_key *word, int mode)
250 schedule(); 303 schedule();
251 if (signal_pending_state(mode, current)) 304 if (signal_pending_state(mode, current))
252 return -EINTR; 305 return -EINTR;
306
253 return 0; 307 return 0;
254} 308}
255EXPORT_SYMBOL(bit_wait); 309EXPORT_SYMBOL(bit_wait);
@@ -259,6 +313,7 @@ __sched int bit_wait_io(struct wait_bit_key *word, int mode)
259 io_schedule(); 313 io_schedule();
260 if (signal_pending_state(mode, current)) 314 if (signal_pending_state(mode, current))
261 return -EINTR; 315 return -EINTR;
316
262 return 0; 317 return 0;
263} 318}
264EXPORT_SYMBOL(bit_wait_io); 319EXPORT_SYMBOL(bit_wait_io);
@@ -266,11 +321,13 @@ EXPORT_SYMBOL(bit_wait_io);
266__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) 321__sched int bit_wait_timeout(struct wait_bit_key *word, int mode)
267{ 322{
268 unsigned long now = READ_ONCE(jiffies); 323 unsigned long now = READ_ONCE(jiffies);
324
269 if (time_after_eq(now, word->timeout)) 325 if (time_after_eq(now, word->timeout))
270 return -EAGAIN; 326 return -EAGAIN;
271 schedule_timeout(word->timeout - now); 327 schedule_timeout(word->timeout - now);
272 if (signal_pending_state(mode, current)) 328 if (signal_pending_state(mode, current))
273 return -EINTR; 329 return -EINTR;
330
274 return 0; 331 return 0;
275} 332}
276EXPORT_SYMBOL_GPL(bit_wait_timeout); 333EXPORT_SYMBOL_GPL(bit_wait_timeout);
@@ -278,11 +335,13 @@ EXPORT_SYMBOL_GPL(bit_wait_timeout);
278__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) 335__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode)
279{ 336{
280 unsigned long now = READ_ONCE(jiffies); 337 unsigned long now = READ_ONCE(jiffies);
338
281 if (time_after_eq(now, word->timeout)) 339 if (time_after_eq(now, word->timeout))
282 return -EAGAIN; 340 return -EAGAIN;
283 io_schedule_timeout(word->timeout - now); 341 io_schedule_timeout(word->timeout - now);
284 if (signal_pending_state(mode, current)) 342 if (signal_pending_state(mode, current))
285 return -EINTR; 343 return -EINTR;
344
286 return 0; 345 return 0;
287} 346}
288EXPORT_SYMBOL_GPL(bit_wait_io_timeout); 347EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
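The new __var_waitqueue()/init_wait_var_entry()/wake_up_var() code added above lets any variable borrow a hashed bit-waitqueue (keyed on bit -1). A hedged usage sketch follows, assuming the matching wait_var_event() macro lands in <linux/wait_bit.h> as part of the same series (the diffstat touches that header); the module, counter and timings are invented for illustration.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/wait_bit.h>
#include <linux/atomic.h>
#include <linux/delay.h>

static atomic_t demo_refs = ATOMIC_INIT(1);
static struct task_struct *worker;

static int demo_worker(void *unused)
{
	msleep(100);				/* pretend to do work */
	if (atomic_dec_and_test(&demo_refs))
		wake_up_var(&demo_refs);	/* -> __wake_up_bit() on &demo_refs */
	while (!kthread_should_stop())
		msleep(50);
	return 0;
}

static int __init waitvar_demo_init(void)
{
	worker = kthread_run(demo_worker, NULL, "waitvar_demo");
	if (IS_ERR(worker))
		return PTR_ERR(worker);

	/* sleeps on __var_waitqueue(&demo_refs) until the condition holds */
	wait_var_event(&demo_refs, !atomic_read(&demo_refs));
	pr_info("waitvar_demo: refs dropped to zero\n");
	return 0;
}

static void __exit waitvar_demo_exit(void)
{
	kthread_stop(worker);
}

module_init(waitvar_demo_init);
module_exit(waitvar_demo_exit);
MODULE_LICENSE("GPL");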
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 29a5733eff83..f2fa2e940fe5 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -481,11 +481,18 @@ static int __init setup_tick_nohz(char *str)
481 481
482__setup("nohz=", setup_tick_nohz); 482__setup("nohz=", setup_tick_nohz);
483 483
484int tick_nohz_tick_stopped(void) 484bool tick_nohz_tick_stopped(void)
485{ 485{
486 return __this_cpu_read(tick_cpu_sched.tick_stopped); 486 return __this_cpu_read(tick_cpu_sched.tick_stopped);
487} 487}
488 488
489bool tick_nohz_tick_stopped_cpu(int cpu)
490{
491 struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu);
492
493 return ts->tick_stopped;
494}
495
489/** 496/**
490 * tick_nohz_update_jiffies - update jiffies when idle was interrupted 497 * tick_nohz_update_jiffies - update jiffies when idle was interrupted
491 * 498 *
@@ -741,12 +748,6 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts,
741 delta = KTIME_MAX; 748 delta = KTIME_MAX;
742 } 749 }
743 750
744#ifdef CONFIG_NO_HZ_FULL
745 /* Limit the tick delta to the maximum scheduler deferment */
746 if (!ts->inidle)
747 delta = min(delta, scheduler_tick_max_deferment());
748#endif
749
750 /* Calculate the next expiry time */ 751 /* Calculate the next expiry time */
751 if (delta < (KTIME_MAX - basemono)) 752 if (delta < (KTIME_MAX - basemono))
752 expires = basemono + delta; 753 expires = basemono + delta;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6ec6ba65127b..254e636a3d6b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5573,12 +5573,13 @@ static void __init wq_numa_init(void)
5573int __init workqueue_init_early(void) 5573int __init workqueue_init_early(void)
5574{ 5574{
5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; 5575 int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
5576 int hk_flags = HK_FLAG_DOMAIN | HK_FLAG_WQ;
5576 int i, cpu; 5577 int i, cpu;
5577 5578
5578 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); 5579 WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
5579 5580
5580 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); 5581 BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL));
5581 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_FLAG_DOMAIN)); 5582 cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(hk_flags));
5582 5583
5583 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); 5584 pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
5584 5585