Diffstat (limited to 'kernel/sched/fair.c')
 kernel/sched/fair.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 227 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
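Taken together, these defaults bound how hard a task's address space gets scanned: at the 100ms minimum period and the 256MB scan size, a busy task can have protections stripped on at most ~2.5GB/s of its mappings, and once backed off to the 5000ms maximum that falls to ~51MB/s. A standalone sketch of that arithmetic (plain userspace C, not part of the patch; the variable names merely mirror the sysctls above):

#include <stdio.h>

/* Standalone illustration of the scan-rate bounds implied by the
 * sysctl defaults above; this program is not part of the patch. */
int main(void)
{
	unsigned int scan_size_mb  = 256;	/* sysctl_numa_balancing_scan_size */
	unsigned int period_min_ms = 100;	/* ..._scan_period_min */
	unsigned int period_max_ms = 100 * 50;	/* ..._scan_period_max: 5000ms */

	printf("fastest scan rate: %.0f MB/s\n",
	       scan_size_mb * 1000.0 / period_min_ms);	/* 2560 MB/s */
	printf("slowest scan rate: %.1f MB/s\n",
	       scan_size_mb * 1000.0 / period_max_ms);	/* 51.2 MB/s */
	return 0;
}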
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+	struct task_struct *p = current;
+
+	if (!sched_feat_numa(NUMA))
+		return;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	/*
+	 * If pages are properly placed (did not migrate) then scan slower.
+	 * This is reset periodically in case of phase changes
+	 */
+	if (!migrated)
+		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+			p->numa_scan_period + jiffies_to_msecs(10));
+
+	task_numa_placement(p);
+}
+
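The min() in task_numa_fault() is the patch's feedback loop in miniature: each fault where the page was already well placed (no migration needed) bumps the per-task scan period up by jiffies_to_msecs(10), i.e. 10ms at HZ=1000, until it saturates at the maximum; the periodic reset in task_numa_work() below pulls it back down when a phase change is suspected. A standalone restatement of that rule (the helper name is ours, not the kernel's):

#include <stdbool.h>

/* Hypothetical helper restating the backoff from task_numa_fault():
 * non-migrated faults slow future scanning, clamped at max_ms.
 * step_ms stands in for jiffies_to_msecs(10). */
static unsigned int backoff_scan_period(unsigned int period_ms, bool migrated,
					unsigned int max_ms, unsigned int step_ms)
{
	if (!migrated) {
		period_ms += step_ms;
		if (period_ms > max_ms)
			period_ms = max_ms;
	}
	return period_ms;
}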
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+	ACCESS_ONCE(p->mm->numa_scan_seq)++;
+	p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	unsigned long start, end;
+	long pages;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * We do not care about task placement until a task runs on a node
+	 * other than the first one used by the address space. This is
+	 * largely because migrations are driven by what CPU the task
+	 * is running on. If it's never scheduled on another node, it'll
+	 * not migrate so why bother trapping the fault.
+	 */
+	if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+		mm->first_nid = numa_node_id();
+	if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+		/* Are we running on a new node yet? */
+		if (numa_node_id() == mm->first_nid &&
+		    !sched_feat_numa(NUMA_FORCE))
+			return;
+
+		mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+	}
+
+	/*
+	 * Reset the scan period if enough time has gone by. Objective is that
+	 * scanning will be reduced if pages are properly placed. As tasks
+	 * can enter different phases this needs to be re-examined. Lacking
+	 * proper tracking of reference behaviour, this blunt hammer is used.
+	 */
+	migrate = mm->numa_next_reset;
+	if (time_after(now, migrate)) {
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+		xchg(&mm->numa_next_reset, next_scan);
+	}
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	/*
+	 * Do not set pte_numa if the current running node is rate-limited.
+	 * This loses statistics on the fault but if we are unwilling to
+	 * migrate to this node, it is less likely we can do useful work
+	 */
+	if (migrate_ratelimited(numa_node_id()))
+		return;
+
+	start = mm->numa_scan_offset;
+	pages = sysctl_numa_balancing_scan_size;
+	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+	if (!pages)
+		return;
+
+	down_read(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+	if (!vma) {
+		reset_ptenuma_scan(p);
+		start = 0;
+		vma = mm->mmap;
+	}
+	for (; vma; vma = vma->vm_next) {
+		if (!vma_migratable(vma))
+			continue;
+
+		/* Skip small VMAs. They are not likely to be of relevance */
+		if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+			continue;
+
+		do {
+			start = max(start, vma->vm_start);
+			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+			end = min(end, vma->vm_end);
+			pages -= change_prot_numa(vma, start, end);
+
+			start = end;
+			if (pages <= 0)
+				goto out;
+		} while (end != vma->vm_end);
+	}
+
+out:
+	/*
+	 * It is possible to reach the end of the VMA list but the last few
+	 * VMAs are not guaranteed to be vma_migratable. If they are not, we
+	 * would find the !migratable VMA on the next scan but not reset the
+	 * scanner to the start so check it now.
+	 */
+	if (vma)
+		mm->numa_scan_offset = start;
+	else
+		reset_ptenuma_scan(p);
+	up_read(&mm->mmap_sem);
+}
+
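Worth noting is how the function keeps the threads of one mm from scanning concurrently: every thread can reach this code from its own tick, but only the one whose cmpxchg() advances mm->numa_next_scan goes on to the VMA walk; the rest observe a changed value and bail out. A standalone model of that gate using C11 atomics (the kernel uses cmpxchg() plus the wraparound-safe time_before(); the plain '<' here is a simplification):

#include <stdatomic.h>
#include <stdbool.h>

/* Simplified model of the scan gate in task_numa_work(): at most one
 * thread per address space wins the right to scan in a given period. */
static bool claim_numa_scan(_Atomic unsigned long *numa_next_scan,
			    unsigned long now, unsigned long period)
{
	unsigned long next = atomic_load(numa_next_scan);

	if (now < next)		/* scanned too recently: rate-limited */
		return false;

	/* Whoever installs the new deadline first does the real work. */
	return atomic_compare_exchange_strong(numa_next_scan, &next,
					      now + period);
}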
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		if (!curr->node_stamp)
+			curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
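The runtime-vs-walltime comment deserves emphasis: since the trigger clock is se.sum_exec_runtime, a thread has to accumulate numa_scan_period worth of actual CPU time before task_numa_work() is queued, so sleeping or idle tasks never pay for PTE scanning at all. A standalone restatement of the trigger condition (the helper is illustrative, not from the patch):

#include <stdbool.h>
#include <stdint.h>

#define NSEC_PER_MSEC 1000000ULL

/* Illustrative restatement of the check in task_tick_numa(): fire only
 * once the task has burned scan_period_ms of CPU time since the last
 * trigger; wall-clock time spent blocked never counts. */
static bool numa_scan_due(uint64_t sum_exec_runtime, uint64_t *node_stamp,
			  unsigned int scan_period_ms)
{
	uint64_t period = (uint64_t)scan_period_ms * NSEC_PER_MSEC;

	if (sum_exec_runtime - *node_stamp <= period)
		return false;

	*node_stamp = sum_exec_runtime;
	return true;
}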
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		entity_tick(cfs_rq, se, queued);
 	}
 
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
+
 	update_rq_runnable_avg(rq, 1);
 }
 