Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--   kernel/sched/fair.c | 227
1 file changed, 227 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 100;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
+
+/* Portion of address space to scan in MB */
+unsigned int sysctl_numa_balancing_scan_size = 256;
+
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
+static void task_numa_placement(struct task_struct *p)
+{
+        int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+        if (p->numa_scan_seq == seq)
+                return;
+        p->numa_scan_seq = seq;
+
+        /* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages, bool migrated)
+{
+        struct task_struct *p = current;
+
+        if (!sched_feat_numa(NUMA))
+                return;
+
+        /* FIXME: Allocate task-specific structure for placement policy here */
+
+        /*
+         * If pages are properly placed (did not migrate) then scan slower.
+         * This is reset periodically in case of phase changes
+         */
+        if (!migrated)
+                p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+                        p->numa_scan_period + jiffies_to_msecs(10));
+
+        task_numa_placement(p);
+}
+
+static void reset_ptenuma_scan(struct task_struct *p)
+{
+        ACCESS_ONCE(p->mm->numa_scan_seq)++;
+        p->mm->numa_scan_offset = 0;
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+        unsigned long migrate, next_scan, now = jiffies;
+        struct task_struct *p = current;
+        struct mm_struct *mm = p->mm;
+        struct vm_area_struct *vma;
+        unsigned long start, end;
+        long pages;
+
+        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+        work->next = work; /* protect against double add */
+        /*
+         * Who cares about NUMA placement when they're dying.
+         *
+         * NOTE: make sure not to dereference p->mm before this check,
+         * exit_task_work() happens _after_ exit_mm() so we could be called
+         * without p->mm even though we still had it when we enqueued this
+         * work.
+         */
+        if (p->flags & PF_EXITING)
+                return;
+
+        /*
+         * We do not care about task placement until a task runs on a node
+         * other than the first one used by the address space. This is
+         * largely because migrations are driven by what CPU the task
+         * is running on. If it's never scheduled on another node, it'll
+         * not migrate so why bother trapping the fault.
+         */
+        if (mm->first_nid == NUMA_PTE_SCAN_INIT)
+                mm->first_nid = numa_node_id();
+        if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
+                /* Are we running on a new node yet? */
+                if (numa_node_id() == mm->first_nid &&
+                    !sched_feat_numa(NUMA_FORCE))
+                        return;
+
+                mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
+        }
+
+        /*
+         * Reset the scan period if enough time has gone by. Objective is that
+         * scanning will be reduced if pages are properly placed. As tasks
+         * can enter different phases this needs to be re-examined. Lacking
+         * proper tracking of reference behaviour, this blunt hammer is used.
+         */
+        migrate = mm->numa_next_reset;
+        if (time_after(now, migrate)) {
+                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+                next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+                xchg(&mm->numa_next_reset, next_scan);
+        }
+
+        /*
+         * Enforce maximal scan/migration frequency..
+         */
+        migrate = mm->numa_next_scan;
+        if (time_before(now, migrate))
+                return;
+
+        if (p->numa_scan_period == 0)
+                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
+        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+                return;
+
+        /*
+         * Do not set pte_numa if the current running node is rate-limited.
+         * This loses statistics on the fault but if we are unwilling to
+         * migrate to this node, it is less likely we can do useful work
+         */
+        if (migrate_ratelimited(numa_node_id()))
+                return;
+
+        start = mm->numa_scan_offset;
+        pages = sysctl_numa_balancing_scan_size;
+        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
+        if (!pages)
+                return;
+
+        down_read(&mm->mmap_sem);
+        vma = find_vma(mm, start);
+        if (!vma) {
+                reset_ptenuma_scan(p);
+                start = 0;
+                vma = mm->mmap;
+        }
+        for (; vma; vma = vma->vm_next) {
+                if (!vma_migratable(vma))
+                        continue;
+
+                /* Skip small VMAs. They are not likely to be of relevance */
+                if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
+                        continue;
+
+                do {
+                        start = max(start, vma->vm_start);
+                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
+                        end = min(end, vma->vm_end);
+                        pages -= change_prot_numa(vma, start, end);
+
+                        start = end;
+                        if (pages <= 0)
+                                goto out;
+                } while (end != vma->vm_end);
+        }
+
+out:
+        /*
+         * It is possible to reach the end of the VMA list but the last few VMAs are
+         * not guaranteed to the vma_migratable. If they are not, we would find the
+         * !migratable VMA on the next scan but not reset the scanner to the start
+         * so check it now.
+         */
+        if (vma)
+                mm->numa_scan_offset = start;
+        else
+                reset_ptenuma_scan(p);
+        up_read(&mm->mmap_sem);
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+        struct callback_head *work = &curr->numa_work;
+        u64 period, now;
+
+        /*
+         * We don't care about NUMA placement if we don't have memory.
+         */
+        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+                return;
+
+        /*
+         * Using runtime rather than walltime has the dual advantage that
+         * we (mostly) drive the selection from busy threads and that the
+         * task needs to have done some actual work before we bother with
+         * NUMA placement.
+         */
+        now = curr->se.sum_exec_runtime;
+        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+        if (now - curr->node_stamp > period) {
+                if (!curr->node_stamp)
+                        curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+                curr->node_stamp = now;
+
+                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+                        task_work_add(curr, work, true);
+                }
+        }
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
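The scan loop in task_numa_work() above converts sysctl_numa_balancing_scan_size from MB into base pages (pages <<= 20 - PAGE_SHIFT) and then walks each VMA in windows whose end is rounded up to HPAGE_SIZE and clamped to the VMA boundary, deducting what was marked from the page budget. The following is a minimal standalone userspace sketch of that window arithmetic, not kernel code: PAGE_SHIFT, HPAGE_SIZE, the VMA addresses and the assumption that every page in a window is counted against the budget are illustrative choices (the kernel only subtracts what change_prot_numa() actually updated).

/*
 * Userspace model of the scan-window arithmetic in task_numa_work().
 * Assumes 4 KiB base pages and 2 MiB huge pages; the VMA below is invented.
 */
#include <stdio.h>

#define PAGE_SHIFT      12UL                    /* assumed: 4 KiB base pages */
#define HPAGE_SIZE      (2UL << 20)             /* assumed: 2 MiB huge pages */
#define ALIGN(x, a)     (((x) + (a) - 1) & ~((a) - 1))
#define MIN(a, b)       ((a) < (b) ? (a) : (b))
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int main(void)
{
        unsigned long scan_size_mb = 256;       /* sysctl_numa_balancing_scan_size */
        long pages = scan_size_mb << (20 - PAGE_SHIFT);         /* MB -> base pages */

        /* One illustrative VMA; the addresses are made up. */
        unsigned long vma_start = 0x700000000000UL;
        unsigned long vma_end   = vma_start + (512UL << 20);    /* 512 MiB VMA */
        unsigned long start = vma_start, end;

        printf("scan budget: %ld pages (%lu MB)\n", pages, scan_size_mb);

        do {
                start = MAX(start, vma_start);
                end = ALIGN(start + ((unsigned long)pages << PAGE_SHIFT), HPAGE_SIZE);
                end = MIN(end, vma_end);
                printf("window: %#lx-%#lx (%lu pages)\n",
                       start, end, (end - start) >> PAGE_SHIFT);
                /* Model: count the whole window against the budget. */
                pages -= (long)((end - start) >> PAGE_SHIFT);
                start = end;
        } while (pages > 0 && end != vma_end);

        return 0;
}

With a 256 MB budget and 4 KiB pages the budget is 65536 pages, so the 512 MiB VMA is covered by a single 256 MiB window and the remainder is left for the next pass, which resumes from mm->numa_scan_offset.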
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
                entity_tick(cfs_rq, se, queued);
        }
 
+       if (sched_feat_numa(NUMA))
+               task_tick_numa(rq, curr);
+
        update_rq_runnable_avg(rq, 1);
 }
 
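The scan rate itself is adaptive: each fault on a page that did not need migrating bumps p->numa_scan_period by jiffies_to_msecs(10), capped at sysctl_numa_balancing_scan_period_max, and task_numa_work() knocks the period back to the minimum once sysctl_numa_balancing_scan_period_reset has elapsed so that phase changes are picked up again. Below is a rough standalone model of that behaviour, assuming HZ=1000 (so the bump is 10 ms) and an invented rate of five well-placed faults per scan; the accumulated scan periods stand in crudely for wall-clock time between scans.

/*
 * Userspace sketch of how p->numa_scan_period evolves. The fault rate and
 * the time model are invented purely to show the linear back-off toward
 * scan_period_max and the periodic reset back to scan_period_min.
 */
#include <stdio.h>

#define SCAN_PERIOD_MIN         100U            /* ms */
#define SCAN_PERIOD_MAX         (100U * 50)     /* 5000 ms */
#define SCAN_PERIOD_RESET       (100U * 600)    /* 60000 ms */

static unsigned int min_u(unsigned int a, unsigned int b)
{
        return a < b ? a : b;
}

int main(void)
{
        unsigned int scan_period = SCAN_PERIOD_MIN;
        unsigned long now_ms = 0, next_reset_ms = SCAN_PERIOD_RESET;
        int scan, fault;

        for (scan = 0; scan < 200; scan++) {
                /* Blunt-hammer reset, as in task_numa_work(). */
                if (now_ms >= next_reset_ms) {
                        scan_period = SCAN_PERIOD_MIN;
                        next_reset_ms = now_ms + SCAN_PERIOD_RESET;
                        printf("t=%6lums scan %3d: reset to %u ms\n",
                               now_ms, scan, scan_period);
                }

                /* Each well-placed (non-migrated) fault backs the scan rate off. */
                for (fault = 0; fault < 5; fault++)
                        scan_period = min_u(SCAN_PERIOD_MAX, scan_period + 10);

                if (scan % 20 == 0)
                        printf("t=%6lums scan %3d: scan_period = %u ms\n",
                               now_ms, scan, scan_period);

                now_ms += scan_period;  /* next scan is one period away */
        }
        return 0;
}

The exact numbers are not meaningful; the point is only that the period climbs toward the 5000 ms cap while placement looks good and periodically snaps back to 100 ms so a new execution phase gets re-sampled.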