Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c     |  13
-rw-r--r--  kernel/sched/fair.c     | 125
-rw-r--r--  kernel/sched/features.h |   7
-rw-r--r--  kernel/sched/sched.h    |   6
-rw-r--r--  kernel/sysctl.c         |  24
5 files changed, 173 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
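The fields initialized in this hunk live in struct task_struct and struct mm_struct; the companion include/linux/sched.h and include/linux/mm_types.h changes fall outside this diffstat, which is limited to 'kernel'. A minimal sketch of the declarations this hunk assumes (assumption: field comments are ours, not from the patch):

    /* Sketch only: the real declarations are added to include/linux/sched.h
     * and include/linux/mm_types.h by companion patches not shown here. */
    #include <linux/types.h>	/* u64, struct callback_head */

    struct mm_struct {
    	/* ... existing fields ... */
    	unsigned long		numa_next_scan;		/* jiffies deadline for the next VMA scan */
    	int			numa_scan_seq;		/* scan generation, bumped once per pass */
    };

    struct task_struct {
    	/* ... existing fields ... */
    	u64			node_stamp;		/* sum_exec_runtime at last scan trigger */
    	int			numa_scan_seq;		/* mm generation this task last acted on */
    	int			numa_migrate_seq;
    	unsigned int		numa_scan_period;	/* per-task scan period, in ms */
    	struct callback_head	numa_work;		/* task_work that runs task_numa_work() */
    };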
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
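Note the throttle in task_numa_work() above: every thread of a process can reach this path from its own tick, so mm->numa_next_scan is advanced with cmpxchg() and only the thread whose exchange succeeds walks the VMA list; the losers simply return. A standalone userspace analogue of that pattern (hypothetical code: C11 atomics stand in for the kernel's cmpxchg(), and the plain comparison skips the jiffies-wraparound handling that time_before() provides):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static _Atomic unsigned long next_scan;	/* stands in for mm->numa_next_scan */

    static bool try_claim_scan(unsigned long now, unsigned long period)
    {
    	unsigned long deadline = atomic_load(&next_scan);

    	if (now < deadline)			/* cf. time_before(now, migrate) */
    		return false;

    	/* Race to move the deadline forward; only one caller can win. */
    	return atomic_compare_exchange_strong(&next_scan, &deadline,
    					      now + period);
    }

    int main(void)
    {
    	printf("%d\n", try_claim_scan(100, 50)); /* 1: claimed, deadline -> 150 */
    	printf("%d\n", try_claim_scan(100, 50)); /* 0: 100 < 150, throttled */
    	return 0;
    }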
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
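task_tick_numa(), called from the hook above, deliberately measures the scan period in consumed CPU time (se.sum_exec_runtime) rather than wall-clock time, so only tasks that actually run accumulate toward a scan. A sketch of that trigger arithmetic (hypothetical userspace struct and helper, mirroring the sum_exec_runtime/node_stamp comparison):

    #include <stdint.h>
    #include <stdbool.h>

    #define NSEC_PER_MSEC 1000000ULL

    struct task {
    	uint64_t sum_exec_runtime;	/* ns of CPU time consumed */
    	uint64_t node_stamp;		/* runtime at last scan trigger */
    	unsigned int numa_scan_period;	/* ms */
    };

    static bool numa_scan_due(struct task *t)
    {
    	uint64_t period = (uint64_t)t->numa_scan_period * NSEC_PER_MSEC;

    	if (t->sum_exec_runtime - t->node_stamp > period) {
    		t->node_stamp = t->sum_exec_runtime;
    		return true;		/* would queue task_numa_work() */
    	}
    	return false;
    }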
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
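For context, features.h is an X-macro list: kernel/sched/sched.h re-includes it with different SCHED_FEAT() definitions to build the bit indices and the default feature mask, and under CONFIG_SCHED_DEBUG the resulting NUMA bit can be flipped at runtime through /sys/kernel/debug/sched_features. A compressed, self-contained sketch of that mechanism (the inline SCHED_FEATURES list is a stand-in for the real #include of features.h):

    #include <stdbool.h>
    #include <stdio.h>

    #define SCHED_FEATURES(F)	\
    	F(LB_MIN, false)	\
    	F(NUMA, true)

    /* Pass 1: turn each entry into an enum bit index. */
    #define F_ENUM(name, enabled) __SCHED_FEAT_##name,
    enum { SCHED_FEATURES(F_ENUM) __SCHED_FEAT_NR };

    /* Pass 2: turn each entry into its default-mask contribution. */
    #define F_BIT(name, enabled) ((enabled) ? 1UL << __SCHED_FEAT_##name : 0) |
    static const unsigned long sysctl_sched_features = SCHED_FEATURES(F_BIT) 0;

    #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))

    int main(void)
    {
    	printf("NUMA default: %s\n", sched_feat(NUMA) ? "on" : "off");
    	return 0;
    }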
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
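The (0) in the fallback definition matters: on !CONFIG_NUMA_BALANCING builds the guard added to task_tick_fair() constant-folds to if (0), so the compiler discards the call and non-NUMA kernels pay nothing at tick time. A minimal sketch of that effect (simplified stand-ins, not the real kernel signatures):

    #define sched_feat_numa(x) (0)	/* the !CONFIG_NUMA_BALANCING case */

    static void task_tick_numa(void) { }	/* stand-in for the empty stub */

    static void task_tick_fair_tail(void)
    {
    	if (sched_feat_numa(NUMA))	/* constant-folds to if (0) */
    		task_tick_numa();	/* provably dead, eliminated */
    }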
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000;		/* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
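Both new entries nest inside the CONFIG_SCHED_DEBUG block as well as their own CONFIG_NUMA_BALANCING guard, so the /proc files only appear when both options are enabled. A small usage sketch reading one of the tunables from userspace (hypothetical program, not part of the patch):

    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/proc/sys/kernel/numa_balancing_scan_period_min_ms", "r");
    	unsigned int ms;

    	if (!f || fscanf(f, "%u", &ms) != 1) {
    		perror("numa_balancing_scan_period_min_ms");
    		return 1;
    	}
    	printf("min scan period: %u ms\n", ms);
    	fclose(f);
    	return 0;
    }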