Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c      |  13
-rw-r--r--  kernel/sched/fair.c      | 125
-rw-r--r--  kernel/sched/features.h  |   7
-rw-r--r--  kernel/sched/sched.h     |   6
-rw-r--r--  kernel/sysctl.c          |  24
5 files changed, 173 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2d8927fda712..cad0d092ce3b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1533,6 +1533,19 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 	INIT_HLIST_HEAD(&p->preempt_notifiers);
 #endif
+
+#ifdef CONFIG_NUMA_BALANCING
+	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
+		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_scan_seq = 0;
+	}
+
+	p->node_stamp = 0ULL;
+	p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
+	p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
+	p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+	p->numa_work.next = &p->numa_work;
+#endif /* CONFIG_NUMA_BALANCING */
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
 #include <linux/slab.h>
 #include <linux/profile.h>
 #include <linux/interrupt.h>
+#include <linux/mempolicy.h>
+#include <linux/task_work.h>
 
 #include <trace/events/sched.h>
 
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * Scheduling class queueing methods:
  */
 
+#ifdef CONFIG_NUMA_BALANCING
+/*
+ * numa task sample period in ms: 5s
+ */
+unsigned int sysctl_numa_balancing_scan_period_min = 5000;
+unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
+
+static void task_numa_placement(struct task_struct *p)
+{
+	int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
+
+	if (p->numa_scan_seq == seq)
+		return;
+	p->numa_scan_seq = seq;
+
+	/* FIXME: Scheduling placement policy hints go here */
+}
+
+/*
+ * Got a PROT_NONE fault for a page on @node.
+ */
+void task_numa_fault(int node, int pages)
+{
+	struct task_struct *p = current;
+
+	/* FIXME: Allocate task-specific structure for placement policy here */
+
+	task_numa_placement(p);
+}
+
+/*
+ * The expensive part of numa migration is done from task_work context.
+ * Triggered from task_tick_numa().
+ */
+void task_numa_work(struct callback_head *work)
+{
+	unsigned long migrate, next_scan, now = jiffies;
+	struct task_struct *p = current;
+	struct mm_struct *mm = p->mm;
+
+	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
+
+	work->next = work; /* protect against double add */
+	/*
+	 * Who cares about NUMA placement when they're dying.
+	 *
+	 * NOTE: make sure not to dereference p->mm before this check,
+	 * exit_task_work() happens _after_ exit_mm() so we could be called
+	 * without p->mm even though we still had it when we enqueued this
+	 * work.
+	 */
+	if (p->flags & PF_EXITING)
+		return;
+
+	/*
+	 * Enforce maximal scan/migration frequency..
+	 */
+	migrate = mm->numa_next_scan;
+	if (time_before(now, migrate))
+		return;
+
+	if (p->numa_scan_period == 0)
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+
+	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
+	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
+		return;
+
+	ACCESS_ONCE(mm->numa_scan_seq)++;
+	{
+		struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		for (vma = mm->mmap; vma; vma = vma->vm_next) {
+			if (!vma_migratable(vma))
+				continue;
+			change_prot_numa(vma, vma->vm_start, vma->vm_end);
+		}
+		up_read(&mm->mmap_sem);
+	}
+}
+
+/*
+ * Drive the periodic memory faults..
+ */
+void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+	struct callback_head *work = &curr->numa_work;
+	u64 period, now;
+
+	/*
+	 * We don't care about NUMA placement if we don't have memory.
+	 */
+	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+		return;
+
+	/*
+	 * Using runtime rather than walltime has the dual advantage that
+	 * we (mostly) drive the selection from busy threads and that the
+	 * task needs to have done some actual work before we bother with
+	 * NUMA placement.
+	 */
+	now = curr->se.sum_exec_runtime;
+	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
+
+	if (now - curr->node_stamp > period) {
+		curr->node_stamp = now;
+
+		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
+			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
+			task_work_add(curr, work, true);
+		}
+	}
+}
+#else
+static void task_tick_numa(struct rq *rq, struct task_struct *curr)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
 static void
 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 		cfs_rq = cfs_rq_of(se);
 		entity_tick(cfs_rq, se, queued);
 	}
+
+	if (sched_feat_numa(NUMA))
+		task_tick_numa(rq, curr);
 }
 
 /*
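
Note on the throttling in task_numa_work() above: threads sharing an mm all reach the deadline check, but only the one whose cmpxchg() on mm->numa_next_scan succeeds pushes the window forward and pays for the VMA walk. A minimal userspace sketch of that same "one winner per period" pattern, using C11 atomics; the names (next_scan, scan_period_ms) and the millisecond clock are illustrative only, not part of the patch:

#define _POSIX_C_SOURCE 200809L
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <time.h>

/* Shared deadline, analogous to mm->numa_next_scan (illustrative only). */
static _Atomic unsigned long next_scan;
static const unsigned long scan_period_ms = 5000;	/* cf. scan_period_min */

static unsigned long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000UL + ts.tv_nsec / 1000000UL;
}

/* Called from every thread; at most one caller per period returns true. */
static bool try_start_scan(void)
{
	unsigned long now = now_ms();
	unsigned long deadline = atomic_load(&next_scan);

	if (now < deadline)
		return false;	/* too early, mirrors the time_before() check */

	/* Only the thread whose CAS succeeds does the expensive work,
	 * mirroring the cmpxchg() on mm->numa_next_scan above. */
	return atomic_compare_exchange_strong(&next_scan, &deadline,
					      now + 2 * scan_period_ms);
}

int main(void)
{
	if (try_start_scan())
		printf("this thread would walk the VMA list now\n");
	return 0;
}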
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index eebefcad7027..5fb7aefbec80 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -61,3 +61,10 @@ SCHED_FEAT(TTWU_QUEUE, true)
 SCHED_FEAT(FORCE_SD_OVERLAP, false)
 SCHED_FEAT(RT_RUNTIME_SHARE, true)
 SCHED_FEAT(LB_MIN, false)
+
+/*
+ * Apply the automatic NUMA scheduling policy
+ */
+#ifdef CONFIG_NUMA_BALANCING
+SCHED_FEAT(NUMA, true)
+#endif
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7a7db09cfabc..ae31c051ff2f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -648,6 +648,12 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR];
 #define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
 #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
 
+#ifdef CONFIG_NUMA_BALANCING
+#define sched_feat_numa(x) sched_feat(x)
+#else
+#define sched_feat_numa(x) (0)
+#endif
+
 static inline u64 global_rt_period(void)
 {
 	return (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
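
The sched_feat_numa() wrapper above combines the runtime feature bit with a compile-time guard, so the call site in task_tick_fair() collapses to nothing when CONFIG_NUMA_BALANCING is off. A small standalone sketch of that feature-bit-plus-guard pattern, with hypothetical names (my_feat, MY_FEAT_NUMA, USE_NUMA) rather than the kernel's:

#include <stdio.h>

enum { MY_FEAT_NUMA, MY_FEAT_NR };

static unsigned long my_features = 1UL << MY_FEAT_NUMA;

/* Runtime check against a feature bitmask, like sched_feat(). */
#define my_feat(x) (my_features & (1UL << MY_FEAT_##x))

#ifdef USE_NUMA
#define my_feat_numa(x) my_feat(x)
#else
#define my_feat_numa(x) (0)	/* call sites become dead code and vanish */
#endif

int main(void)
{
	printf("feature mask: %#lx\n", my_features);
	if (my_feat_numa(NUMA))
		printf("NUMA feature enabled\n");
	else
		printf("NUMA feature compiled out or disabled\n");
	return 0;
}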
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 26f65eaa01f9..025e1ae50ef1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -256,9 +256,11 @@ static int min_sched_granularity_ns = 100000; /* 100 usecs */
 static int max_sched_granularity_ns = NSEC_PER_SEC;	/* 1 second */
 static int min_wakeup_granularity_ns;			/* 0 usecs */
 static int max_wakeup_granularity_ns = NSEC_PER_SEC;	/* 1 second */
+#ifdef CONFIG_SMP
 static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE;
 static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1;
-#endif
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_SCHED_DEBUG */
 
 #ifdef CONFIG_COMPACTION
 static int min_extfrag_threshold;
@@ -301,6 +303,7 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &min_wakeup_granularity_ns,
 		.extra2		= &max_wakeup_granularity_ns,
 	},
+#ifdef CONFIG_SMP
 	{
 		.procname	= "sched_tunable_scaling",
 		.data		= &sysctl_sched_tunable_scaling,
@@ -347,7 +350,24 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &zero,
 		.extra2		= &one,
 	},
-#endif
+#endif /* CONFIG_SMP */
+#ifdef CONFIG_NUMA_BALANCING
+	{
+		.procname	= "numa_balancing_scan_period_min_ms",
+		.data		= &sysctl_numa_balancing_scan_period_min,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "numa_balancing_scan_period_max_ms",
+		.data		= &sysctl_numa_balancing_scan_period_max,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif /* CONFIG_NUMA_BALANCING */
+#endif /* CONFIG_SCHED_DEBUG */
 	{
 		.procname	= "sched_rt_period_us",
 		.data		= &sysctl_sched_rt_period,
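
The two ctl_table entries added above export the scan period tunables. Assuming the usual mapping of kern_table under /proc/sys/kernel/, they should surface as numa_balancing_scan_period_min_ms and numa_balancing_scan_period_max_ms (and only when CONFIG_SCHED_DEBUG and CONFIG_NUMA_BALANCING are set). A small userspace sketch that reads them back; the file paths are assumed from that mapping, not stated in the patch:

#include <stdio.h>

static const char *min_path =
	"/proc/sys/kernel/numa_balancing_scan_period_min_ms";
static const char *max_path =
	"/proc/sys/kernel/numa_balancing_scan_period_max_ms";

/* Read a single unsigned integer from a procfs file. */
static int read_uint(const char *path, unsigned int *val)
{
	FILE *f = fopen(path, "r");

	if (!f)
		return -1;
	if (fscanf(f, "%u", val) != 1) {
		fclose(f);
		return -1;
	}
	fclose(f);
	return 0;
}

int main(void)
{
	unsigned int min_ms, max_ms;

	if (read_uint(min_path, &min_ms) == 0 &&
	    read_uint(max_path, &max_ms) == 0)
		printf("numa scan period: min=%u ms max=%u ms\n", min_ms, max_ms);
	else
		perror("reading numa_balancing sysctls");
	return 0;
}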