aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
authorPeter Zijlstra <a.p.zijlstra@chello.nl>2012-10-25 08:16:43 -0400
committerMel Gorman <mgorman@suse.de>2012-12-11 09:42:45 -0500
commitcbee9f88ec1b8dd6b58f25f54e4f52c82ed77690 (patch)
treed4cfbcfa3e89742216cd792d4aa914356406b532 /kernel/sched/fair.c
parenta720094ded8cbb303111035be91858011d2eac71 (diff)
mm: numa: Add fault driven placement and migration
NOTE: This patch is based on "sched, numa, mm: Add fault driven placement and migration policy" but as it throws away all the policy to just leave a basic foundation I had to drop the signed-offs-by. This patch creates a bare-bones method for setting PTEs pte_numa in the context of the scheduler that when faulted later will be faulted onto the node the CPU is running on. In itself this does nothing useful but any placement policy will fundamentally depend on receiving hints on placement from fault context and doing something intelligent about it. Signed-off-by: Mel Gorman <mgorman@suse.de> Acked-by: Rik van Riel <riel@redhat.com>
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c125
1 file changed, 125 insertions(+), 0 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6b800a14b990..6831abb5dbef 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,8 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/task_work.h>
29 31
30#include <trace/events/sched.h> 32#include <trace/events/sched.h>
31 33
@@ -776,6 +778,126 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
776 * Scheduling class queueing methods: 778 * Scheduling class queueing methods:
777 */ 779 */
778 780
781#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task scan period bounds in ms: min 5s, max 80s (5s * 16)
 */
785unsigned int sysctl_numa_balancing_scan_period_min = 5000;
786unsigned int sysctl_numa_balancing_scan_period_max = 5000*16;
787
788static void task_numa_placement(struct task_struct *p)
789{
790 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
791
792 if (p->numa_scan_seq == seq)
793 return;
794 p->numa_scan_seq = seq;
795
796 /* FIXME: Scheduling placement policy hints go here */
797}
798
799/*
800 * Got a PROT_NONE fault for a page on @node.
801 */
802void task_numa_fault(int node, int pages)
803{
804 struct task_struct *p = current;
805
806 /* FIXME: Allocate task-specific structure for placement policy here */
807
808 task_numa_placement(p);
809}
810
/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 *
 * Walks every migratable VMA of the current mm and marks its PTEs
 * pte_numa (PROT_NONE), so that subsequent accesses fault and report
 * into task_numa_fault().
 */
void task_numa_work(struct callback_head *work)
{
	unsigned long migrate, next_scan, now = jiffies;
	struct task_struct *p = current;
	struct mm_struct *mm = p->mm;

	/* This callback must only ever run for the task that enqueued it. */
	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

	work->next = work; /* protect against double add */
	/*
	 * Who cares about NUMA placement when they're dying.
	 *
	 * NOTE: make sure not to dereference p->mm before this check,
	 * exit_task_work() happens _after_ exit_mm() so we could be called
	 * without p->mm even though we still had it when we enqueued this
	 * work.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Enforce maximal scan/migration frequency..
	 */
	migrate = mm->numa_next_scan;
	if (time_before(now, migrate))
		return;

	/* First scan for this task: start from the sysctl minimum period. */
	if (p->numa_scan_period == 0)
		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

	/*
	 * Advance the mm-wide deadline; the cmpxchg guarantees that of all
	 * threads sharing this mm that raced here, exactly one proceeds to
	 * do the (expensive) VMA walk below.
	 */
	next_scan = now + 2*msecs_to_jiffies(p->numa_scan_period);
	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
		return;

	/* New scan epoch: lets task_numa_placement() notice fresh faults. */
	ACCESS_ONCE(mm->numa_scan_seq)++;
	{
		struct vm_area_struct *vma;

		down_read(&mm->mmap_sem);
		for (vma = mm->mmap; vma; vma = vma->vm_next) {
			if (!vma_migratable(vma))
				continue;
			change_prot_numa(vma, vma->vm_start, vma->vm_end);
		}
		up_read(&mm->mmap_sem);
	}
}
862
/*
 * Drive the periodic memory faults..
 *
 * Called from the scheduler tick; when enough runtime has accumulated,
 * queue task_numa_work() to run on return to user space.
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
	struct callback_head *work = &curr->numa_work;
	u64 period, now;

	/*
	 * We don't care about NUMA placement if we don't have memory.
	 *
	 * work->next != work means the callback is already queued (see
	 * "protect against double add" in task_numa_work()).
	 */
	if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
		return;

	/*
	 * Using runtime rather than walltime has the dual advantage that
	 * we (mostly) drive the selection from busy threads and that the
	 * task needs to have done some actual work before we bother with
	 * NUMA placement.
	 */
	now = curr->se.sum_exec_runtime;
	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

	if (now - curr->node_stamp > period) {
		curr->node_stamp = now;

		/* Only queue if the mm-wide scan deadline has also passed. */
		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
			task_work_add(curr, work, true);
		}
	}
}
895#else
/* No-op stub when CONFIG_NUMA_BALANCING is disabled. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
899#endif /* CONFIG_NUMA_BALANCING */
900
779static void 901static void
780account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 902account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
781{ 903{
@@ -4954,6 +5076,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
4954 cfs_rq = cfs_rq_of(se); 5076 cfs_rq = cfs_rq_of(se);
4955 entity_tick(cfs_rq, se, queued); 5077 entity_tick(cfs_rq, se, queued);
4956 } 5078 }
5079
5080 if (sched_feat_numa(NUMA))
5081 task_tick_numa(rq, curr);
4957} 5082}
4958 5083
4959/* 5084/*