aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2012-12-16 17:33:25 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2012-12-16 18:18:08 -0500
commit3d59eebc5e137bd89c6351e4c70e90ba1d0dc234 (patch)
treeb4ddfd0b057454a7437a3b4e3074a3b8b4b03817 /kernel/sched/fair.c
parent11520e5e7c1855fc3bf202bb3be35a39d9efa034 (diff)
parent4fc3f1d66b1ef0d7b8dc11f4ff1cc510f78b37d6 (diff)
Merge tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma
Pull Automatic NUMA Balancing bare-bones from Mel Gorman: "There are three implementations for NUMA balancing, this tree (balancenuma), numacore which has been developed in tip/master and autonuma which is in aa.git. In almost all respects balancenuma is the dumbest of the three because its main impact is on the VM side with no attempt to be smart about scheduling. In the interest of getting the ball rolling, it would be desirable to see this much merged for 3.8 with the view to building scheduler smarts on top and adapting the VM where required for 3.9. The most recent set of comparisons available from different people are mel: https://lkml.org/lkml/2012/12/9/108 mingo: https://lkml.org/lkml/2012/12/7/331 tglx: https://lkml.org/lkml/2012/12/10/437 srikar: https://lkml.org/lkml/2012/12/10/397 The results are a mixed bag. In my own tests, balancenuma does reasonably well. It's dumb as rocks and does not regress against mainline. On the other hand, Ingo's tests show that balancenuma is incapable of converging for these workloads driven by perf which is bad but is potentially explained by the lack of scheduler smarts. Thomas' results show balancenuma improves on mainline but falls far short of numacore or autonuma. Srikar's results indicate we all suffer on a large machine with imbalanced node sizes. My own testing showed that recent numacore results have improved dramatically, particularly in the last week but not universally. We've butted heads heavily on system CPU usage and high levels of migration even when it shows that overall performance is better. There are also cases where it regresses. Of interest is that for specjbb in some configurations it will regress for lower numbers of warehouses and show gains for higher numbers which is not reported by the tool by default and sometimes missed in reports. Recently I reported for numacore that the JVM was crashing with NullPointerExceptions but currently it's unclear what the source of this problem is. 
Initially I thought it was in how numacore batch handles PTEs but I no longer think this is the case. It's possible numacore is just able to trigger it due to higher rates of migration. These reports were quite late in the cycle so I/we would like to start with this tree as it contains much of the code we can agree on and has not changed significantly over the last 2-3 weeks." * tag 'balancenuma-v11' of git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux-balancenuma: (50 commits) mm/rmap, migration: Make rmap_walk_anon() and try_to_unmap_anon() more scalable mm/rmap: Convert the struct anon_vma::mutex to an rwsem mm: migrate: Account a transhuge page properly when rate limiting mm: numa: Account for failed allocations and isolations as migration failures mm: numa: Add THP migration for the NUMA working set scanning fault case build fix mm: numa: Add THP migration for the NUMA working set scanning fault case. mm: sched: numa: Delay PTE scanning until a task is scheduled on a new node mm: sched: numa: Control enabling and disabling of NUMA balancing if !SCHED_DEBUG mm: sched: numa: Control enabling and disabling of NUMA balancing mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate mm: numa: Use a two-stage filter to restrict pages being migrated for unlikely task<->node relationships mm: numa: migrate: Set last_nid on newly allocated page mm: numa: split_huge_page: Transfer last_nid on tail page mm: numa: Introduce last_nid to the page frame sched: numa: Slowly increase the scanning period as NUMA faults are handled mm: numa: Rate limit setting of pte_numa if node is saturated mm: numa: Rate limit the amount of memory that is migrated between nodes mm: numa: Structures for Migrate On Fault per NUMA migration rate limiting mm: numa: Migrate pages handled during a pmd_numa hinting fault mm: numa: Migrate on reference policy ...
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c227
1 files changed, 227 insertions, 0 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 756f9f9e8542..9af5af979a13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -26,6 +26,9 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/profile.h> 27#include <linux/profile.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/mempolicy.h>
30#include <linux/migrate.h>
31#include <linux/task_work.h>
29 32
30#include <trace/events/sched.h> 33#include <trace/events/sched.h>
31 34
@@ -774,6 +777,227 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
774 * Scheduling class queueing methods: 777 * Scheduling class queueing methods:
775 */ 778 */
776 779
780#ifdef CONFIG_NUMA_BALANCING
/*
 * NUMA task sample period tunables, all in ms: the shortest period, the
 * longest period, and the interval after which a task's period is reset.
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 5000;
unsigned int sysctl_numa_balancing_scan_period_reset = 60000;

/* Amount of address space, in MB, covered by one scan pass */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
793
794static void task_numa_placement(struct task_struct *p)
795{
796 int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
797
798 if (p->numa_scan_seq == seq)
799 return;
800 p->numa_scan_seq = seq;
801
802 /* FIXME: Scheduling placement policy hints go here */
803}
804
805/*
806 * Got a PROT_NONE fault for a page on @node.
807 */
808void task_numa_fault(int node, int pages, bool migrated)
809{
810 struct task_struct *p = current;
811
812 if (!sched_feat_numa(NUMA))
813 return;
814
815 /* FIXME: Allocate task-specific structure for placement policy here */
816
817 /*
818 * If pages are properly placed (did not migrate) then scan slower.
819 * This is reset periodically in case of phase changes
820 */
821 if (!migrated)
822 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
823 p->numa_scan_period + jiffies_to_msecs(10));
824
825 task_numa_placement(p);
826}
827
828static void reset_ptenuma_scan(struct task_struct *p)
829{
830 ACCESS_ONCE(p->mm->numa_scan_seq)++;
831 p->mm->numa_scan_offset = 0;
832}
833
834/*
835 * The expensive part of numa migration is done from task_work context.
836 * Triggered from task_tick_numa().
837 */
838void task_numa_work(struct callback_head *work)
839{
840 unsigned long migrate, next_scan, now = jiffies;
841 struct task_struct *p = current;
842 struct mm_struct *mm = p->mm;
843 struct vm_area_struct *vma;
844 unsigned long start, end;
845 long pages;
846
847 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
848
849 work->next = work; /* protect against double add */
850 /*
851 * Who cares about NUMA placement when they're dying.
852 *
853 * NOTE: make sure not to dereference p->mm before this check,
854 * exit_task_work() happens _after_ exit_mm() so we could be called
855 * without p->mm even though we still had it when we enqueued this
856 * work.
857 */
858 if (p->flags & PF_EXITING)
859 return;
860
861 /*
862 * We do not care about task placement until a task runs on a node
863 * other than the first one used by the address space. This is
864 * largely because migrations are driven by what CPU the task
865 * is running on. If it's never scheduled on another node, it'll
866 * not migrate so why bother trapping the fault.
867 */
868 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
869 mm->first_nid = numa_node_id();
870 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
871 /* Are we running on a new node yet? */
872 if (numa_node_id() == mm->first_nid &&
873 !sched_feat_numa(NUMA_FORCE))
874 return;
875
876 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
877 }
878
879 /*
880 * Reset the scan period if enough time has gone by. Objective is that
881 * scanning will be reduced if pages are properly placed. As tasks
882 * can enter different phases this needs to be re-examined. Lacking
883 * proper tracking of reference behaviour, this blunt hammer is used.
884 */
885 migrate = mm->numa_next_reset;
886 if (time_after(now, migrate)) {
887 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
888 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
889 xchg(&mm->numa_next_reset, next_scan);
890 }
891
892 /*
893 * Enforce maximal scan/migration frequency..
894 */
895 migrate = mm->numa_next_scan;
896 if (time_before(now, migrate))
897 return;
898
899 if (p->numa_scan_period == 0)
900 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
901
902 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
903 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
904 return;
905
906 /*
907 * Do not set pte_numa if the current running node is rate-limited.
908 * This loses statistics on the fault but if we are unwilling to
909 * migrate to this node, it is less likely we can do useful work
910 */
911 if (migrate_ratelimited(numa_node_id()))
912 return;
913
914 start = mm->numa_scan_offset;
915 pages = sysctl_numa_balancing_scan_size;
916 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
917 if (!pages)
918 return;
919
920 down_read(&mm->mmap_sem);
921 vma = find_vma(mm, start);
922 if (!vma) {
923 reset_ptenuma_scan(p);
924 start = 0;
925 vma = mm->mmap;
926 }
927 for (; vma; vma = vma->vm_next) {
928 if (!vma_migratable(vma))
929 continue;
930
931 /* Skip small VMAs. They are not likely to be of relevance */
932 if (((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) < HPAGE_PMD_NR)
933 continue;
934
935 do {
936 start = max(start, vma->vm_start);
937 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
938 end = min(end, vma->vm_end);
939 pages -= change_prot_numa(vma, start, end);
940
941 start = end;
942 if (pages <= 0)
943 goto out;
944 } while (end != vma->vm_end);
945 }
946
947out:
948 /*
949 * It is possible to reach the end of the VMA list but the last few VMAs are
950 * not guaranteed to the vma_migratable. If they are not, we would find the
951 * !migratable VMA on the next scan but not reset the scanner to the start
952 * so check it now.
953 */
954 if (vma)
955 mm->numa_scan_offset = start;
956 else
957 reset_ptenuma_scan(p);
958 up_read(&mm->mmap_sem);
959}
960
961/*
962 * Drive the periodic memory faults..
963 */
964void task_tick_numa(struct rq *rq, struct task_struct *curr)
965{
966 struct callback_head *work = &curr->numa_work;
967 u64 period, now;
968
969 /*
970 * We don't care about NUMA placement if we don't have memory.
971 */
972 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
973 return;
974
975 /*
976 * Using runtime rather than walltime has the dual advantage that
977 * we (mostly) drive the selection from busy threads and that the
978 * task needs to have done some actual work before we bother with
979 * NUMA placement.
980 */
981 now = curr->se.sum_exec_runtime;
982 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
983
984 if (now - curr->node_stamp > period) {
985 if (!curr->node_stamp)
986 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
987 curr->node_stamp = now;
988
989 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
990 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
991 task_work_add(curr, work, true);
992 }
993 }
994}
995#else
/* Without CONFIG_NUMA_BALANCING the NUMA tick hook has nothing to do. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
999#endif /* CONFIG_NUMA_BALANCING */
1000
777static void 1001static void
778account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 1002account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
779{ 1003{
@@ -5501,6 +5725,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
5501 entity_tick(cfs_rq, se, queued); 5725 entity_tick(cfs_rq, se, queued);
5502 } 5726 }
5503 5727
5728 if (sched_feat_numa(NUMA))
5729 task_tick_numa(rq, curr);
5730
5504 update_rq_runnable_avg(rq, 1); 5731 update_rq_runnable_avg(rq, 1);
5505} 5732}
5506 5733