author		Mel Gorman <mgorman@suse.de>	2012-11-20 20:18:23 -0500
committer	Mel Gorman <mgorman@suse.de>	2012-12-11 09:42:55 -0500
commit		b8593bfda1652755136333cdd362de125b283a9c (patch)
tree		c0395d9cf775fd9225e81b055fc8f5540a14333a
parent		e42c8ff2999de1239a57d434bfbd8e9f2a56e814 (diff)
mm: sched: Adapt the scanning rate if a NUMA hinting fault does not migrate
The PTE scanning rate and fault rates are two of the biggest sources of
system CPU overhead with automatic NUMA placement. Ideally a proper policy
would detect if a workload was properly placed, schedule and adjust the PTE
scanning rate accordingly. We do not track the necessary information to do
that but we at least know if we migrated or not.

This patch scans slower if a page was not migrated as the result of a NUMA
hinting fault up to sysctl_numa_balancing_scan_period_max which is now
higher than the previous default. Once every minute it will reset the
scanner in case of phase changes.

This is hilariously crude and the numbers are arbitrary. Workloads will
converge quite slowly in comparison to what a proper policy should be able
to do. On the plus side, we will chew up less CPU for workloads that have
no need for automatic balancing.

Signed-off-by: Mel Gorman <mgorman@suse.de>
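For illustration only, here is a minimal userspace-style sketch of the back-off
policy described above. The struct, the function names and the plain millisecond
increment are stand-ins (the kernel code in this patch works on p->numa_scan_period
and adds jiffies_to_msecs(10)); the three constants mirror the defaults introduced
by the patch.

#include <stdbool.h>

#define SCAN_PERIOD_MIN_MS	100		/* sysctl_numa_balancing_scan_period_min */
#define SCAN_PERIOD_MAX_MS	(100 * 50)	/* sysctl_numa_balancing_scan_period_max */
#define SCAN_PERIOD_RESET_MS	(100 * 600)	/* sysctl_numa_balancing_scan_period_reset */

struct scan_policy {
	unsigned int scan_period_ms;	/* how often the PTE scanner runs */
	unsigned long next_reset_ms;	/* when the learned period is forgotten */
};

/* Called on every NUMA hinting fault; back off only if nothing migrated. */
static void on_hinting_fault(struct scan_policy *sp, bool migrated)
{
	if (!migrated) {
		sp->scan_period_ms += 10;	/* stand-in for jiffies_to_msecs(10) */
		if (sp->scan_period_ms > SCAN_PERIOD_MAX_MS)
			sp->scan_period_ms = SCAN_PERIOD_MAX_MS;
	}
}

/* Called by the periodic scanner; roughly once a minute, assume a phase change. */
static void maybe_reset_period(struct scan_policy *sp, unsigned long now_ms)
{
	if (now_ms >= sp->next_reset_ms) {
		sp->scan_period_ms = SCAN_PERIOD_MIN_MS;
		sp->next_reset_ms = now_ms + SCAN_PERIOD_RESET_MS;
	}
}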
-rw-r--r--	include/linux/mm_types.h	 3
-rw-r--r--	include/linux/sched.h		 5
-rw-r--r--	kernel/sched/core.c		 1
-rw-r--r--	kernel/sched/fair.c		29
-rw-r--r--	kernel/sysctl.c			 7
-rw-r--r--	mm/huge_memory.c		 2
-rw-r--r--	mm/memory.c			12
7 files changed, 44 insertions, 15 deletions
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c5fffa239861..e850a23dd6ec 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -410,6 +410,9 @@ struct mm_struct {
 	 */
 	unsigned long numa_next_scan;
 
+	/* numa_next_reset is when the PTE scanner period will be reset */
+	unsigned long numa_next_reset;
+
 	/* Restart point for scanning and setting pte_numa */
 	unsigned long numa_scan_offset;
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 7d95a232b5b9..0f4ff2bd03f6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1562,9 +1562,9 @@ struct task_struct {
 #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
 
 #ifdef CONFIG_NUMA_BALANCING
-extern void task_numa_fault(int node, int pages);
+extern void task_numa_fault(int node, int pages, bool migrated);
 #else
-static inline void task_numa_fault(int node, int pages)
+static inline void task_numa_fault(int node, int pages, bool migrated)
 {
 }
 #endif
@@ -2009,6 +2009,7 @@ extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_period_reset;
 extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_numa_balancing_settle_count;
 
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fbfc4843063f..9d255bc0e278 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1537,6 +1537,7 @@ static void __sched_fork(struct task_struct *p)
 #ifdef CONFIG_NUMA_BALANCING
 	if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
 		p->mm->numa_next_scan = jiffies;
+		p->mm->numa_next_reset = jiffies;
 		p->mm->numa_scan_seq = 0;
 	}
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index dd18087fd369..4b577863933f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -784,7 +784,8 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
  * numa task sample period in ms
  */
 unsigned int sysctl_numa_balancing_scan_period_min = 100;
-unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
+unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
+unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
 
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
@@ -806,20 +807,19 @@ static void task_numa_placement(struct task_struct *p)
 /*
  * Got a PROT_NONE fault for a page on @node.
  */
-void task_numa_fault(int node, int pages)
+void task_numa_fault(int node, int pages, bool migrated)
 {
 	struct task_struct *p = current;
 
 	/* FIXME: Allocate task-specific structure for placement policy here */
 
 	/*
-	 * Assume that as faults occur that pages are getting properly placed
-	 * and fewer NUMA hints are required. Note that this is a big
-	 * assumption, it assumes processes reach a steady steady with no
-	 * further phase changes.
+	 * If pages are properly placed (did not migrate) then scan slower.
+	 * This is reset periodically in case of phase changes
 	 */
-	p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
-			p->numa_scan_period + jiffies_to_msecs(2));
+	if (!migrated)
+		p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
+			p->numa_scan_period + jiffies_to_msecs(10));
 
 	task_numa_placement(p);
 }
@@ -858,6 +858,19 @@ void task_numa_work(struct callback_head *work)
 		return;
 
 	/*
+	 * Reset the scan period if enough time has gone by. Objective is that
+	 * scanning will be reduced if pages are properly placed. As tasks
+	 * can enter different phases this needs to be re-examined. Lacking
+	 * proper tracking of reference behaviour, this blunt hammer is used.
+	 */
+	migrate = mm->numa_next_reset;
+	if (time_after(now, migrate)) {
+		p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+		next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
+		xchg(&mm->numa_next_reset, next_scan);
+	}
+
+	/*
 	 * Enforce maximal scan/migration frequency..
 	 */
 	migrate = mm->numa_next_scan;
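A rough back-of-the-envelope on the combined effect of the two fair.c hunks,
assuming HZ=1000 so that jiffies_to_msecs(10) is 10ms: starting from the 100ms
minimum, a task whose hinting faults stop migrating pages needs roughly
(5000 - 100) / 10 = 490 such faults to back off to the new 5000ms ceiling, and
the 60000ms (one minute) reset interval then snaps the period back to 100ms so
that a phase change is picked up again.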
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 48a68cc258c1..8906f90d6fa2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -367,6 +367,13 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "numa_balancing_scan_period_reset",
+		.data		= &sysctl_numa_balancing_scan_period_reset,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
 		.procname	= "numa_balancing_scan_period_max_ms",
 		.data		= &sysctl_numa_balancing_scan_period_max,
 		.maxlen		= sizeof(unsigned int),
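As a hedged usage sketch (not part of the patch): since the new ctl_table entry
sits in kern_table, the knob should surface as
/proc/sys/kernel/numa_balancing_scan_period_reset, holding a value in
milliseconds (60000 by default, per the fair.c hunk above). A trivial reader:

#include <stdio.h>

int main(void)
{
	const char *path = "/proc/sys/kernel/numa_balancing_scan_period_reset";
	unsigned int reset_ms;
	FILE *f = fopen(path, "r");

	if (!f) {
		perror(path);	/* e.g. kernel built without CONFIG_NUMA_BALANCING */
		return 1;
	}
	if (fscanf(f, "%u", &reset_ms) == 1)
		printf("scan period reset interval: %u ms\n", reset_ms);
	fclose(f);
	return 0;
}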
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 79b96064f8fc..199b261a257e 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1068,7 +1068,7 @@ out_unlock:
 	spin_unlock(&mm->page_table_lock);
 	if (page) {
 		put_page(page);
-		task_numa_fault(numa_node_id(), HPAGE_PMD_NR);
+		task_numa_fault(numa_node_id(), HPAGE_PMD_NR, false);
 	}
 	return 0;
 }
diff --git a/mm/memory.c b/mm/memory.c
index 84c6d9eab182..39edb11b63dc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3468,6 +3468,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	int current_nid = -1;
 	int target_nid;
+	bool migrated = false;
 
 	/*
 	 * The "pte" at this point cannot be used safely without
@@ -3509,12 +3510,13 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	/* Migrate to the requested node */
-	if (migrate_misplaced_page(page, target_nid))
+	migrated = migrate_misplaced_page(page, target_nid);
+	if (migrated)
 		current_nid = target_nid;
 
 out:
 	if (current_nid != -1)
-		task_numa_fault(current_nid, 1);
+		task_numa_fault(current_nid, 1, migrated);
 	return 0;
 }
 
@@ -3554,6 +3556,7 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct page *page;
 		int curr_nid = local_nid;
 		int target_nid;
+		bool migrated;
 		if (!pte_present(pteval))
 			continue;
 		if (!pte_numa(pteval))
@@ -3590,9 +3593,10 @@ static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 		/* Migrate to the requested node */
 		pte_unmap_unlock(pte, ptl);
-		if (migrate_misplaced_page(page, target_nid))
+		migrated = migrate_misplaced_page(page, target_nid);
+		if (migrated)
 			curr_nid = target_nid;
-		task_numa_fault(curr_nid, 1);
+		task_numa_fault(curr_nid, 1, migrated);
 
 		pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	}