about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Rik van Riel <riel@redhat.com> 2014-01-27 17:03:42 -0500
committer Ingo Molnar <mingo@kernel.org> 2014-01-28 07:17:05 -0500
commit 50ec8a401fed6d246ab65e6011d61ac91c34af70 (patch)
tree a78aa3c23bb9837be712d72a39c19b3d7246170d
parent ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe (diff)
sched/numa: Track from which nodes NUMA faults are triggered
Track which nodes NUMA faults are triggered from, in other words the CPUs on which the NUMA faults happened. This uses a mechanism similar to the one used to track the memory involved in NUMA faults. The next patches use this to build up a bitmap of which nodes a workload is actively running on.
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chegu Vinod <chegu_vinod@hp.com>
Link: http://lkml.kernel.org/r/1390860228-21539-4-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- include/linux/sched.h 9
-rw-r--r-- kernel/sched/fair.c 30
2 files changed, 30 insertions, 9 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 144d509df053..5fb0cfb43ecf 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1480,6 +1480,13 @@ struct task_struct {
1480 unsigned long *numa_faults_buffer_memory; 1480 unsigned long *numa_faults_buffer_memory;
1481 1481
1482 /* 1482 /*
1483 * Track the nodes the process was running on when a NUMA hinting
1484 * fault was incurred.
1485 */
1486 unsigned long *numa_faults_cpu;
1487 unsigned long *numa_faults_buffer_cpu;
1488
1489 /*
1483 * numa_faults_locality tracks if faults recorded during the last 1490 * numa_faults_locality tracks if faults recorded during the last
1484 * scan window were remote/local. The task scan period is adapted 1491 * scan window were remote/local. The task scan period is adapted
1485 * based on the locality of the faults with different weights 1492 * based on the locality of the faults with different weights
@@ -1582,8 +1589,6 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
1582extern pid_t task_numa_group_id(struct task_struct *p); 1589extern pid_t task_numa_group_id(struct task_struct *p);
1583extern void set_numabalancing_state(bool enabled); 1590extern void set_numabalancing_state(bool enabled);
1584extern void task_numa_free(struct task_struct *p); 1591extern void task_numa_free(struct task_struct *p);
1585
1586extern unsigned int sysctl_numa_balancing_migrate_deferred;
1587#else 1592#else
1588static inline void task_numa_fault(int last_node, int node, int pages, 1593static inline void task_numa_fault(int last_node, int node, int pages,
1589 int flags) 1594 int flags)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3e616d704f67..4841aaff7394 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -886,6 +886,7 @@ struct numa_group {
886 886
887 struct rcu_head rcu; 887 struct rcu_head rcu;
888 unsigned long total_faults; 888 unsigned long total_faults;
889 unsigned long *faults_cpu;
889 unsigned long faults[0]; 890 unsigned long faults[0];
890}; 891};
891 892
@@ -1368,10 +1369,11 @@ static void task_numa_placement(struct task_struct *p)
1368 int priv, i; 1369 int priv, i;
1369 1370
1370 for (priv = 0; priv < 2; priv++) { 1371 for (priv = 0; priv < 2; priv++) {
1371 long diff; 1372 long diff, f_diff;
1372 1373
1373 i = task_faults_idx(nid, priv); 1374 i = task_faults_idx(nid, priv);
1374 diff = -p->numa_faults_memory[i]; 1375 diff = -p->numa_faults_memory[i];
1376 f_diff = -p->numa_faults_cpu[i];
1375 1377
1376 /* Decay existing window, copy faults since last scan */ 1378 /* Decay existing window, copy faults since last scan */
1377 p->numa_faults_memory[i] >>= 1; 1379 p->numa_faults_memory[i] >>= 1;
@@ -1379,12 +1381,18 @@ static void task_numa_placement(struct task_struct *p)
1379 fault_types[priv] += p->numa_faults_buffer_memory[i]; 1381 fault_types[priv] += p->numa_faults_buffer_memory[i];
1380 p->numa_faults_buffer_memory[i] = 0; 1382 p->numa_faults_buffer_memory[i] = 0;
1381 1383
1384 p->numa_faults_cpu[i] >>= 1;
1385 p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i];
1386 p->numa_faults_buffer_cpu[i] = 0;
1387
1382 faults += p->numa_faults_memory[i]; 1388 faults += p->numa_faults_memory[i];
1383 diff += p->numa_faults_memory[i]; 1389 diff += p->numa_faults_memory[i];
1390 f_diff += p->numa_faults_cpu[i];
1384 p->total_numa_faults += diff; 1391 p->total_numa_faults += diff;
1385 if (p->numa_group) { 1392 if (p->numa_group) {
1386 /* safe because we can only change our own group */ 1393 /* safe because we can only change our own group */
1387 p->numa_group->faults[i] += diff; 1394 p->numa_group->faults[i] += diff;
1395 p->numa_group->faults_cpu[i] += f_diff;
1388 p->numa_group->total_faults += diff; 1396 p->numa_group->total_faults += diff;
1389 group_faults += p->numa_group->faults[i]; 1397 group_faults += p->numa_group->faults[i];
1390 } 1398 }
@@ -1453,7 +1461,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1453 1461
1454 if (unlikely(!p->numa_group)) { 1462 if (unlikely(!p->numa_group)) {
1455 unsigned int size = sizeof(struct numa_group) + 1463 unsigned int size = sizeof(struct numa_group) +
1456 2*nr_node_ids*sizeof(unsigned long); 1464 4*nr_node_ids*sizeof(unsigned long);
1457 1465
1458 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); 1466 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
1459 if (!grp) 1467 if (!grp)
@@ -1463,8 +1471,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1463 spin_lock_init(&grp->lock); 1471 spin_lock_init(&grp->lock);
1464 INIT_LIST_HEAD(&grp->task_list); 1472 INIT_LIST_HEAD(&grp->task_list);
1465 grp->gid = p->pid; 1473 grp->gid = p->pid;
1474 /* Second half of the array tracks nids where faults happen */
1475 grp->faults_cpu = grp->faults + 2 * nr_node_ids;
1466 1476
1467 for (i = 0; i < 2*nr_node_ids; i++) 1477 for (i = 0; i < 4*nr_node_ids; i++)
1468 grp->faults[i] = p->numa_faults_memory[i]; 1478 grp->faults[i] = p->numa_faults_memory[i];
1469 1479
1470 grp->total_faults = p->total_numa_faults; 1480 grp->total_faults = p->total_numa_faults;
@@ -1522,7 +1532,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
1522 1532
1523 double_lock(&my_grp->lock, &grp->lock); 1533 double_lock(&my_grp->lock, &grp->lock);
1524 1534
1525 for (i = 0; i < 2*nr_node_ids; i++) { 1535 for (i = 0; i < 4*nr_node_ids; i++) {
1526 my_grp->faults[i] -= p->numa_faults_memory[i]; 1536 my_grp->faults[i] -= p->numa_faults_memory[i];
1527 grp->faults[i] += p->numa_faults_memory[i]; 1537 grp->faults[i] += p->numa_faults_memory[i];
1528 } 1538 }
@@ -1554,7 +1564,7 @@ void task_numa_free(struct task_struct *p)
1554 1564
1555 if (grp) { 1565 if (grp) {
1556 spin_lock(&grp->lock); 1566 spin_lock(&grp->lock);
1557 for (i = 0; i < 2*nr_node_ids; i++) 1567 for (i = 0; i < 4*nr_node_ids; i++)
1558 grp->faults[i] -= p->numa_faults_memory[i]; 1568 grp->faults[i] -= p->numa_faults_memory[i];
1559 grp->total_faults -= p->total_numa_faults; 1569 grp->total_faults -= p->total_numa_faults;
1560 1570
@@ -1567,6 +1577,8 @@ void task_numa_free(struct task_struct *p)
1567 1577
1568 p->numa_faults_memory = NULL; 1578 p->numa_faults_memory = NULL;
1569 p->numa_faults_buffer_memory = NULL; 1579 p->numa_faults_buffer_memory = NULL;
1580 p->numa_faults_cpu= NULL;
1581 p->numa_faults_buffer_cpu = NULL;
1570 kfree(numa_faults); 1582 kfree(numa_faults);
1571} 1583}
1572 1584
@@ -1577,6 +1589,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1577{ 1589{
1578 struct task_struct *p = current; 1590 struct task_struct *p = current;
1579 bool migrated = flags & TNF_MIGRATED; 1591 bool migrated = flags & TNF_MIGRATED;
1592 int this_node = task_node(current);
1580 int priv; 1593 int priv;
1581 1594
1582 if (!numabalancing_enabled) 1595 if (!numabalancing_enabled)
@@ -1592,7 +1605,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1592 1605
1593 /* Allocate buffer to track faults on a per-node basis */ 1606 /* Allocate buffer to track faults on a per-node basis */
1594 if (unlikely(!p->numa_faults_memory)) { 1607 if (unlikely(!p->numa_faults_memory)) {
1595 int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; 1608 int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids;
1596 1609
1597 /* numa_faults and numa_faults_buffer share the allocation */ 1610 /* numa_faults and numa_faults_buffer share the allocation */
1598 p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); 1611 p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN);
@@ -1600,7 +1613,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1600 return; 1613 return;
1601 1614
1602 BUG_ON(p->numa_faults_buffer_memory); 1615 BUG_ON(p->numa_faults_buffer_memory);
1603 p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); 1616 p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
1617 p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
1618 p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
1604 p->total_numa_faults = 0; 1619 p->total_numa_faults = 0;
1605 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); 1620 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1606 } 1621 }
@@ -1630,6 +1645,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
1630 p->numa_pages_migrated += pages; 1645 p->numa_pages_migrated += pages;
1631 1646
1632 p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; 1647 p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages;
1648 p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages;
1633 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; 1649 p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
1634} 1650}
1635 1651