diff options
author | Rik van Riel <riel@redhat.com> | 2014-01-27 17:03:42 -0500 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2014-01-28 07:17:05 -0500 |
commit | 50ec8a401fed6d246ab65e6011d61ac91c34af70 (patch) | |
tree | a78aa3c23bb9837be712d72a39c19b3d7246170d | |
parent | ff1df896aef8e0ec1556a5c44f424bd45bfa2cbe (diff) |
sched/numa: Track from which nodes NUMA faults are triggered
Track which nodes NUMA faults are triggered from, in other words
the nodes of the CPUs on which the NUMA faults happened. This uses a
mechanism similar to the one used to track the memory involved in NUMA faults.
The next patches use this to build up a bitmap of which nodes a
workload is actively running on.
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chegu Vinod <chegu_vinod@hp.com>
Link: http://lkml.kernel.org/r/1390860228-21539-4-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r-- | include/linux/sched.h | 9 | ||||
-rw-r--r-- | kernel/sched/fair.c | 30 |
2 files changed, 30 insertions, 9 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h index 144d509df053..5fb0cfb43ecf 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
@@ -1480,6 +1480,13 @@ struct task_struct { | |||
1480 | unsigned long *numa_faults_buffer_memory; | 1480 | unsigned long *numa_faults_buffer_memory; |
1481 | 1481 | ||
1482 | /* | 1482 | /* |
1483 | * Track the nodes the process was running on when a NUMA hinting | ||
1484 | * fault was incurred. | ||
1485 | */ | ||
1486 | unsigned long *numa_faults_cpu; | ||
1487 | unsigned long *numa_faults_buffer_cpu; | ||
1488 | |||
1489 | /* | ||
1483 | * numa_faults_locality tracks if faults recorded during the last | 1490 | * numa_faults_locality tracks if faults recorded during the last |
1484 | * scan window were remote/local. The task scan period is adapted | 1491 | * scan window were remote/local. The task scan period is adapted |
1485 | * based on the locality of the faults with different weights | 1492 | * based on the locality of the faults with different weights |
@@ -1582,8 +1589,6 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags); | |||
1582 | extern pid_t task_numa_group_id(struct task_struct *p); | 1589 | extern pid_t task_numa_group_id(struct task_struct *p); |
1583 | extern void set_numabalancing_state(bool enabled); | 1590 | extern void set_numabalancing_state(bool enabled); |
1584 | extern void task_numa_free(struct task_struct *p); | 1591 | extern void task_numa_free(struct task_struct *p); |
1585 | |||
1586 | extern unsigned int sysctl_numa_balancing_migrate_deferred; | ||
1587 | #else | 1592 | #else |
1588 | static inline void task_numa_fault(int last_node, int node, int pages, | 1593 | static inline void task_numa_fault(int last_node, int node, int pages, |
1589 | int flags) | 1594 | int flags) |
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3e616d704f67..4841aaff7394 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c | |||
@@ -886,6 +886,7 @@ struct numa_group { | |||
886 | 886 | ||
887 | struct rcu_head rcu; | 887 | struct rcu_head rcu; |
888 | unsigned long total_faults; | 888 | unsigned long total_faults; |
889 | unsigned long *faults_cpu; | ||
889 | unsigned long faults[0]; | 890 | unsigned long faults[0]; |
890 | }; | 891 | }; |
891 | 892 | ||
@@ -1368,10 +1369,11 @@ static void task_numa_placement(struct task_struct *p) | |||
1368 | int priv, i; | 1369 | int priv, i; |
1369 | 1370 | ||
1370 | for (priv = 0; priv < 2; priv++) { | 1371 | for (priv = 0; priv < 2; priv++) { |
1371 | long diff; | 1372 | long diff, f_diff; |
1372 | 1373 | ||
1373 | i = task_faults_idx(nid, priv); | 1374 | i = task_faults_idx(nid, priv); |
1374 | diff = -p->numa_faults_memory[i]; | 1375 | diff = -p->numa_faults_memory[i]; |
1376 | f_diff = -p->numa_faults_cpu[i]; | ||
1375 | 1377 | ||
1376 | /* Decay existing window, copy faults since last scan */ | 1378 | /* Decay existing window, copy faults since last scan */ |
1377 | p->numa_faults_memory[i] >>= 1; | 1379 | p->numa_faults_memory[i] >>= 1; |
@@ -1379,12 +1381,18 @@ static void task_numa_placement(struct task_struct *p) | |||
1379 | fault_types[priv] += p->numa_faults_buffer_memory[i]; | 1381 | fault_types[priv] += p->numa_faults_buffer_memory[i]; |
1380 | p->numa_faults_buffer_memory[i] = 0; | 1382 | p->numa_faults_buffer_memory[i] = 0; |
1381 | 1383 | ||
1384 | p->numa_faults_cpu[i] >>= 1; | ||
1385 | p->numa_faults_cpu[i] += p->numa_faults_buffer_cpu[i]; | ||
1386 | p->numa_faults_buffer_cpu[i] = 0; | ||
1387 | |||
1382 | faults += p->numa_faults_memory[i]; | 1388 | faults += p->numa_faults_memory[i]; |
1383 | diff += p->numa_faults_memory[i]; | 1389 | diff += p->numa_faults_memory[i]; |
1390 | f_diff += p->numa_faults_cpu[i]; | ||
1384 | p->total_numa_faults += diff; | 1391 | p->total_numa_faults += diff; |
1385 | if (p->numa_group) { | 1392 | if (p->numa_group) { |
1386 | /* safe because we can only change our own group */ | 1393 | /* safe because we can only change our own group */ |
1387 | p->numa_group->faults[i] += diff; | 1394 | p->numa_group->faults[i] += diff; |
1395 | p->numa_group->faults_cpu[i] += f_diff; | ||
1388 | p->numa_group->total_faults += diff; | 1396 | p->numa_group->total_faults += diff; |
1389 | group_faults += p->numa_group->faults[i]; | 1397 | group_faults += p->numa_group->faults[i]; |
1390 | } | 1398 | } |
@@ -1453,7 +1461,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1453 | 1461 | ||
1454 | if (unlikely(!p->numa_group)) { | 1462 | if (unlikely(!p->numa_group)) { |
1455 | unsigned int size = sizeof(struct numa_group) + | 1463 | unsigned int size = sizeof(struct numa_group) + |
1456 | 2*nr_node_ids*sizeof(unsigned long); | 1464 | 4*nr_node_ids*sizeof(unsigned long); |
1457 | 1465 | ||
1458 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); | 1466 | grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); |
1459 | if (!grp) | 1467 | if (!grp) |
@@ -1463,8 +1471,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1463 | spin_lock_init(&grp->lock); | 1471 | spin_lock_init(&grp->lock); |
1464 | INIT_LIST_HEAD(&grp->task_list); | 1472 | INIT_LIST_HEAD(&grp->task_list); |
1465 | grp->gid = p->pid; | 1473 | grp->gid = p->pid; |
1474 | /* Second half of the array tracks nids where faults happen */ | ||
1475 | grp->faults_cpu = grp->faults + 2 * nr_node_ids; | ||
1466 | 1476 | ||
1467 | for (i = 0; i < 2*nr_node_ids; i++) | 1477 | for (i = 0; i < 4*nr_node_ids; i++) |
1468 | grp->faults[i] = p->numa_faults_memory[i]; | 1478 | grp->faults[i] = p->numa_faults_memory[i]; |
1469 | 1479 | ||
1470 | grp->total_faults = p->total_numa_faults; | 1480 | grp->total_faults = p->total_numa_faults; |
@@ -1522,7 +1532,7 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, | |||
1522 | 1532 | ||
1523 | double_lock(&my_grp->lock, &grp->lock); | 1533 | double_lock(&my_grp->lock, &grp->lock); |
1524 | 1534 | ||
1525 | for (i = 0; i < 2*nr_node_ids; i++) { | 1535 | for (i = 0; i < 4*nr_node_ids; i++) { |
1526 | my_grp->faults[i] -= p->numa_faults_memory[i]; | 1536 | my_grp->faults[i] -= p->numa_faults_memory[i]; |
1527 | grp->faults[i] += p->numa_faults_memory[i]; | 1537 | grp->faults[i] += p->numa_faults_memory[i]; |
1528 | } | 1538 | } |
@@ -1554,7 +1564,7 @@ void task_numa_free(struct task_struct *p) | |||
1554 | 1564 | ||
1555 | if (grp) { | 1565 | if (grp) { |
1556 | spin_lock(&grp->lock); | 1566 | spin_lock(&grp->lock); |
1557 | for (i = 0; i < 2*nr_node_ids; i++) | 1567 | for (i = 0; i < 4*nr_node_ids; i++) |
1558 | grp->faults[i] -= p->numa_faults_memory[i]; | 1568 | grp->faults[i] -= p->numa_faults_memory[i]; |
1559 | grp->total_faults -= p->total_numa_faults; | 1569 | grp->total_faults -= p->total_numa_faults; |
1560 | 1570 | ||
@@ -1567,6 +1577,8 @@ void task_numa_free(struct task_struct *p) | |||
1567 | 1577 | ||
1568 | p->numa_faults_memory = NULL; | 1578 | p->numa_faults_memory = NULL; |
1569 | p->numa_faults_buffer_memory = NULL; | 1579 | p->numa_faults_buffer_memory = NULL; |
1580 | p->numa_faults_cpu= NULL; | ||
1581 | p->numa_faults_buffer_cpu = NULL; | ||
1570 | kfree(numa_faults); | 1582 | kfree(numa_faults); |
1571 | } | 1583 | } |
1572 | 1584 | ||
@@ -1577,6 +1589,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1577 | { | 1589 | { |
1578 | struct task_struct *p = current; | 1590 | struct task_struct *p = current; |
1579 | bool migrated = flags & TNF_MIGRATED; | 1591 | bool migrated = flags & TNF_MIGRATED; |
1592 | int this_node = task_node(current); | ||
1580 | int priv; | 1593 | int priv; |
1581 | 1594 | ||
1582 | if (!numabalancing_enabled) | 1595 | if (!numabalancing_enabled) |
@@ -1592,7 +1605,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1592 | 1605 | ||
1593 | /* Allocate buffer to track faults on a per-node basis */ | 1606 | /* Allocate buffer to track faults on a per-node basis */ |
1594 | if (unlikely(!p->numa_faults_memory)) { | 1607 | if (unlikely(!p->numa_faults_memory)) { |
1595 | int size = sizeof(*p->numa_faults_memory) * 2 * nr_node_ids; | 1608 | int size = sizeof(*p->numa_faults_memory) * 4 * nr_node_ids; |
1596 | 1609 | ||
1597 | /* numa_faults and numa_faults_buffer share the allocation */ | 1610 | /* numa_faults and numa_faults_buffer share the allocation */ |
1598 | p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); | 1611 | p->numa_faults_memory = kzalloc(size * 2, GFP_KERNEL|__GFP_NOWARN); |
@@ -1600,7 +1613,9 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1600 | return; | 1613 | return; |
1601 | 1614 | ||
1602 | BUG_ON(p->numa_faults_buffer_memory); | 1615 | BUG_ON(p->numa_faults_buffer_memory); |
1603 | p->numa_faults_buffer_memory = p->numa_faults_memory + (2 * nr_node_ids); | 1616 | p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids); |
1617 | p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids); | ||
1618 | p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids); | ||
1604 | p->total_numa_faults = 0; | 1619 | p->total_numa_faults = 0; |
1605 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); | 1620 | memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality)); |
1606 | } | 1621 | } |
@@ -1630,6 +1645,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags) | |||
1630 | p->numa_pages_migrated += pages; | 1645 | p->numa_pages_migrated += pages; |
1631 | 1646 | ||
1632 | p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; | 1647 | p->numa_faults_buffer_memory[task_faults_idx(node, priv)] += pages; |
1648 | p->numa_faults_buffer_cpu[task_faults_idx(this_node, priv)] += pages; | ||
1633 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; | 1649 | p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages; |
1634 | } | 1650 | } |
1635 | 1651 | ||