diff options
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r-- | mm/memcontrol.c | 102 |
1 files changed, 96 insertions, 6 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc62c714f3b6..1520efd1c7c4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
@@ -231,6 +231,11 @@ struct mem_cgroup { | |||
231 | * reclaimed from. | 231 | * reclaimed from. |
232 | */ | 232 | */ |
233 | int last_scanned_child; | 233 | int last_scanned_child; |
234 | int last_scanned_node; | ||
235 | #if MAX_NUMNODES > 1 | ||
236 | nodemask_t scan_nodes; | ||
237 | unsigned long next_scan_node_update; | ||
238 | #endif | ||
234 | /* | 239 | /* |
235 | * Should the accounting and control be hierarchical, per subtree? | 240 | * Should the accounting and control be hierarchical, per subtree? |
236 | */ | 241 | */ |
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
624 | preempt_enable(); | 629 | preempt_enable(); |
625 | } | 630 | } |
626 | 631 | ||
632 | static unsigned long | ||
633 | mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx) | ||
634 | { | ||
635 | struct mem_cgroup_per_zone *mz; | ||
636 | u64 total = 0; | ||
637 | int zid; | ||
638 | |||
639 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | ||
640 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
641 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
642 | } | ||
643 | return total; | ||
644 | } | ||
627 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 645 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
628 | enum lru_list idx) | 646 | enum lru_list idx) |
629 | { | 647 | { |
630 | int nid, zid; | 648 | int nid; |
631 | struct mem_cgroup_per_zone *mz; | ||
632 | u64 total = 0; | 649 | u64 total = 0; |
633 | 650 | ||
634 | for_each_online_node(nid) | 651 | for_each_online_node(nid) |
635 | for (zid = 0; zid < MAX_NR_ZONES; zid++) { | 652 | total += mem_cgroup_get_zonestat_node(mem, nid, idx); |
636 | mz = mem_cgroup_zoneinfo(mem, nid, zid); | ||
637 | total += MEM_CGROUP_ZSTAT(mz, idx); | ||
638 | } | ||
639 | return total; | 653 | return total; |
640 | } | 654 | } |
641 | 655 | ||
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
1418 | return ret; | 1432 | return ret; |
1419 | } | 1433 | } |
1420 | 1434 | ||
1435 | #if MAX_NUMNODES > 1 | ||
1436 | |||
1437 | /* | ||
1438 | * Always updating the nodemask is not very good - even if we have an empty | ||
1439 | * list or the wrong list here, we can start from some node and traverse all | ||
1440 | * nodes based on the zonelist. So update the list loosely once per 10 secs. | ||
1441 | * | ||
1442 | */ | ||
1443 | static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) | ||
1444 | { | ||
1445 | int nid; | ||
1446 | |||
1447 | if (time_after(mem->next_scan_node_update, jiffies)) | ||
1448 | return; | ||
1449 | |||
1450 | mem->next_scan_node_update = jiffies + 10*HZ; | ||
1451 | /* make a nodemask where this memcg uses memory from */ | ||
1452 | mem->scan_nodes = node_states[N_HIGH_MEMORY]; | ||
1453 | |||
1454 | for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { | ||
1455 | |||
1456 | if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) || | ||
1457 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE)) | ||
1458 | continue; | ||
1459 | |||
1460 | if (total_swap_pages && | ||
1461 | (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) || | ||
1462 | mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON))) | ||
1463 | continue; | ||
1464 | node_clear(nid, mem->scan_nodes); | ||
1465 | } | ||
1466 | } | ||
1467 | |||
1468 | /* | ||
1469 | * Selecting a node where we start reclaim from. Because what we need is just | ||
1470 | * reducing usage counter, start from anywhere is O,K. Considering | ||
1471 | * memory reclaim from current node, there are pros. and cons. | ||
1472 | * | ||
1473 | * Freeing memory from current node means freeing memory from a node which | ||
1474 | * we'll use or we've used. So, it may make LRU bad. And if several threads | ||
1475 | * hit limits, it will see a contention on a node. But freeing from remote | ||
1476 | * node means more costs for memory reclaim because of memory latency. | ||
1477 | * | ||
1478 | * Now, we use round-robin. Better algorithm is welcomed. | ||
1479 | */ | ||
1480 | int mem_cgroup_select_victim_node(struct mem_cgroup *mem) | ||
1481 | { | ||
1482 | int node; | ||
1483 | |||
1484 | mem_cgroup_may_update_nodemask(mem); | ||
1485 | node = mem->last_scanned_node; | ||
1486 | |||
1487 | node = next_node(node, mem->scan_nodes); | ||
1488 | if (node == MAX_NUMNODES) | ||
1489 | node = first_node(mem->scan_nodes); | ||
1490 | /* | ||
1491 | * We call this when we hit limit, not when pages are added to LRU. | ||
1492 | * No LRU may hold pages because all pages are UNEVICTABLE or | ||
1493 | * memcg is too small and all pages are not on LRU. In that case, | ||
1494 | * we use curret node. | ||
1495 | */ | ||
1496 | if (unlikely(node == MAX_NUMNODES)) | ||
1497 | node = numa_node_id(); | ||
1498 | |||
1499 | mem->last_scanned_node = node; | ||
1500 | return node; | ||
1501 | } | ||
1502 | |||
1503 | #else | ||
/* !NUMA (MAX_NUMNODES == 1) build: node 0 is the only candidate. */
int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
{
	return 0;
}
1508 | #endif | ||
1509 | |||
1421 | /* | 1510 | /* |
1422 | * Scan the hierarchy if needed to reclaim memory. We remember the last child | 1511 | * Scan the hierarchy if needed to reclaim memory. We remember the last child |
1423 | * we reclaimed from, so that we don't end up penalizing one child extensively | 1512 | * we reclaimed from, so that we don't end up penalizing one child extensively |
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
4606 | res_counter_init(&mem->memsw, NULL); | 4695 | res_counter_init(&mem->memsw, NULL); |
4607 | } | 4696 | } |
4608 | mem->last_scanned_child = 0; | 4697 | mem->last_scanned_child = 0; |
4698 | mem->last_scanned_node = MAX_NUMNODES; | ||
4609 | INIT_LIST_HEAD(&mem->oom_notify); | 4699 | INIT_LIST_HEAD(&mem->oom_notify); |
4610 | 4700 | ||
4611 | if (parent) | 4701 | if (parent) |