 include/linux/memcontrol.h |   1 +
 mm/memcontrol.c            | 102 ++++++++++++++++++++++++++++++++++++++++---
 mm/vmscan.c                |  10 +++++++--
 3 files changed, 106 insertions(+), 7 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0629121f2c0b..16052117131e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
                                        struct zone *zone,
                                        enum lru_list lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
         * reclaimed from.
         */
        int last_scanned_child;
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t scan_nodes;
+       unsigned long next_scan_node_update;
+#endif
        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+       struct mem_cgroup_per_zone *mz;
+       u64 total = 0;
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               mz = mem_cgroup_zoneinfo(mem, nid, zid);
+               total += MEM_CGROUP_ZSTAT(mz, idx);
+       }
+       return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
 {
-       int nid, zid;
-       struct mem_cgroup_per_zone *mz;
+       int nid;
        u64 total = 0;
 
        for_each_online_node(nid)
-               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
-                       total += MEM_CGROUP_ZSTAT(mz, idx);
-               }
+               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
        return total;
 }
 
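The hunk above is a pure refactoring: the per-node inner loop of mem_cgroup_get_local_zonestat() moves into a new helper, mem_cgroup_get_zonestat_node(), so the NUMA code added below can ask whether a single node still holds pages for this memcg. A minimal user-space sketch of the same summation pattern, with a hypothetical lru_stat[][] array standing in for the kernel's per-zone LRU counters:

#include <stdio.h>

#define MAX_NODES 4
#define MAX_ZONES 3

/* hypothetical stand-in for MEM_CGROUP_ZSTAT(mz, idx), per (node, zone) */
static unsigned long lru_stat[MAX_NODES][MAX_ZONES];

/* sum one node over its zones, like mem_cgroup_get_zonestat_node() */
static unsigned long node_total(int nid)
{
        unsigned long total = 0;
        int zid;

        for (zid = 0; zid < MAX_ZONES; zid++)
                total += lru_stat[nid][zid];
        return total;
}

/* sum over every node, like mem_cgroup_get_local_zonestat() after the patch */
static unsigned long all_nodes_total(void)
{
        unsigned long total = 0;
        int nid;

        for (nid = 0; nid < MAX_NODES; nid++)
                total += node_total(nid);
        return total;
}

int main(void)
{
        lru_stat[0][1] = 100;
        lru_stat[2][0] = 50;
        printf("node 0: %lu, node 2: %lu, all: %lu\n",
               node_total(0), node_total(2), all_nodes_total());
        return 0;
}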
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
        return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is too expensive - even if we have an empty
+ * or stale mask here, we can still start from some node and traverse all
+ * nodes based on the zonelist, so update the mask loosely, at most once
+ * per 10 seconds.
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+       int nid;
+
+       if (time_after(mem->next_scan_node_update, jiffies))
+               return;
+
+       mem->next_scan_node_update = jiffies + 10*HZ;
+       /* make a nodemask where this memcg uses memory from */
+       mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+                       continue;
+
+               if (total_swap_pages &&
+                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+                       continue;
+               node_clear(nid, mem->scan_nodes);
+       }
+}
+
+/*
+ * Select a node to start reclaim from. Because what we need is just to
+ * reduce the usage counter, starting from anywhere is fine. Still,
+ * reclaiming from the current node has both pros and cons.
+ *
+ * Freeing memory from the current node means freeing memory from a node
+ * which we'll use or have used. So it may disturb that node's LRU, and if
+ * several threads hit their limits they will contend on one node. But
+ * freeing from a remote node costs more because of memory latency.
+ *
+ * For now we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       int node;
+
+       mem_cgroup_may_update_nodemask(mem);
+       node = mem->last_scanned_node;
+
+       node = next_node(node, mem->scan_nodes);
+       if (node == MAX_NUMNODES)
+               node = first_node(mem->scan_nodes);
+       /*
+        * We call this when we hit the limit, not when pages are added to an
+        * LRU. No LRU may hold pages because all pages are UNEVICTABLE, or
+        * because the memcg is too small and its pages never reached an LRU.
+        * In that case, use the current node.
+        */
+       if (unlikely(node == MAX_NUMNODES))
+               node = numa_node_id();
+
+       mem->last_scanned_node = node;
+       return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
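The block above is the heart of the patch: mem_cgroup_may_update_nodemask() rebuilds scan_nodes from the per-node LRU counters at most once per 10 seconds, and mem_cgroup_select_victim_node() walks that mask round-robin via next_node()/first_node(), falling back to the current node when the mask is empty. A compilable user-space sketch of the same logic; the plain bitmask, has_pages(), and the fixed 4-node count are illustrative stand-ins, not kernel API:

#include <stdio.h>
#include <time.h>

#define MAX_NODES 4

struct memcg_sim {
        unsigned int scan_nodes;  /* bit n set => node n has reclaimable pages */
        int last_scanned_node;
        time_t next_update;
};

/* pretend node 1 holds no pages for this memcg */
static int has_pages(int nid) { return nid != 1; }

static void may_update_nodemask(struct memcg_sim *m)
{
        time_t now = time(NULL);
        int nid;

        if (now < m->next_update)       /* throttle, like the 10*HZ check */
                return;
        m->next_update = now + 10;

        m->scan_nodes = 0;
        for (nid = 0; nid < MAX_NODES; nid++)
                if (has_pages(nid))
                        m->scan_nodes |= 1u << nid;
}

static int select_victim_node(struct memcg_sim *m)
{
        int nid;

        may_update_nodemask(m);
        /* next_node(): first candidate strictly after last_scanned_node */
        for (nid = m->last_scanned_node + 1; nid < MAX_NODES; nid++)
                if (m->scan_nodes & (1u << nid))
                        goto out;
        /* first_node(): wrap around to the lowest candidate */
        for (nid = 0; nid < MAX_NODES; nid++)
                if (m->scan_nodes & (1u << nid))
                        goto out;
        nid = 0;        /* empty mask; the kernel falls back to numa_node_id() */
out:
        m->last_scanned_node = nid;
        return nid;
}

int main(void)
{
        /* seeding past the last node mirrors last_scanned_node = MAX_NUMNODES */
        struct memcg_sim m = { .last_scanned_node = MAX_NODES };
        int i;

        for (i = 0; i < 6; i++)
                printf("victim: %d\n", select_victim_node(&m));
        /* prints 0 2 3 0 2 3 - node 1 is skipped, first call wraps to node 0 */
        return 0;
}

Seeding last_scanned_node past the last node (the patch uses MAX_NUMNODES in mem_cgroup_create() below) makes the first call fail next_node() and wrap to the first eligible node.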
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                res_counter_init(&mem->memsw, NULL);
        }
        mem->last_scanned_child = 0;
+       mem->last_scanned_node = MAX_NUMNODES;
        INIT_LIST_HEAD(&mem->oom_notify);
 
        if (parent)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
+       int nid;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .gfp_mask = sc.gfp_mask,
        };
 
-       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+       /*
+        * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+        * care which node pages come from. So the node where we start the
+        * scan does not need to be the current node.
+        */
+       nid = mem_cgroup_select_victim_node(mem_cont);
+
+       zonelist = NODE_DATA(nid)->node_zonelists;
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                                sc.may_writepage,
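With the old line, every thread that hit its limit started reclaim at numa_node_id(), so threads packed onto one node would all hammer that node's LRU; starting from the round-robin victim node spreads the pressure. A toy illustration of the difference (user-space, 4 hypothetical nodes, all threads assumed to run on node 0, every node assumed eligible):

#include <stdio.h>

#define NODES 4

int main(void)
{
        int start_old[NODES] = {0}, start_new[NODES] = {0};
        int current_node = 0;   /* all limit-hitting threads sit on node 0 */
        int last = -1;          /* simplified seed; the kernel uses MAX_NUMNODES */
        int i;

        for (i = 0; i < 12; i++) {
                /* old: zonelist = NODE_DATA(numa_node_id())->node_zonelists */
                start_old[current_node]++;
                /* new: nid = mem_cgroup_select_victim_node(mem_cont) */
                last = (last + 1) % NODES;
                start_new[last]++;
        }
        for (i = 0; i < NODES; i++)
                printf("node %d: old=%d new=%d\n", i, start_old[i], start_new[i]);
        /* old piles all 12 scans on node 0; new spreads them 3/3/3/3 */
        return 0;
}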