 include/linux/memcontrol.h |   1 +
 mm/memcontrol.c            | 102 ++++++++++++++++++++++++++++++++++++++++++------
 mm/vmscan.c                |  10 +++++++++-
 3 files changed, 106 insertions(+), 7 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 0629121f2c0b..16052117131e 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
 					struct zone *zone,
 					enum lru_list lru);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fc62c714f3b6..1520efd1c7c4 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -231,6 +231,11 @@ struct mem_cgroup {
 	 * reclaimed from.
 	 */
 	int last_scanned_child;
+	int last_scanned_node;
+#if MAX_NUMNODES > 1
+	nodemask_t	scan_nodes;
+	unsigned long	next_scan_node_update;
+#endif
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
 	preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+	struct mem_cgroup_per_zone *mz;
+	u64 total = 0;
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		total += MEM_CGROUP_ZSTAT(mz, idx);
+	}
+	return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
 					enum lru_list idx)
 {
-	int nid, zid;
-	struct mem_cgroup_per_zone *mz;
+	int nid;
 	u64 total = 0;
 
 	for_each_online_node(nid)
-		for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-			mz = mem_cgroup_zoneinfo(mem, nid, zid);
-			total += MEM_CGROUP_ZSTAT(mz, idx);
-		}
+		total += mem_cgroup_get_zonestat_node(mem, nid, idx);
 	return total;
 }
 
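The hunk above factors the per-zone summation out of mem_cgroup_get_local_zonestat() into a per-node helper, so that later code can ask how much a single node holds. As a side illustration only (not part of the patch; the zstat[][] array, NR_NODES/NR_ZONES and the function names are made up), here is a standalone sketch of the same aggregation pattern:

#include <stdio.h>

#define NR_NODES 2
#define NR_ZONES 3

/* made-up per-node, per-zone counters standing in for MEM_CGROUP_ZSTAT() */
static unsigned long zstat[NR_NODES][NR_ZONES] = {
	{ 10, 20, 30 },		/* node 0 */
	{  5,  0, 15 },		/* node 1 */
};

/* analogue of mem_cgroup_get_zonestat_node(): one node, all of its zones */
static unsigned long node_total(int nid)
{
	unsigned long total = 0;
	int zid;

	for (zid = 0; zid < NR_ZONES; zid++)
		total += zstat[nid][zid];
	return total;
}

/* analogue of mem_cgroup_get_local_zonestat(): sum the helper over nodes */
static unsigned long all_nodes_total(void)
{
	unsigned long total = 0;
	int nid;

	for (nid = 0; nid < NR_NODES; nid++)
		total += node_total(nid);
	return total;
}

int main(void)
{
	/* prints: node 0: 60, node 1: 20, total: 80 */
	printf("node 0: %lu, node 1: %lu, total: %lu\n",
	       node_total(0), node_total(1), all_nodes_total());
	return 0;
}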
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
 	return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * or stale list here, we can start from some node and traverse all nodes
+ * based on the zonelist. So, update the list loosely, once every 10 seconds.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+	int nid;
+
+	if (time_after(mem->next_scan_node_update, jiffies))
+		return;
+
+	mem->next_scan_node_update = jiffies + 10*HZ;
+	/* make a nodemask where this memcg uses memory from */
+	mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+	for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+		if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+			continue;
+
+		if (total_swap_pages &&
+		    (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+		    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+			continue;
+		node_clear(nid, mem->scan_nodes);
+	}
+}
+
+/*
+ * Select a node to start reclaim from. Because all we need is to reduce the
+ * usage counter, starting from any node is fine. Reclaiming from the current
+ * node has both pros and cons:
+ *
+ * Freeing memory from the current node means freeing memory from a node we
+ * will use or have used, so it may hurt that node's LRU. And if several
+ * threads hit their limits, they will contend on that node. But freeing from
+ * a remote node costs more for reclaim because of memory latency.
+ *
+ * For now, we use round-robin. A better algorithm is welcome.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	int node;
+
+	mem_cgroup_may_update_nodemask(mem);
+	node = mem->last_scanned_node;
+
+	node = next_node(node, mem->scan_nodes);
+	if (node == MAX_NUMNODES)
+		node = first_node(mem->scan_nodes);
+	/*
+	 * We call this when we hit the limit, not when pages are added to the
+	 * LRU. The LRUs may hold no pages because all pages are UNEVICTABLE,
+	 * or the memcg is too small and its pages are not on any LRU yet. In
+	 * that case, use the current node.
+	 */
+	if (unlikely(node == MAX_NUMNODES))
+		node = numa_node_id();
+
+	mem->last_scanned_node = node;
+	return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+	return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
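As a side note (not part of the patch), the selection logic above is a plain round-robin walk over a cached nodemask with a remembered position. The standalone sketch below shows the same pattern using an unsigned long as the mask; next_set_bit(), first_set_bit() and select_victim_node() are made-up stand-ins for the kernel's next_node(), first_node() and mem_cgroup_select_victim_node():

#include <stdio.h>

#define MAX_NODES ((int)(sizeof(unsigned long) * 8))

/* return the first bit set at a position > prev, or MAX_NODES if none */
static int next_set_bit(unsigned long mask, int prev)
{
	int i;

	for (i = prev + 1; i < MAX_NODES; i++)
		if (mask & (1UL << i))
			return i;
	return MAX_NODES;
}

static int first_set_bit(unsigned long mask)
{
	return next_set_bit(mask, -1);
}

/* pick the node after *last, wrapping around; fall back to node 0 */
static int select_victim_node(unsigned long scan_nodes, int *last)
{
	int node = next_set_bit(scan_nodes, *last);

	if (node == MAX_NODES)
		node = first_set_bit(scan_nodes);
	if (node == MAX_NODES)	/* empty mask: the kernel uses numa_node_id(); use 0 here */
		node = 0;
	*last = node;
	return node;
}

int main(void)
{
	/* nodes 0, 1 and 3 hold memory charged to the group */
	unsigned long scan_nodes = (1UL << 0) | (1UL << 1) | (1UL << 3);
	int last = MAX_NODES;	/* matches last_scanned_node's initial value */
	int i;

	for (i = 0; i < 6; i++)
		printf("%d ", select_victim_node(scan_nodes, &last));
	printf("\n");		/* prints: 0 1 3 0 1 3 */
	return 0;
}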
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		res_counter_init(&mem->memsw, NULL);
 	}
 	mem->last_scanned_child = 0;
+	mem->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&mem->oom_notify);
 
 	if (parent)
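One more side note before the vmscan.c part: mem_cgroup_may_update_nodemask() above rebuilds scan_nodes at most once every 10 seconds using the usual jiffies/time_after() idiom. A standalone sketch of just that rate limiting (not from the patch; "now", REFRESH_PERIOD and maybe_refresh() are made-up stand-ins for jiffies, 10*HZ and the real function):

#include <stdio.h>

#define REFRESH_PERIOD 10UL

static unsigned long next_update;	/* analogue of next_scan_node_update */
static unsigned long refreshes;

/* return 1 and re-arm the deadline when a refresh is due, otherwise 0 */
static int maybe_refresh(unsigned long now)
{
	/* analogous to time_after(next_update, now): deadline still ahead? */
	if ((long)(next_update - now) > 0)
		return 0;

	next_update = now + REFRESH_PERIOD;
	refreshes++;		/* the real code rebuilds scan_nodes here */
	return 1;
}

int main(void)
{
	unsigned long now;

	for (now = 0; now < 35; now++)
		if (maybe_refresh(now))
			printf("refresh at t=%lu\n", now);
	/* refreshes happen at t=0, 10, 20, 30 */
	printf("%lu refreshes\n", refreshes);
	return 0;
}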
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 884ae08c16cc..b0875871820d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
 	struct zonelist *zonelist;
 	unsigned long nr_reclaimed;
+	int nid;
 	struct scan_control sc = {
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 		.gfp_mask = sc.gfp_mask,
 	};
 
-	zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+	/*
+	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+	 * care about where the pages come from, so the node where we start
+	 * the scan does not need to be the current node.
+	 */
+	nid = mem_cgroup_select_victim_node(mem_cont);
+
+	zonelist = NODE_DATA(nid)->node_zonelists;
 
 	trace_mm_vmscan_memcg_reclaim_begin(0,
 					sc.may_writepage,