Diffstat (limited to 'mm')
 -rw-r--r--  mm/highmem.c    |   6
 -rw-r--r--  mm/memcontrol.c | 406
 -rw-r--r--  mm/swap.c       |   1
 3 files changed, 302 insertions, 111 deletions
diff --git a/mm/highmem.c b/mm/highmem.c
index 781e754a75ac..693394daa2ed 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -29,6 +29,11 @@
 #include <linux/kgdb.h>
 #include <asm/tlbflush.h>
 
+
+#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
+DEFINE_PER_CPU(int, __kmap_atomic_idx);
+#endif
+
 /*
  * Virtual_count is not a pure "count".
  * 0 means that it is not mapped, and has not been mapped
@@ -43,7 +48,6 @@ unsigned long totalhigh_pages __read_mostly;
 EXPORT_SYMBOL(totalhigh_pages);
 
 
-DEFINE_PER_CPU(int, __kmap_atomic_idx);
 EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
 
 unsigned int nr_free_highpages (void)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -89,7 +89,10 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-	MEM_CGROUP_EVENTS,	/* incremented at every pagein/pageout */
+	MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */
+	/* incremented at every pagein/pageout */
+	MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA,
+	MEM_CGROUP_ON_MOVE,	/* someone is moving account between groups */
 
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -254,6 +257,12 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu *stat;
+	/*
+	 * used when a cpu is offlined or other synchronizations
+	 * See mem_cgroup_read_stat().
+	 */
+	struct mem_cgroup_stat_cpu nocpu_base;
+	spinlock_t pcp_counter_lock;
 };
 
 /* Stuffs for move charges at task migration. */
@@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 	return mz;
 }
 
+/*
+ * Implementation Note: reading percpu statistics for memcg.
+ *
+ * Both of vmstat[] and percpu_counter has threshold and do periodic
+ * synchronization to implement "quick" read. There are trade-off between
+ * reading cost and precision of value. Then, we may have a chance to implement
+ * a periodic synchronizion of counter in memcg's counter.
+ *
+ * But this _read() function is used for user interface now. The user accounts
+ * memory usage by memory cgroup and he _always_ requires exact value because
+ * he accounts memory. Even if we provide quick-and-fuzzy read, we always
+ * have to visit all online cpus and make sum. So, for now, unnecessary
+ * synchronization is not implemented. (just implemented for cpu hotplug)
+ *
+ * If there are kernel internal actions which can make use of some not-exact
+ * value, and reading all cpu value can be performance bottleneck in some
+ * common workload, threashold and synchonization as vmstat[] should be
+ * implemented.
+ */
 static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
 		enum mem_cgroup_stat_index idx)
 {
 	int cpu;
 	s64 val = 0;
 
-	for_each_possible_cpu(cpu)
+	get_online_cpus();
+	for_each_online_cpu(cpu)
 		val += per_cpu(mem->stat->count[idx], cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+	spin_lock(&mem->pcp_counter_lock);
+	val += mem->nocpu_base.count[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+#endif
+	put_online_cpus();
 	return val;
 }
 
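The read side above sums only online CPUs and then adds mem->nocpu_base, which holds whatever offlined CPUs had accumulated (the hotplug callback later in this patch does the draining). Below is a minimal userspace sketch of that scheme; the names, the fixed NR_CPUS, and the pthread mutex standing in for pcp_counter_lock and the CPU-hotplug locking are all illustrative assumptions, not the kernel's API.

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4

struct stat_cpu { int64_t count; };          /* per-"cpu" counter slot */

static struct stat_cpu per_cpu_stat[NR_CPUS];
static bool cpu_online[NR_CPUS] = { true, true, true, true };
static int64_t nocpu_base;                   /* counts drained from offlined cpus */
static pthread_mutex_t pcp_counter_lock = PTHREAD_MUTEX_INITIALIZER;

/* analogue of mem_cgroup_read_stat(): online cpus plus the drained remainder */
static int64_t read_stat(void)
{
	int64_t val = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_online[cpu])
			val += per_cpu_stat[cpu].count;
	pthread_mutex_lock(&pcp_counter_lock);
	val += nocpu_base;
	pthread_mutex_unlock(&pcp_counter_lock);
	return val;
}

/* analogue of draining a dead cpu: fold its count into the shared base */
static void drain_cpu(int cpu)
{
	pthread_mutex_lock(&pcp_counter_lock);
	nocpu_base += per_cpu_stat[cpu].count;
	per_cpu_stat[cpu].count = 0;
	cpu_online[cpu] = false;
	pthread_mutex_unlock(&pcp_counter_lock);
}

int main(void)
{
	per_cpu_stat[0].count = 5;
	per_cpu_stat[3].count = 7;
	drain_cpu(3);                                 /* "hotplug" cpu 3 away */
	printf("total = %ld\n", (long)read_stat());   /* still 12 */
	return 0;
}

The point of the split is that the hot path touches only its own per-CPU slot; the lock is needed only for the rare drain and for readers folding in the drained remainder.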
@@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 	return mem;
 }
 
-/*
- * Call callback function against all cgroup under hierarchy tree.
- */
-static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data,
-			int (*func)(struct mem_cgroup *, void *))
+/* The caller has to guarantee "mem" exists before calling this */
+static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem)
 {
-	int found, ret, nextid;
 	struct cgroup_subsys_state *css;
-	struct mem_cgroup *mem;
-
-	if (!root->use_hierarchy)
-		return (*func)(root, data);
+	int found;
 
-	nextid = 1;
-	do {
-		ret = 0;
+	if (!mem) /* ROOT cgroup has the smallest ID */
+		return root_mem_cgroup; /*css_put/get against root is ignored*/
+	if (!mem->use_hierarchy) {
+		if (css_tryget(&mem->css))
+			return mem;
+		return NULL;
+	}
+	rcu_read_lock();
+	/*
+	 * searching a memory cgroup which has the smallest ID under given
+	 * ROOT cgroup. (ID >= 1)
+	 */
+	css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found);
+	if (css && css_tryget(css))
+		mem = container_of(css, struct mem_cgroup, css);
+	else
 		mem = NULL;
+	rcu_read_unlock();
+	return mem;
+}
+
+static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter,
+		struct mem_cgroup *root,
+		bool cond)
+{
+	int nextid = css_id(&iter->css) + 1;
+	int found;
+	int hierarchy_used;
+	struct cgroup_subsys_state *css;
+
+	hierarchy_used = iter->use_hierarchy;
 
+	css_put(&iter->css);
+	/* If no ROOT, walk all, ignore hierarchy */
+	if (!cond || (root && !hierarchy_used))
+		return NULL;
+
+	if (!root)
+		root = root_mem_cgroup;
+
+	do {
+		iter = NULL;
 		rcu_read_lock();
-		css = css_get_next(&mem_cgroup_subsys, nextid, &root->css,
-				   &found);
+
+		css = css_get_next(&mem_cgroup_subsys, nextid,
+				&root->css, &found);
 		if (css && css_tryget(css))
-			mem = container_of(css, struct mem_cgroup, css);
+			iter = container_of(css, struct mem_cgroup, css);
 		rcu_read_unlock();
-
-		if (mem) {
-			ret = (*func)(mem, data);
-			css_put(&mem->css);
-		}
+		/* If css is NULL, no more cgroups will be found */
 		nextid = found + 1;
-	} while (!ret && css);
+	} while (css && !iter);
 
-	return ret;
+	return iter;
 }
+/*
+ * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please
+ * be careful that "break" loop is not allowed. We have reference count.
+ * Instead of that modify "cond" to be false and "continue" to exit the loop.
+ */
+#define for_each_mem_cgroup_tree_cond(iter, root, cond)	\
+	for (iter = mem_cgroup_start_loop(root);\
+	     iter != NULL;\
+	     iter = mem_cgroup_get_next(iter, root, cond))
+
+#define for_each_mem_cgroup_tree(iter, root) \
+	for_each_mem_cgroup_tree_cond(iter, root, true)
+
+#define for_each_mem_cgroup_all(iter) \
+	for_each_mem_cgroup_tree_cond(iter, NULL, true)
+
 
 static inline bool mem_cgroup_is_root(struct mem_cgroup *mem)
 {
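A note on using the new iterator: mem_cgroup_get_next() drops the css reference of the cgroup it is leaving before looking up the next one, so leaving the loop with a plain break would keep a reference pinned, which is why the comment asks callers to clear cond instead. The self-contained sketch below mimics that calling convention with a hypothetical refcounted list; walk_start()/walk_next() and struct node are illustrative stand-ins, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

/* refcounted node standing in for a memory cgroup in the hierarchy */
struct node {
	int id;
	int refcnt;
	struct node *next;
};

static struct node n3 = { 3, 0, NULL };
static struct node n2 = { 2, 0, &n3 };
static struct node n1 = { 1, 0, &n2 };

static struct node *node_get(struct node *n) { if (n) n->refcnt++; return n; }
static void node_put(struct node *n) { if (n) n->refcnt--; }

/* analogue of mem_cgroup_start_loop(): take a reference on the first node */
static struct node *walk_start(struct node *root)
{
	return node_get(root);
}

/*
 * analogue of mem_cgroup_get_next(): always drop the reference on the node
 * we are leaving, and only then decide whether to continue.  This is why a
 * plain "break" inside the loop body would leak a reference.
 */
static struct node *walk_next(struct node *iter, bool cond)
{
	struct node *next = iter->next;

	node_put(iter);
	if (!cond)
		return NULL;
	return node_get(next);
}

#define for_each_node_cond(iter, root, cond) \
	for (iter = walk_start(root); iter != NULL; iter = walk_next(iter, cond))

int main(void)
{
	struct node *iter;
	bool cond = true;

	for_each_node_cond(iter, &n1, cond) {
		printf("visiting %d\n", iter->id);
		if (iter->id == 2)
			cond = false;	/* instead of "break": let walk_next() drop the ref */
	}
	/* every refcount is back to zero, including the node we stopped on */
	printf("leaked refs: %d %d %d\n", n1.refcnt, n2.refcnt, n3.refcnt);
	return 0;
}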
@@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
 	return swappiness;
 }
 
-/* A routine for testing mem is not under move_account */
+static void mem_cgroup_start_move(struct mem_cgroup *mem)
+{
+	int cpu;
+
+	get_online_cpus();
+	spin_lock(&mem->pcp_counter_lock);
+	for_each_online_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1;
+	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1;
+	spin_unlock(&mem->pcp_counter_lock);
+	put_online_cpus();
+
+	synchronize_rcu();
+}
+
+static void mem_cgroup_end_move(struct mem_cgroup *mem)
+{
+	int cpu;
+
+	if (!mem)
+		return;
+	get_online_cpus();
+	spin_lock(&mem->pcp_counter_lock);
+	for_each_online_cpu(cpu)
+		per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1;
+	mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1;
+	spin_unlock(&mem->pcp_counter_lock);
+	put_online_cpus();
+}
+/*
+ * 2 routines for checking "mem" is under move_account() or not.
+ *
+ * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used
+ *			  for avoiding race in accounting. If true,
+ *			  pc->mem_cgroup may be overwritten.
+ *
+ * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or
+ *			  under hierarchy of moving cgroups. This is for
+ *			  waiting at hith-memory prressure caused by "move".
+ */
+
+static bool mem_cgroup_stealed(struct mem_cgroup *mem)
+{
+	VM_BUG_ON(!rcu_read_lock_held());
+	return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0;
+}
 
 static bool mem_cgroup_under_move(struct mem_cgroup *mem)
 {
@@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
 	return false;
 }
 
-static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
-{
-	int *val = data;
-	(*val)++;
-	return 0;
-}
-
 /**
  * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
  * @memcg: The memory cgroup that went over limit
@@ -1173,7 +1289,10 @@ done:
 static int mem_cgroup_count_children(struct mem_cgroup *mem)
 {
 	int num = 0;
-	mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb);
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, mem)
+		num++;
 	return num;
 }
 
@@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 	return total;
 }
 
-static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
-{
-	int *val = (int *)data;
-	int x;
-	/*
-	 * Logically, we can stop scanning immediately when we find
-	 * a memcg is already locked. But condidering unlock ops and
-	 * creation/removal of memcg, scan-all is simple operation.
-	 */
-	x = atomic_inc_return(&mem->oom_lock);
-	*val = max(x, *val);
-	return 0;
-}
 /*
  * Check OOM-Killer is already running under our hierarchy.
  * If someone is running, return false.
  */
 static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
 {
-	int lock_count = 0;
+	int x, lock_count = 0;
+	struct mem_cgroup *iter;
 
-	mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+	for_each_mem_cgroup_tree(iter, mem) {
+		x = atomic_inc_return(&iter->oom_lock);
+		lock_count = max(x, lock_count);
+	}
 
 	if (lock_count == 1)
 		return true;
 	return false;
 }
 
-static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock(struct mem_cgroup *mem)
 {
+	struct mem_cgroup *iter;
+
 	/*
 	 * When a new child is created while the hierarchy is under oom,
 	 * mem_cgroup_oom_lock() may not be called. We have to use
 	 * atomic_add_unless() here.
 	 */
-	atomic_add_unless(&mem->oom_lock, -1, 0);
+	for_each_mem_cgroup_tree(iter, mem)
+		atomic_add_unless(&iter->oom_lock, -1, 0);
 	return 0;
 }
 
-static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
-{
-	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);
-}
 
 static DEFINE_MUTEX(memcg_oom_mutex);
 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
@@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
 /*
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
+ *
+ * Notes: Race condition
+ *
+ * We usually use page_cgroup_lock() for accessing page_cgroup member but
+ * it tends to be costly. But considering some conditions, we doesn't need
+ * to do so _always_.
+ *
+ * Considering "charge", lock_page_cgroup() is not required because all
+ * file-stat operations happen after a page is attached to radix-tree. There
+ * are no race with "charge".
+ *
+ * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
+ * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
+ * if there are race with "uncharge". Statistics itself is properly handled
+ * by flags.
+ *
+ * Considering "move", this is an only case we see a race. To make the race
+ * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are
+ * possibility of race condition. If there is, we take a lock.
  */
-void mem_cgroup_update_file_mapped(struct page *page, int val)
+
+static void mem_cgroup_update_file_stat(struct page *page, int idx, int val)
 {
 	struct mem_cgroup *mem;
-	struct page_cgroup *pc;
+	struct page_cgroup *pc = lookup_page_cgroup(page);
+	bool need_unlock = false;
 
-	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
 
-	lock_page_cgroup(pc);
+	rcu_read_lock();
 	mem = pc->mem_cgroup;
-	if (!mem || !PageCgroupUsed(pc))
-		goto done;
+	if (unlikely(!mem || !PageCgroupUsed(pc)))
+		goto out;
+	/* pc->mem_cgroup is unstable ? */
+	if (unlikely(mem_cgroup_stealed(mem))) {
+		/* take a lock against to access pc->mem_cgroup */
+		lock_page_cgroup(pc);
+		need_unlock = true;
+		mem = pc->mem_cgroup;
+		if (!mem || !PageCgroupUsed(pc))
+			goto out;
+	}
 
-	/*
-	 * Preemption is already disabled. We can use __this_cpu_xxx
-	 */
-	if (val > 0) {
-		__this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		SetPageCgroupFileMapped(pc);
-	} else {
-		__this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
-		ClearPageCgroupFileMapped(pc);
+	this_cpu_add(mem->stat->count[idx], val);
+
+	switch (idx) {
+	case MEM_CGROUP_STAT_FILE_MAPPED:
+		if (val > 0)
+			SetPageCgroupFileMapped(pc);
+		else if (!page_mapped(page))
+			ClearPageCgroupFileMapped(pc);
+		break;
+	default:
+		BUG();
 	}
 
-done:
-	unlock_page_cgroup(pc);
+out:
+	if (unlikely(need_unlock))
+		unlock_page_cgroup(pc);
+	rcu_read_unlock();
+	return;
+}
+
+void mem_cgroup_update_file_mapped(struct page *page, int val)
+{
+	mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val);
 }
 
 /*
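mem_cgroup_update_file_stat() now runs locklessly under rcu_read_lock() and only falls back to lock_page_cgroup() when mem_cgroup_stealed() reports a move in flight; the mover raises the MEM_CGROUP_ON_MOVE counters in mem_cgroup_start_move() and then calls synchronize_rcu() so that updaters which missed the flag have already finished. Here is a rough userspace analogue of the fast-path/slow-path split; the names are hypothetical, a single atomic counter replaces the per-CPU flags, and the RCU grace period is not modeled.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* stand-in for the MEM_CGROUP_ON_MOVE per-cpu counters: >0 while a move runs */
static atomic_int accounts_moving;
static pthread_mutex_t page_cgroup_lock = PTHREAD_MUTEX_INITIALIZER;

static long file_mapped;	/* the statistic being updated */

/* analogue of mem_cgroup_update_file_stat(): lockless unless a move is live */
static void update_file_stat(int val)
{
	bool need_unlock = false;

	if (atomic_load(&accounts_moving) > 0) {
		/* the owner may be rewritten concurrently: fall back to the lock */
		pthread_mutex_lock(&page_cgroup_lock);
		need_unlock = true;
	}
	file_mapped += val;
	if (need_unlock)
		pthread_mutex_unlock(&page_cgroup_lock);
}

/* analogue of mem_cgroup_start_move()/mem_cgroup_end_move() */
static void start_move(void) { atomic_fetch_add(&accounts_moving, 1); }
static void end_move(void)   { atomic_fetch_sub(&accounts_moving, 1); }

int main(void)
{
	update_file_stat(+1);	/* fast path: no mover, no lock taken */
	start_move();
	update_file_stat(-1);	/* slow path: mover active, lock taken */
	end_move();
	printf("file_mapped = %ld\n", file_mapped);
	return 0;
}

In the kernel the fast path additionally relies on this_cpu_add() being safe against other updaters on the same CPU, which the plain += above does not capture.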
@@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void)
 	atomic_dec(&memcg_drain_count);
 }
 
-static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+/*
+ * This function drains percpu counter value from DEAD cpu and
+ * move it to local cpu. Note that this function can be preempted.
+ */
+static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu)
+{
+	int i;
+
+	spin_lock(&mem->pcp_counter_lock);
+	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
+		s64 x = per_cpu(mem->stat->count[i], cpu);
+
+		per_cpu(mem->stat->count[i], cpu) = 0;
+		mem->nocpu_base.count[i] += x;
+	}
+	/* need to clear ON_MOVE value, works as a kind of lock. */
+	per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0;
+	spin_unlock(&mem->pcp_counter_lock);
+}
+
+static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu)
+{
+	int idx = MEM_CGROUP_ON_MOVE;
+
+	spin_lock(&mem->pcp_counter_lock);
+	per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx];
+	spin_unlock(&mem->pcp_counter_lock);
+}
+
+static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
+	struct mem_cgroup *iter;
+
+	if ((action == CPU_ONLINE)) {
+		for_each_mem_cgroup_all(iter)
+			synchronize_mem_cgroup_on_move(iter, cpu);
+		return NOTIFY_OK;
+	}
 
-	if (action != CPU_DEAD)
+	if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
+
+	for_each_mem_cgroup_all(iter)
+		mem_cgroup_drain_pcp_counter(iter, cpu);
+
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
@@ -3038,6 +3226,7 @@ move_account:
 	lru_add_drain_all();
 	drain_all_stock_sync();
 	ret = 0;
+	mem_cgroup_start_move(mem);
 	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
 			enum lru_list l;
@@ -3051,6 +3240,7 @@ move_account:
 		if (ret)
 			break;
 	}
+	mem_cgroup_end_move(mem);
 	memcg_oom_recover(mem);
 	/* it seems parent cgroup doesn't have enough mem */
 	if (ret == -ENOMEM)
@@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft,
 	return retval;
 }
 
-struct mem_cgroup_idx_data {
-	s64 val;
-	enum mem_cgroup_stat_index idx;
-};
 
-static int
-mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
+static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
+		enum mem_cgroup_stat_index idx)
 {
-	struct mem_cgroup_idx_data *d = data;
-	d->val += mem_cgroup_read_stat(mem, d->idx);
-	return 0;
-}
+	struct mem_cgroup *iter;
+	s64 val = 0;
 
-static void
-mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem,
-		enum mem_cgroup_stat_index idx, s64 *val)
-{
-	struct mem_cgroup_idx_data d;
-	d.idx = idx;
-	d.val = 0;
-	mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat);
-	*val = d.val;
+	/* each per cpu's value can be minus.Then, use s64 */
+	for_each_mem_cgroup_tree(iter, mem)
+		val += mem_cgroup_read_stat(iter, idx);
+
+	if (val < 0) /* race ? */
+		val = 0;
+	return val;
 }
 
 static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 {
-	u64 idx_val, val;
+	u64 val;
 
 	if (!mem_cgroup_is_root(mem)) {
 		if (!swap)
@@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap)
 		return res_counter_read_u64(&mem->memsw, RES_USAGE);
 	}
 
-	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val);
-	val = idx_val;
-	mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val);
-	val += idx_val;
+	val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS);
 
-	if (swap) {
-		mem_cgroup_get_recursive_idx_stat(mem,
-				MEM_CGROUP_STAT_SWAPOUT, &idx_val);
-		val += idx_val;
-	}
+	if (swap)
+		val += mem_cgroup_get_recursive_idx_stat(mem,
+				MEM_CGROUP_STAT_SWAPOUT);
 
 	return val << PAGE_SHIFT;
 }
@@ -3389,9 +3567,9 @@ struct {
 };
 
 
-static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
+static void
+mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-	struct mcs_total_stat *s = data;
 	s64 val;
 
 	/* per cpu stat */
@@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE;
 	val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE);
 	s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE;
-	return 0;
 }
 
 static void
 mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s)
 {
-	mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat);
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, mem)
+		mem_cgroup_get_local_stat(iter, s);
 }
 
 static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
@@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b)
 	return _a->threshold - _b->threshold;
 }
 
-static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem)
 {
 	struct mem_cgroup_eventfd_list *ev;
 
@@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data)
 
 static void mem_cgroup_oom_notify(struct mem_cgroup *mem)
 {
-	mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb);
+	struct mem_cgroup *iter;
+
+	for_each_mem_cgroup_tree(iter, mem)
+		mem_cgroup_oom_notify_cb(iter);
 }
 
 static int mem_cgroup_usage_register_event(struct cgroup *cgrp,
@@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 		vfree(mem);
 		mem = NULL;
 	}
+	spin_lock_init(&mem->pcp_counter_lock);
 	return mem;
 }
 
@@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 					&per_cpu(memcg_stock, cpu);
 			INIT_WORK(&stock->work, drain_local_stock);
 		}
-		hotcpu_notifier(memcg_stock_cpu_callback, 0);
+		hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
 		mem->use_hierarchy = parent->use_hierarchy;
@@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void)
 	mc.to = NULL;
 	mc.moving_task = NULL;
 	spin_unlock(&mc.lock);
+	mem_cgroup_end_move(from);
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
 			VM_BUG_ON(mc.moved_charge);
 			VM_BUG_ON(mc.moved_swap);
 			VM_BUG_ON(mc.moving_task);
+			mem_cgroup_start_move(from);
 			spin_lock(&mc.lock);
 			mc.from = from;
 			mc.to = mem;
diff --git a/mm/swap.c b/mm/swap.c
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -378,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold)
 
 	pagevec_free(&pages_to_free);
 }
+EXPORT_SYMBOL(release_pages);
 
 /*
  * The pages which we're about to release may be in the deferred lru-addition