| field | value | date |
|---|---|---|
| author | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-27 21:42:52 -0400 |
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2010-10-27 21:42:52 -0400 |
| commit | 17bb51d56cdc8cbf252031db3107de034cfeb44c (patch) | |
| tree | f9fb2c16b29a152d3413fa0028e660e3b6146584 /mm/memcontrol.c | |
| parent | 0671b7674f42ab3a200401ea0e48d6f47d34acae (diff) | |
| parent | 95aac7b1cd224f568fb83937044cd303ff11b029 (diff) | |
Merge branch 'akpm-incoming-2'
* akpm-incoming-2: (139 commits)
epoll: make epoll_wait() use the hrtimer range feature
select: rename estimate_accuracy() to select_estimate_accuracy()
Remove duplicate includes from many files
ramoops: use the platform data structure instead of module params
kernel/resource.c: handle reinsertion of an already-inserted resource
kfifo: fix kfifo_alloc() to return a signed int value
w1: don't allow arbitrary users to remove w1 devices
alpha: remove dma64_addr_t usage
mips: remove dma64_addr_t usage
sparc: remove dma64_addr_t usage
fuse: use release_pages()
taskstats: use real microsecond granularity for CPU times
taskstats: split fill_pid function
taskstats: separate taskstats commands
delayacct: align to 8 byte boundary on 64-bit systems
delay-accounting: reimplement -c for getdelays.c to report information on a target command
namespaces Kconfig: move namespace menu location after the cgroup
namespaces Kconfig: remove the cgroup device whitelist experimental tag
namespaces Kconfig: remove pointless cgroup dependency
namespaces Kconfig: make namespace a submenu
...
Diffstat (limited to 'mm/memcontrol.c')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | mm/memcontrol.c | 406 |

1 file changed, 296 insertions, 110 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9be3cf8a5da4..9a99cfaf0a19 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -89,7 +89,10 @@ enum mem_cgroup_stat_index { | |||
| 89 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 89 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
| 90 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 90 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
| 91 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | 91 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ |
| 92 | MEM_CGROUP_EVENTS, /* incremented at every pagein/pageout */ | 92 | MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ |
| 93 | /* incremented at every pagein/pageout */ | ||
| 94 | MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, | ||
| 95 | MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ | ||
| 93 | 96 | ||
| 94 | MEM_CGROUP_STAT_NSTATS, | 97 | MEM_CGROUP_STAT_NSTATS, |
| 95 | }; | 98 | }; |
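The hunk above splits the stat array into two regions: indices below MEM_CGROUP_STAT_DATA are accumulated data that must survive CPU hotplug, while MEM_CGROUP_EVENTS (which aliases the boundary value) and MEM_CGROUP_ON_MOVE are transient counters handled separately. A minimal sketch of why the boundary is useful follows; only the enum names come from this patch, the folding loop itself is illustrative.

```c
/* Illustrative only: fold just the "real" data counters of one CPU into a
 * destination array, leaving the transient entries (EVENTS, ON_MOVE) alone.
 * This mirrors what mem_cgroup_drain_pcp_counter() does later in the patch. */
static void fold_data_counters(s64 *dst, s64 *src)
{
	int i;

	for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) {
		dst[i] += src[i];
		src[i] = 0;
	}
}
```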
| @@ -254,6 +257,12 @@ struct mem_cgroup { | |||
| 254 | * percpu counter. | 257 | * percpu counter. |
| 255 | */ | 258 | */ |
| 256 | struct mem_cgroup_stat_cpu *stat; | 259 | struct mem_cgroup_stat_cpu *stat; |
| 260 | /* | ||
| 261 | * used when a cpu is offlined or other synchronizations | ||
| 262 | * See mem_cgroup_read_stat(). | ||
| 263 | */ | ||
| 264 | struct mem_cgroup_stat_cpu nocpu_base; | ||
| 265 | spinlock_t pcp_counter_lock; | ||
| 257 | }; | 266 | }; |
| 258 | 267 | ||
| 259 | /* Stuffs for move charges at task migration. */ | 268 | /* Stuffs for move charges at task migration. */ |
| @@ -530,14 +539,40 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | |||
| 530 | return mz; | 539 | return mz; |
| 531 | } | 540 | } |
| 532 | 541 | ||
| 542 | /* | ||
| 543 | * Implementation Note: reading percpu statistics for memcg. | ||
| 544 | * | ||
| 545 | * Both vmstat[] and percpu_counter have thresholds and do periodic | ||
| 546 | * synchronization to implement a "quick" read. There is a trade-off between | ||
| 547 | * reading cost and precision of the value. So we may eventually implement | ||
| 548 | * periodic synchronization of the counters in memcg as well. | ||
| 549 | * | ||
| 550 | * But this _read() function is used for the user interface now. The user accounts | ||
| 551 | * memory usage by memory cgroup and _always_ requires an exact value because | ||
| 552 | * of that accounting. Even with a quick-and-fuzzy read, we would always | ||
| 553 | * have to visit all online cpus and compute a sum. So, for now, unnecessary | ||
| 554 | * synchronization is not implemented (it is only implemented for cpu hotplug). | ||
| 555 | * | ||
| 556 | * If there are kernel-internal users which could work with a non-exact | ||
| 557 | * value, and reading all cpu values becomes a performance bottleneck in some | ||
| 558 | * common workload, a threshold and synchronization as in vmstat[] should be | ||
| 559 | * implemented. | ||
| 560 | */ | ||
| 533 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, | 561 | static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, |
| 534 | enum mem_cgroup_stat_index idx) | 562 | enum mem_cgroup_stat_index idx) |
| 535 | { | 563 | { |
| 536 | int cpu; | 564 | int cpu; |
| 537 | s64 val = 0; | 565 | s64 val = 0; |
| 538 | 566 | ||
| 539 | for_each_possible_cpu(cpu) | 567 | get_online_cpus(); |
| 568 | for_each_online_cpu(cpu) | ||
| 540 | val += per_cpu(mem->stat->count[idx], cpu); | 569 | val += per_cpu(mem->stat->count[idx], cpu); |
| 570 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 571 | spin_lock(&mem->pcp_counter_lock); | ||
| 572 | val += mem->nocpu_base.count[idx]; | ||
| 573 | spin_unlock(&mem->pcp_counter_lock); | ||
| 574 | #endif | ||
| 575 | put_online_cpus(); | ||
| 541 | return val; | 576 | return val; |
| 542 | } | 577 | } |
| 543 | 578 | ||
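The rework above makes an exact read walk only the online CPUs (under get_online_cpus()) and then fold in mem->nocpu_base, the bucket where counters drained from offlined CPUs are parked. Below is a small, self-contained userspace model of that read path; the names (stat_model, model_read_stat) are ours, and a pthread mutex stands in for pcp_counter_lock.

```c
#include <pthread.h>

#define NR_CPUS_MODEL  8
#define NR_STATS_MODEL 4

/* counters[cpu][idx] stands in for per_cpu(mem->stat->count[idx], cpu);
 * offline_base[] plays the role of mem->nocpu_base. */
struct stat_model {
	long long counters[NR_CPUS_MODEL][NR_STATS_MODEL];
	long long offline_base[NR_STATS_MODEL];
	int online[NR_CPUS_MODEL];
	pthread_mutex_t lock;	/* stands in for mem->pcp_counter_lock */
};

/* Exact read: sum every online CPU, then add whatever was drained from
 * CPUs that have since gone away. */
static long long model_read_stat(struct stat_model *m, int idx)
{
	long long val = 0;
	int cpu;

	for (cpu = 0; cpu < NR_CPUS_MODEL; cpu++)
		if (m->online[cpu])
			val += m->counters[cpu][idx];

	pthread_mutex_lock(&m->lock);
	val += m->offline_base[idx];
	pthread_mutex_unlock(&m->lock);

	return val;
}
```

Without the offline bucket, a counter incremented on a CPU that later goes offline would simply vanish from the sum; that is what the hotplug hunk further down guards against.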
| @@ -659,40 +694,83 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) | |||
| 659 | return mem; | 694 | return mem; |
| 660 | } | 695 | } |
| 661 | 696 | ||
| 662 | /* | 697 | /* The caller has to guarantee "mem" exists before calling this */ |
| 663 | * Call callback function against all cgroup under hierarchy tree. | 698 | static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) |
| 664 | */ | ||
| 665 | static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | ||
| 666 | int (*func)(struct mem_cgroup *, void *)) | ||
| 667 | { | 699 | { |
| 668 | int found, ret, nextid; | ||
| 669 | struct cgroup_subsys_state *css; | 700 | struct cgroup_subsys_state *css; |
| 670 | struct mem_cgroup *mem; | 701 | int found; |
| 671 | |||
| 672 | if (!root->use_hierarchy) | ||
| 673 | return (*func)(root, data); | ||
| 674 | 702 | ||
| 675 | nextid = 1; | 703 | if (!mem) /* ROOT cgroup has the smallest ID */ |
| 676 | do { | 704 | return root_mem_cgroup; /*css_put/get against root is ignored*/ |
| 677 | ret = 0; | 705 | if (!mem->use_hierarchy) { |
| 706 | if (css_tryget(&mem->css)) | ||
| 707 | return mem; | ||
| 708 | return NULL; | ||
| 709 | } | ||
| 710 | rcu_read_lock(); | ||
| 711 | /* | ||
| 712 | * searching a memory cgroup which has the smallest ID under given | ||
| 713 | * ROOT cgroup. (ID >= 1) | ||
| 714 | */ | ||
| 715 | css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); | ||
| 716 | if (css && css_tryget(css)) | ||
| 717 | mem = container_of(css, struct mem_cgroup, css); | ||
| 718 | else | ||
| 678 | mem = NULL; | 719 | mem = NULL; |
| 720 | rcu_read_unlock(); | ||
| 721 | return mem; | ||
| 722 | } | ||
| 723 | |||
| 724 | static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, | ||
| 725 | struct mem_cgroup *root, | ||
| 726 | bool cond) | ||
| 727 | { | ||
| 728 | int nextid = css_id(&iter->css) + 1; | ||
| 729 | int found; | ||
| 730 | int hierarchy_used; | ||
| 731 | struct cgroup_subsys_state *css; | ||
| 732 | |||
| 733 | hierarchy_used = iter->use_hierarchy; | ||
| 679 | 734 | ||
| 735 | css_put(&iter->css); | ||
| 736 | /* If no ROOT, walk all, ignore hierarchy */ | ||
| 737 | if (!cond || (root && !hierarchy_used)) | ||
| 738 | return NULL; | ||
| 739 | |||
| 740 | if (!root) | ||
| 741 | root = root_mem_cgroup; | ||
| 742 | |||
| 743 | do { | ||
| 744 | iter = NULL; | ||
| 680 | rcu_read_lock(); | 745 | rcu_read_lock(); |
| 681 | css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, | 746 | |
| 682 | &found); | 747 | css = css_get_next(&mem_cgroup_subsys, nextid, |
| 748 | &root->css, &found); | ||
| 683 | if (css && css_tryget(css)) | 749 | if (css && css_tryget(css)) |
| 684 | mem = container_of(css, struct mem_cgroup, css); | 750 | iter = container_of(css, struct mem_cgroup, css); |
| 685 | rcu_read_unlock(); | 751 | rcu_read_unlock(); |
| 686 | 752 | /* If css is NULL, no more cgroups will be found */ | |
| 687 | if (mem) { | ||
| 688 | ret = (*func)(mem, data); | ||
| 689 | css_put(&mem->css); | ||
| 690 | } | ||
| 691 | nextid = found + 1; | 753 | nextid = found + 1; |
| 692 | } while (!ret && css); | 754 | } while (css && !iter); |
| 693 | 755 | ||
| 694 | return ret; | 756 | return iter; |
| 695 | } | 757 | } |
| 758 | /* | ||
| 759 | * for_each_mem_cgroup_tree() visits all cgroups under the tree. Please | ||
| 760 | * be careful that breaking out of the loop is not allowed: we hold a reference count. | ||
| 761 | * Instead, set "cond" to false and "continue" to exit the loop. | ||
| 762 | */ | ||
| 763 | #define for_each_mem_cgroup_tree_cond(iter, root, cond) \ | ||
| 764 | for (iter = mem_cgroup_start_loop(root);\ | ||
| 765 | iter != NULL;\ | ||
| 766 | iter = mem_cgroup_get_next(iter, root, cond)) | ||
| 767 | |||
| 768 | #define for_each_mem_cgroup_tree(iter, root) \ | ||
| 769 | for_each_mem_cgroup_tree_cond(iter, root, true) | ||
| 770 | |||
| 771 | #define for_each_mem_cgroup_all(iter) \ | ||
| 772 | for_each_mem_cgroup_tree_cond(iter, NULL, true) | ||
| 773 | |||
| 696 | 774 | ||
| 697 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | 775 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) |
| 698 | { | 776 | { |
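The callback-driven mem_cgroup_walk_tree() is replaced above by an explicit iterator pair plus three macros. Because the iterator keeps a css reference on the current group, a plain `break` would leak that reference; loops that need to stop early flip the `cond` flag instead, as the comment warns. A hedged usage fragment written against the new macros (do_scan() and the budget are hypothetical):

```c
/* Counting the subtree -- exactly what the patched mem_cgroup_count_children()
 * does further down in this diff. */
static int count_tree(struct mem_cgroup *root)
{
	struct mem_cgroup *iter;
	int num = 0;

	for_each_mem_cgroup_tree(iter, root)
		num++;
	return num;
}

/* Stopping early without "break": clear the condition and keep iterating so
 * mem_cgroup_get_next() gets a chance to drop its css reference. */
static void scan_some(struct mem_cgroup *root, int budget)
{
	struct mem_cgroup *iter;
	bool cond = true;

	for_each_mem_cgroup_tree_cond(iter, root, cond) {
		do_scan(iter);		/* hypothetical per-group work */
		if (--budget <= 0)
			cond = false;
	}
}
```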
| @@ -1051,7 +1129,52 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) | |||
| 1051 | return swappiness; | 1129 | return swappiness; |
| 1052 | } | 1130 | } |
| 1053 | 1131 | ||
| 1054 | /* A routine for testing mem is not under move_account */ | 1132 | static void mem_cgroup_start_move(struct mem_cgroup *mem) |
| 1133 | { | ||
| 1134 | int cpu; | ||
| 1135 | |||
| 1136 | get_online_cpus(); | ||
| 1137 | spin_lock(&mem->pcp_counter_lock); | ||
| 1138 | for_each_online_cpu(cpu) | ||
| 1139 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; | ||
| 1140 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; | ||
| 1141 | spin_unlock(&mem->pcp_counter_lock); | ||
| 1142 | put_online_cpus(); | ||
| 1143 | |||
| 1144 | synchronize_rcu(); | ||
| 1145 | } | ||
| 1146 | |||
| 1147 | static void mem_cgroup_end_move(struct mem_cgroup *mem) | ||
| 1148 | { | ||
| 1149 | int cpu; | ||
| 1150 | |||
| 1151 | if (!mem) | ||
| 1152 | return; | ||
| 1153 | get_online_cpus(); | ||
| 1154 | spin_lock(&mem->pcp_counter_lock); | ||
| 1155 | for_each_online_cpu(cpu) | ||
| 1156 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; | ||
| 1157 | mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; | ||
| 1158 | spin_unlock(&mem->pcp_counter_lock); | ||
| 1159 | put_online_cpus(); | ||
| 1160 | } | ||
| 1161 | /* | ||
| 1162 | * Two routines for checking whether "mem" is under move_account() or not. | ||
| 1163 | * | ||
| 1164 | * mem_cgroup_stealed() - checks whether a cgroup is mc.from or not. This is used | ||
| 1165 | * for avoiding races in accounting. If true, | ||
| 1166 | * pc->mem_cgroup may be overwritten. | ||
| 1167 | * | ||
| 1168 | * mem_cgroup_under_move() - checks whether a cgroup is mc.from or mc.to or | ||
| 1169 | * under the hierarchy of moving cgroups. This is for | ||
| 1170 | * waiting at high memory pressure caused by "move". | ||
| 1171 | */ | ||
| 1172 | |||
| 1173 | static bool mem_cgroup_stealed(struct mem_cgroup *mem) | ||
| 1174 | { | ||
| 1175 | VM_BUG_ON(!rcu_read_lock_held()); | ||
| 1176 | return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; | ||
| 1177 | } | ||
| 1055 | 1178 | ||
| 1056 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) | 1179 | static bool mem_cgroup_under_move(struct mem_cgroup *mem) |
| 1057 | { | 1180 | { |
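mem_cgroup_start_move() raises MEM_CGROUP_ON_MOVE on every online CPU and in nocpu_base, then calls synchronize_rcu() so that any statistics updater which sampled the old value has left its RCU read-side section before charges actually start moving; mem_cgroup_end_move() undoes the bump. A hedged sketch of the intended pairing (the real call sites are added later in this patch, around force_empty and task attach; the moving logic itself is elided):

```c
/* Illustrative pairing only. */
static void move_charges_window(struct mem_cgroup *from)
{
	mem_cgroup_start_move(from);	/* stat updaters now see ON_MOVE > 0 and lock */

	/* ... move pages and charges out of "from" ... */

	mem_cgroup_end_move(from);	/* lockless fast path resumes */
}
```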
| @@ -1092,13 +1215,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) | |||
| 1092 | return false; | 1215 | return false; |
| 1093 | } | 1216 | } |
| 1094 | 1217 | ||
| 1095 | static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) | ||
| 1096 | { | ||
| 1097 | int *val = data; | ||
| 1098 | (*val)++; | ||
| 1099 | return 0; | ||
| 1100 | } | ||
| 1101 | |||
| 1102 | /** | 1218 | /** |
| 1103 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. | 1219 | * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. |
| 1104 | * @memcg: The memory cgroup that went over limit | 1220 | * @memcg: The memory cgroup that went over limit |
| @@ -1173,7 +1289,10 @@ done: | |||
| 1173 | static int mem_cgroup_count_children(struct mem_cgroup *mem) | 1289 | static int mem_cgroup_count_children(struct mem_cgroup *mem) |
| 1174 | { | 1290 | { |
| 1175 | int num = 0; | 1291 | int num = 0; |
| 1176 | mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); | 1292 | struct mem_cgroup *iter; |
| 1293 | |||
| 1294 | for_each_mem_cgroup_tree(iter, mem) | ||
| 1295 | num++; | ||
| 1177 | return num; | 1296 | return num; |
| 1178 | } | 1297 | } |
| 1179 | 1298 | ||
| @@ -1322,49 +1441,39 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
| 1322 | return total; | 1441 | return total; |
| 1323 | } | 1442 | } |
| 1324 | 1443 | ||
| 1325 | static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data) | ||
| 1326 | { | ||
| 1327 | int *val = (int *)data; | ||
| 1328 | int x; | ||
| 1329 | /* | ||
| 1330 | * Logically, we can stop scanning immediately when we find | ||
| 1331 | * a memcg is already locked. But condidering unlock ops and | ||
| 1332 | * creation/removal of memcg, scan-all is simple operation. | ||
| 1333 | */ | ||
| 1334 | x = atomic_inc_return(&mem->oom_lock); | ||
| 1335 | *val = max(x, *val); | ||
| 1336 | return 0; | ||
| 1337 | } | ||
| 1338 | /* | 1444 | /* |
| 1339 | * Check OOM-Killer is already running under our hierarchy. | 1445 | * Check OOM-Killer is already running under our hierarchy. |
| 1340 | * If someone is running, return false. | 1446 | * If someone is running, return false. |
| 1341 | */ | 1447 | */ |
| 1342 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) | 1448 | static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) |
| 1343 | { | 1449 | { |
| 1344 | int lock_count = 0; | 1450 | int x, lock_count = 0; |
| 1451 | struct mem_cgroup *iter; | ||
| 1345 | 1452 | ||
| 1346 | mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb); | 1453 | for_each_mem_cgroup_tree(iter, mem) { |
| 1454 | x = atomic_inc_return(&iter->oom_lock); | ||
| 1455 | lock_count = max(x, lock_count); | ||
| 1456 | } | ||
| 1347 | 1457 | ||
| 1348 | if (lock_count == 1) | 1458 | if (lock_count == 1) |
| 1349 | return true; | 1459 | return true; |
| 1350 | return false; | 1460 | return false; |
| 1351 | } | 1461 | } |
| 1352 | 1462 | ||
| 1353 | static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data) | 1463 | static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) |
| 1354 | { | 1464 | { |
| 1465 | struct mem_cgroup *iter; | ||
| 1466 | |||
| 1355 | /* | 1467 | /* |
| 1356 | * When a new child is created while the hierarchy is under oom, | 1468 | * When a new child is created while the hierarchy is under oom, |
| 1357 | * mem_cgroup_oom_lock() may not be called. We have to use | 1469 | * mem_cgroup_oom_lock() may not be called. We have to use |
| 1358 | * atomic_add_unless() here. | 1470 | * atomic_add_unless() here. |
| 1359 | */ | 1471 | */ |
| 1360 | atomic_add_unless(&mem->oom_lock, -1, 0); | 1472 | for_each_mem_cgroup_tree(iter, mem) |
| 1473 | atomic_add_unless(&iter->oom_lock, -1, 0); | ||
| 1361 | return 0; | 1474 | return 0; |
| 1362 | } | 1475 | } |
| 1363 | 1476 | ||
| 1364 | static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | ||
| 1365 | { | ||
| 1366 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb); | ||
| 1367 | } | ||
| 1368 | 1477 | ||
| 1369 | static DEFINE_MUTEX(memcg_oom_mutex); | 1478 | static DEFINE_MUTEX(memcg_oom_mutex); |
| 1370 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1479 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
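The OOM lock now walks the subtree inline: every group's oom_lock is atomically incremented, and the hierarchy counts as locked only if every increment returned 1, i.e. nobody in the subtree already held it; unlock uses atomic_add_unless() so a child created mid-OOM, which was never incremented, is not pushed below zero. A self-contained userspace model of that idea using C11 atomics over a flat array (the tree walk is replaced by a plain loop):

```c
#include <stdatomic.h>
#include <stdbool.h>

#define NR_NODES 4

static atomic_int oom_lock[NR_NODES];

/* Bump every node; the lock is ours only if every increment came back as 1,
 * the same max-of-inc_return test as in the hunk above. */
static bool try_oom_lock(void)
{
	int lock_count = 0;

	for (int i = 0; i < NR_NODES; i++) {
		int x = atomic_fetch_add(&oom_lock[i], 1) + 1; /* like atomic_inc_return() */
		if (x > lock_count)
			lock_count = x;
	}
	return lock_count == 1;
}

static void oom_unlock(void)
{
	for (int i = 0; i < NR_NODES; i++) {
		/* mirror atomic_add_unless(&iter->oom_lock, -1, 0): never drop
		 * below zero, since a node created during OOM was never locked */
		int v = atomic_load(&oom_lock[i]);
		while (v > 0 &&
		       !atomic_compare_exchange_weak(&oom_lock[i], &v, v - 1))
			;
	}
}
```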
| @@ -1462,34 +1571,73 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
| 1462 | /* | 1571 | /* |
| 1463 | * Currently used to update mapped file statistics, but the routine can be | 1572 | * Currently used to update mapped file statistics, but the routine can be |
| 1464 | * generalized to update other statistics as well. | 1573 | * generalized to update other statistics as well. |
| 1574 | * | ||
| 1575 | * Notes: Race condition | ||
| 1576 | * | ||
| 1577 | * We usually use page_cgroup_lock() for accessing page_cgroup members but | ||
| 1578 | * it tends to be costly. Under some conditions, however, we don't need | ||
| 1579 | * to do so _always_. | ||
| 1580 | * | ||
| 1581 | * Considering "charge", lock_page_cgroup() is not required because all | ||
| 1582 | * file-stat operations happen after a page is attached to the radix-tree. There | ||
| 1583 | * is no race with "charge". | ||
| 1584 | * | ||
| 1585 | * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup | ||
| 1586 | * at "uncharge" intentionally. So, we always see a valid pc->mem_cgroup even | ||
| 1587 | * if there is a race with "uncharge". The statistics themselves are properly handled | ||
| 1588 | * by flags. | ||
| 1589 | * | ||
| 1590 | * Considering "move", this is the only case where we see a race. To make the race | ||
| 1591 | * window small, we check the MEM_CGROUP_ON_MOVE percpu value and detect the | ||
| 1592 | * possibility of a race condition. If there is one, we take a lock. | ||
| 1465 | */ | 1593 | */ |
| 1466 | void mem_cgroup_update_file_mapped(struct page *page, int val) | 1594 | |
| 1595 | static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) | ||
| 1467 | { | 1596 | { |
| 1468 | struct mem_cgroup *mem; | 1597 | struct mem_cgroup *mem; |
| 1469 | struct page_cgroup *pc; | 1598 | struct page_cgroup *pc = lookup_page_cgroup(page); |
| 1599 | bool need_unlock = false; | ||
| 1470 | 1600 | ||
| 1471 | pc = lookup_page_cgroup(page); | ||
| 1472 | if (unlikely(!pc)) | 1601 | if (unlikely(!pc)) |
| 1473 | return; | 1602 | return; |
| 1474 | 1603 | ||
| 1475 | lock_page_cgroup(pc); | 1604 | rcu_read_lock(); |
| 1476 | mem = pc->mem_cgroup; | 1605 | mem = pc->mem_cgroup; |
| 1477 | if (!mem || !PageCgroupUsed(pc)) | 1606 | if (unlikely(!mem || !PageCgroupUsed(pc))) |
| 1478 | goto done; | 1607 | goto out; |
| 1608 | /* pc->mem_cgroup is unstable ? */ | ||
| 1609 | if (unlikely(mem_cgroup_stealed(mem))) { | ||
| 1610 | /* take a lock to safely access pc->mem_cgroup */ | ||
| 1611 | lock_page_cgroup(pc); | ||
| 1612 | need_unlock = true; | ||
| 1613 | mem = pc->mem_cgroup; | ||
| 1614 | if (!mem || !PageCgroupUsed(pc)) | ||
| 1615 | goto out; | ||
| 1616 | } | ||
| 1479 | 1617 | ||
| 1480 | /* | 1618 | this_cpu_add(mem->stat->count[idx], val); |
| 1481 | * Preemption is already disabled. We can use __this_cpu_xxx | 1619 | |
| 1482 | */ | 1620 | switch (idx) { |
| 1483 | if (val > 0) { | 1621 | case MEM_CGROUP_STAT_FILE_MAPPED: |
| 1484 | __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 1622 | if (val > 0) |
| 1485 | SetPageCgroupFileMapped(pc); | 1623 | SetPageCgroupFileMapped(pc); |
| 1486 | } else { | 1624 | else if (!page_mapped(page)) |
| 1487 | __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); | 1625 | ClearPageCgroupFileMapped(pc); |
| 1488 | ClearPageCgroupFileMapped(pc); | 1626 | break; |
| 1627 | default: | ||
| 1628 | BUG(); | ||
| 1489 | } | 1629 | } |
| 1490 | 1630 | ||
| 1491 | done: | 1631 | out: |
| 1492 | unlock_page_cgroup(pc); | 1632 | if (unlikely(need_unlock)) |
| 1633 | unlock_page_cgroup(pc); | ||
| 1634 | rcu_read_unlock(); | ||
| 1635 | return; | ||
| 1636 | } | ||
| 1637 | |||
| 1638 | void mem_cgroup_update_file_mapped(struct page *page, int val) | ||
| 1639 | { | ||
| 1640 | mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); | ||
| 1493 | } | 1641 | } |
| 1494 | 1642 | ||
| 1495 | /* | 1643 | /* |
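mem_cgroup_update_file_stat() now tries the lockless path first and only takes lock_page_cgroup() when mem_cgroup_stealed() says a move may be rewriting pc->mem_cgroup, re-reading the owner after the lock. A self-contained userspace model of that optimistic/fallback shape, with our own names (move_in_progress, account_mapped) and a pthread mutex standing in for the page_cgroup lock:

```c
#include <pthread.h>
#include <stdatomic.h>

/* move_in_progress models "summed MEM_CGROUP_ON_MOVE > 0", owner_lock models
 * lock_page_cgroup(), mapped_pages models the per-cpu statistic. */
static atomic_int move_in_progress;
static pthread_mutex_t owner_lock = PTHREAD_MUTEX_INITIALIZER;
static _Atomic long long mapped_pages;

static void account_mapped(int delta)
{
	int need_unlock = 0;

	if (atomic_load(&move_in_progress) > 0) {
		/* ownership may be rewritten concurrently: take the lock, and
		 * (in the kernel) re-read pc->mem_cgroup before trusting it */
		pthread_mutex_lock(&owner_lock);
		need_unlock = 1;
	}

	atomic_fetch_add(&mapped_pages, delta);	/* the kernel uses this_cpu_add() */

	if (need_unlock)
		pthread_mutex_unlock(&owner_lock);
}
```

The point of this shape is that the common case, with no move in flight, pays only a flag read, matching the kernel's this_cpu_read(...ON_MOVE) check under rcu_read_lock().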
| @@ -1605,15 +1753,55 @@ static void drain_all_stock_sync(void) | |||
| 1605 | atomic_dec(&memcg_drain_count); | 1753 | atomic_dec(&memcg_drain_count); |
| 1606 | } | 1754 | } |
| 1607 | 1755 | ||
| 1608 | static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb, | 1756 | /* |
| 1757 | * This function drains the percpu counter values from a DEAD cpu and | ||
| 1758 | * moves them to the local cpu. Note that this function can be preempted. | ||
| 1759 | */ | ||
| 1760 | static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) | ||
| 1761 | { | ||
| 1762 | int i; | ||
| 1763 | |||
| 1764 | spin_lock(&mem->pcp_counter_lock); | ||
| 1765 | for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { | ||
| 1766 | s64 x = per_cpu(mem->stat->count[i], cpu); | ||
| 1767 | |||
| 1768 | per_cpu(mem->stat->count[i], cpu) = 0; | ||
| 1769 | mem->nocpu_base.count[i] += x; | ||
| 1770 | } | ||
| 1771 | /* need to clear ON_MOVE value, works as a kind of lock. */ | ||
| 1772 | per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; | ||
| 1773 | spin_unlock(&mem->pcp_counter_lock); | ||
| 1774 | } | ||
| 1775 | |||
| 1776 | static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) | ||
| 1777 | { | ||
| 1778 | int idx = MEM_CGROUP_ON_MOVE; | ||
| 1779 | |||
| 1780 | spin_lock(&mem->pcp_counter_lock); | ||
| 1781 | per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; | ||
| 1782 | spin_unlock(&mem->pcp_counter_lock); | ||
| 1783 | } | ||
| 1784 | |||
| 1785 | static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, | ||
| 1609 | unsigned long action, | 1786 | unsigned long action, |
| 1610 | void *hcpu) | 1787 | void *hcpu) |
| 1611 | { | 1788 | { |
| 1612 | int cpu = (unsigned long)hcpu; | 1789 | int cpu = (unsigned long)hcpu; |
| 1613 | struct memcg_stock_pcp *stock; | 1790 | struct memcg_stock_pcp *stock; |
| 1791 | struct mem_cgroup *iter; | ||
| 1792 | |||
| 1793 | if (action == CPU_ONLINE) { | ||
| 1794 | for_each_mem_cgroup_all(iter) | ||
| 1795 | synchronize_mem_cgroup_on_move(iter, cpu); | ||
| 1796 | return NOTIFY_OK; | ||
| 1797 | } | ||
| 1614 | 1798 | ||
| 1615 | if (action != CPU_DEAD) | 1799 | if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) |
| 1616 | return NOTIFY_OK; | 1800 | return NOTIFY_OK; |
| 1801 | |||
| 1802 | for_each_mem_cgroup_all(iter) | ||
| 1803 | mem_cgroup_drain_pcp_counter(iter, cpu); | ||
| 1804 | |||
| 1617 | stock = &per_cpu(memcg_stock, cpu); | 1805 | stock = &per_cpu(memcg_stock, cpu); |
| 1618 | drain_stock(stock); | 1806 | drain_stock(stock); |
| 1619 | return NOTIFY_OK; | 1807 | return NOTIFY_OK; |
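The hotplug callback now does two things besides draining the per-cpu charge stock: on CPU_ONLINE it seeds the new CPU's ON_MOVE slot from nocpu_base, so an in-flight move stays visible there, and on CPU_DEAD it folds the dead CPU's data counters into nocpu_base. Continuing the userspace stat_model sketch from the read-side hunk above, the CPU_DEAD half might look like this (names are ours):

```c
/* Fold the dead CPU's data counters into the offline bucket so that later
 * exact reads (model_read_stat() above) still account for them. */
static void model_drain_dead_cpu(struct stat_model *m, int cpu)
{
	int idx;

	pthread_mutex_lock(&m->lock);
	for (idx = 0; idx < NR_STATS_MODEL; idx++) {
		m->offline_base[idx] += m->counters[cpu][idx];
		m->counters[cpu][idx] = 0;
	}
	pthread_mutex_unlock(&m->lock);

	m->online[cpu] = 0;	/* reads stop visiting this CPU's slots */
}
```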
| @@ -3038,6 +3226,7 @@ move_account: | |||
| 3038 | lru_add_drain_all(); | 3226 | lru_add_drain_all(); |
| 3039 | drain_all_stock_sync(); | 3227 | drain_all_stock_sync(); |
| 3040 | ret = 0; | 3228 | ret = 0; |
| 3229 | mem_cgroup_start_move(mem); | ||
| 3041 | for_each_node_state(node, N_HIGH_MEMORY) { | 3230 | for_each_node_state(node, N_HIGH_MEMORY) { |
| 3042 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { | 3231 | for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { |
| 3043 | enum lru_list l; | 3232 | enum lru_list l; |
| @@ -3051,6 +3240,7 @@ move_account: | |||
| 3051 | if (ret) | 3240 | if (ret) |
| 3052 | break; | 3241 | break; |
| 3053 | } | 3242 | } |
| 3243 | mem_cgroup_end_move(mem); | ||
| 3054 | memcg_oom_recover(mem); | 3244 | memcg_oom_recover(mem); |
| 3055 | /* it seems parent cgroup doesn't have enough mem */ | 3245 | /* it seems parent cgroup doesn't have enough mem */ |
| 3056 | if (ret == -ENOMEM) | 3246 | if (ret == -ENOMEM) |
| @@ -3137,33 +3327,25 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 3137 | return retval; | 3327 | return retval; |
| 3138 | } | 3328 | } |
| 3139 | 3329 | ||
| 3140 | struct mem_cgroup_idx_data { | ||
| 3141 | s64 val; | ||
| 3142 | enum mem_cgroup_stat_index idx; | ||
| 3143 | }; | ||
| 3144 | 3330 | ||
| 3145 | static int | 3331 | static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, |
| 3146 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | 3332 | enum mem_cgroup_stat_index idx) |
| 3147 | { | 3333 | { |
| 3148 | struct mem_cgroup_idx_data *d = data; | 3334 | struct mem_cgroup *iter; |
| 3149 | d->val += mem_cgroup_read_stat(mem, d->idx); | 3335 | s64 val = 0; |
| 3150 | return 0; | ||
| 3151 | } | ||
| 3152 | 3336 | ||
| 3153 | static void | 3337 | /* each per-cpu value can be negative, so use s64 */ |
| 3154 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | 3338 | for_each_mem_cgroup_tree(iter, mem) |
| 3155 | enum mem_cgroup_stat_index idx, s64 *val) | 3339 | val += mem_cgroup_read_stat(iter, idx); |
| 3156 | { | 3340 | |
| 3157 | struct mem_cgroup_idx_data d; | 3341 | if (val < 0) /* race ? */ |
| 3158 | d.idx = idx; | 3342 | val = 0; |
| 3159 | d.val = 0; | 3343 | return val; |
| 3160 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
| 3161 | *val = d.val; | ||
| 3162 | } | 3344 | } |
| 3163 | 3345 | ||
| 3164 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | 3346 | static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) |
| 3165 | { | 3347 | { |
| 3166 | u64 idx_val, val; | 3348 | u64 val; |
| 3167 | 3349 | ||
| 3168 | if (!mem_cgroup_is_root(mem)) { | 3350 | if (!mem_cgroup_is_root(mem)) { |
| 3169 | if (!swap) | 3351 | if (!swap) |
| @@ -3172,16 +3354,12 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) | |||
| 3172 | return res_counter_read_u64(&mem->memsw, RES_USAGE); | 3354 | return res_counter_read_u64(&mem->memsw, RES_USAGE); |
| 3173 | } | 3355 | } |
| 3174 | 3356 | ||
| 3175 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE, &idx_val); | 3357 | val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); |
| 3176 | val = idx_val; | 3358 | val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); |
| 3177 | mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 3178 | val += idx_val; | ||
| 3179 | 3359 | ||
| 3180 | if (swap) { | 3360 | if (swap) |
| 3181 | mem_cgroup_get_recursive_idx_stat(mem, | 3361 | val += mem_cgroup_get_recursive_idx_stat(mem, |
| 3182 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | 3362 | MEM_CGROUP_STAT_SWAPOUT); |
| 3183 | val += idx_val; | ||
| 3184 | } | ||
| 3185 | 3363 | ||
| 3186 | return val << PAGE_SHIFT; | 3364 | return val << PAGE_SHIFT; |
| 3187 | } | 3365 | } |
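For the root cgroup, the usage file is now computed by summing CACHE + RSS (plus SWAPOUT when reporting memsw) across the whole tree with the new iterator, clamping a transiently negative sum to zero, and shifting by PAGE_SHIFT to turn pages into bytes. A small illustrative helper showing just that arithmetic; the 4 KiB page size and clamping the combined sum (rather than each per-index sum, as the kernel does) are assumptions of this sketch:

```c
#define MODEL_PAGE_SHIFT 12	/* assume 4 KiB pages for the example */

static unsigned long long usage_bytes(long long cache_pages, long long rss_pages,
				      long long swap_pages, int include_swap)
{
	long long pages = cache_pages + rss_pages;

	if (include_swap)
		pages += swap_pages;
	if (pages < 0)		/* racy per-cpu reads can briefly undershoot */
		pages = 0;

	return (unsigned long long)pages << MODEL_PAGE_SHIFT;
}
```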
| @@ -3389,9 +3567,9 @@ struct { | |||
| 3389 | }; | 3567 | }; |
| 3390 | 3568 | ||
| 3391 | 3569 | ||
| 3392 | static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | 3570 | static void |
| 3571 | mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | ||
| 3393 | { | 3572 | { |
| 3394 | struct mcs_total_stat *s = data; | ||
| 3395 | s64 val; | 3573 | s64 val; |
| 3396 | 3574 | ||
| 3397 | /* per cpu stat */ | 3575 | /* per cpu stat */ |
| @@ -3421,13 +3599,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
| 3421 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; | 3599 | s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; |
| 3422 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); | 3600 | val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); |
| 3423 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; | 3601 | s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; |
| 3424 | return 0; | ||
| 3425 | } | 3602 | } |
| 3426 | 3603 | ||
| 3427 | static void | 3604 | static void |
| 3428 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) | 3605 | mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) |
| 3429 | { | 3606 | { |
| 3430 | mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); | 3607 | struct mem_cgroup *iter; |
| 3608 | |||
| 3609 | for_each_mem_cgroup_tree(iter, mem) | ||
| 3610 | mem_cgroup_get_local_stat(iter, s); | ||
| 3431 | } | 3611 | } |
| 3432 | 3612 | ||
| 3433 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | 3613 | static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, |
| @@ -3604,7 +3784,7 @@ static int compare_thresholds(const void *a, const void *b) | |||
| 3604 | return _a->threshold - _b->threshold; | 3784 | return _a->threshold - _b->threshold; |
| 3605 | } | 3785 | } |
| 3606 | 3786 | ||
| 3607 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) | 3787 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) |
| 3608 | { | 3788 | { |
| 3609 | struct mem_cgroup_eventfd_list *ev; | 3789 | struct mem_cgroup_eventfd_list *ev; |
| 3610 | 3790 | ||
| @@ -3615,7 +3795,10 @@ static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) | |||
| 3615 | 3795 | ||
| 3616 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | 3796 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) |
| 3617 | { | 3797 | { |
| 3618 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | 3798 | struct mem_cgroup *iter; |
| 3799 | |||
| 3800 | for_each_mem_cgroup_tree(iter, mem) | ||
| 3801 | mem_cgroup_oom_notify_cb(iter); | ||
| 3619 | } | 3802 | } |
| 3620 | 3803 | ||
| 3621 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | 3804 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, |
| @@ -4032,6 +4215,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) | |||
| 4032 | vfree(mem); | 4215 | vfree(mem); |
| 4033 | mem = NULL; | 4216 | mem = NULL; |
| 4034 | } | 4217 | } |
| 4218 | spin_lock_init(&mem->pcp_counter_lock); | ||
| 4035 | return mem; | 4219 | return mem; |
| 4036 | } | 4220 | } |
| 4037 | 4221 | ||
| @@ -4158,7 +4342,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 4158 | &per_cpu(memcg_stock, cpu); | 4342 | &per_cpu(memcg_stock, cpu); |
| 4159 | INIT_WORK(&stock->work, drain_local_stock); | 4343 | INIT_WORK(&stock->work, drain_local_stock); |
| 4160 | } | 4344 | } |
| 4161 | hotcpu_notifier(memcg_stock_cpu_callback, 0); | 4345 | hotcpu_notifier(memcg_cpu_hotplug_callback, 0); |
| 4162 | } else { | 4346 | } else { |
| 4163 | parent = mem_cgroup_from_cont(cont->parent); | 4347 | parent = mem_cgroup_from_cont(cont->parent); |
| 4164 | mem->use_hierarchy = parent->use_hierarchy; | 4348 | mem->use_hierarchy = parent->use_hierarchy; |
| @@ -4513,6 +4697,7 @@ static void mem_cgroup_clear_mc(void) | |||
| 4513 | mc.to = NULL; | 4697 | mc.to = NULL; |
| 4514 | mc.moving_task = NULL; | 4698 | mc.moving_task = NULL; |
| 4515 | spin_unlock(&mc.lock); | 4699 | spin_unlock(&mc.lock); |
| 4700 | mem_cgroup_end_move(from); | ||
| 4516 | memcg_oom_recover(from); | 4701 | memcg_oom_recover(from); |
| 4517 | memcg_oom_recover(to); | 4702 | memcg_oom_recover(to); |
| 4518 | wake_up_all(&mc.waitq); | 4703 | wake_up_all(&mc.waitq); |
| @@ -4543,6 +4728,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, | |||
| 4543 | VM_BUG_ON(mc.moved_charge); | 4728 | VM_BUG_ON(mc.moved_charge); |
| 4544 | VM_BUG_ON(mc.moved_swap); | 4729 | VM_BUG_ON(mc.moved_swap); |
| 4545 | VM_BUG_ON(mc.moving_task); | 4730 | VM_BUG_ON(mc.moving_task); |
| 4731 | mem_cgroup_start_move(from); | ||
| 4546 | spin_lock(&mc.lock); | 4732 | spin_lock(&mc.lock); |
| 4547 | mc.from = from; | 4733 | mc.from = from; |
| 4548 | mc.to = mem; | 4734 | mc.to = mem; |
