author     Ingo Molnar <mingo@elte.hu>   2009-10-01 05:20:33 -0400
committer  Ingo Molnar <mingo@elte.hu>   2009-10-01 05:20:48 -0400
commit     0aa73ba1c4e1ad1d51a29e0df95ccd9f746918b6 (patch)
tree       f0714ddcd02812b4fbe3b5405df9e4068f5587e2 /mm/memcontrol.c
parent     925936ebf35a95c290e010b784c962164e6728f3 (diff)
parent     33974093c024f08caadd2fc71a83bd811ed1831d (diff)
Merge branch 'tracing/urgent' into tracing/core
Merge reason: Pick up latest fixes and update to latest upstream.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--   mm/memcontrol.c   739
1 file changed, 654 insertions(+), 85 deletions(-)
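
The bulk of the memcontrol.c change below adds soft limit support: a new memory.soft_limit_in_bytes control file (backed by RES_SOFT_LIMIT) and a per-node, per-zone RB-tree of cgroups exceeding their soft limit, which mem_cgroup_soft_limit_reclaim() walks from global reclaim. As a rough illustration of how the new file might be driven from userspace (this sketch is not part of the patch; the cgroup mount point and group name are assumptions), a soft limit could be set like this:

```c
/*
 * Illustrative sketch only, not part of the patch: set a memcg soft
 * limit by writing to the new memory.soft_limit_in_bytes file.
 * The mount point "/cgroups/memory" and group "groupA" are assumptions.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path =
		"/cgroups/memory/groupA/memory.soft_limit_in_bytes";
	const char *limit = "268435456";	/* 256M, written as plain bytes */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, limit, strlen(limit)) < 0)
		perror("write");
	close(fd);
	return 0;
}
```

Since the write handler parses the value through res_counter_memparse_write_strategy(), suffixed strings such as "256M" should also be accepted; the hunks around mem_cgroup_write() further show that soft limits are refused for the mem+swap counter and that hard limits can no longer be set on the root cgroup.
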
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index fd4529d86de5..e2b98a6875c0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
| @@ -29,6 +29,7 @@ | |||
| 29 | #include <linux/rcupdate.h> | 29 | #include <linux/rcupdate.h> |
| 30 | #include <linux/limits.h> | 30 | #include <linux/limits.h> |
| 31 | #include <linux/mutex.h> | 31 | #include <linux/mutex.h> |
| 32 | #include <linux/rbtree.h> | ||
| 32 | #include <linux/slab.h> | 33 | #include <linux/slab.h> |
| 33 | #include <linux/swap.h> | 34 | #include <linux/swap.h> |
| 34 | #include <linux/spinlock.h> | 35 | #include <linux/spinlock.h> |
| @@ -43,6 +44,7 @@ | |||
| 43 | 44 | ||
| 44 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; | 45 | struct cgroup_subsys mem_cgroup_subsys __read_mostly; |
| 45 | #define MEM_CGROUP_RECLAIM_RETRIES 5 | 46 | #define MEM_CGROUP_RECLAIM_RETRIES 5 |
| 47 | struct mem_cgroup *root_mem_cgroup __read_mostly; | ||
| 46 | 48 | ||
| 47 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 49 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
| 48 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ | 50 | /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ |
| @@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ | |||
| 53 | #endif | 55 | #endif |
| 54 | 56 | ||
| 55 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ | 57 | static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ |
| 58 | #define SOFTLIMIT_EVENTS_THRESH (1000) | ||
| 56 | 59 | ||
| 57 | /* | 60 | /* |
| 58 | * Statistics for memory cgroup. | 61 | * Statistics for memory cgroup. |
| @@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { | |||
| 66 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ | 69 | MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ |
| 67 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ | 70 | MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ |
| 68 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ | 71 | MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ |
| 72 | MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ | ||
| 73 | MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ | ||
| 69 | 74 | ||
| 70 | MEM_CGROUP_STAT_NSTATS, | 75 | MEM_CGROUP_STAT_NSTATS, |
| 71 | }; | 76 | }; |
| @@ -78,6 +83,20 @@ struct mem_cgroup_stat { | |||
| 78 | struct mem_cgroup_stat_cpu cpustat[0]; | 83 | struct mem_cgroup_stat_cpu cpustat[0]; |
| 79 | }; | 84 | }; |
| 80 | 85 | ||
| 86 | static inline void | ||
| 87 | __mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, | ||
| 88 | enum mem_cgroup_stat_index idx) | ||
| 89 | { | ||
| 90 | stat->count[idx] = 0; | ||
| 91 | } | ||
| 92 | |||
| 93 | static inline s64 | ||
| 94 | __mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, | ||
| 95 | enum mem_cgroup_stat_index idx) | ||
| 96 | { | ||
| 97 | return stat->count[idx]; | ||
| 98 | } | ||
| 99 | |||
| 81 | /* | 100 | /* |
| 82 | * For accounting under irq disable, no need for increment preempt count. | 101 | * For accounting under irq disable, no need for increment preempt count. |
| 83 | */ | 102 | */ |
| @@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { | |||
| 117 | unsigned long count[NR_LRU_LISTS]; | 136 | unsigned long count[NR_LRU_LISTS]; |
| 118 | 137 | ||
| 119 | struct zone_reclaim_stat reclaim_stat; | 138 | struct zone_reclaim_stat reclaim_stat; |
| 139 | struct rb_node tree_node; /* RB tree node */ | ||
| 140 | unsigned long long usage_in_excess;/* Set to the value by which */ | ||
| 141 | /* the soft limit is exceeded*/ | ||
| 142 | bool on_tree; | ||
| 143 | struct mem_cgroup *mem; /* Back pointer, we cannot */ | ||
| 144 | /* use container_of */ | ||
| 120 | }; | 145 | }; |
| 121 | /* Macro for accessing counter */ | 146 | /* Macro for accessing counter */ |
| 122 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) | 147 | #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) |
| @@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { | |||
| 130 | }; | 155 | }; |
| 131 | 156 | ||
| 132 | /* | 157 | /* |
| 158 | * Cgroups above their limits are maintained in a RB-Tree, independent of | ||
| 159 | * their hierarchy representation | ||
| 160 | */ | ||
| 161 | |||
| 162 | struct mem_cgroup_tree_per_zone { | ||
| 163 | struct rb_root rb_root; | ||
| 164 | spinlock_t lock; | ||
| 165 | }; | ||
| 166 | |||
| 167 | struct mem_cgroup_tree_per_node { | ||
| 168 | struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; | ||
| 169 | }; | ||
| 170 | |||
| 171 | struct mem_cgroup_tree { | ||
| 172 | struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; | ||
| 173 | }; | ||
| 174 | |||
| 175 | static struct mem_cgroup_tree soft_limit_tree __read_mostly; | ||
| 176 | |||
| 177 | /* | ||
| 133 | * The memory controller data structure. The memory controller controls both | 178 | * The memory controller data structure. The memory controller controls both |
| 134 | * page cache and RSS per cgroup. We would eventually like to provide | 179 | * page cache and RSS per cgroup. We would eventually like to provide |
| 135 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, | 180 | * statistics based on the statistics developed by Rik Van Riel for clock-pro, |
| @@ -186,6 +231,13 @@ struct mem_cgroup { | |||
| 186 | struct mem_cgroup_stat stat; | 231 | struct mem_cgroup_stat stat; |
| 187 | }; | 232 | }; |
| 188 | 233 | ||
| 234 | /* | ||
| 235 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | ||
| 236 | * limit reclaim to prevent infinite loops, if they ever occur. | ||
| 237 | */ | ||
| 238 | #define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) | ||
| 239 | #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) | ||
| 240 | |||
| 189 | enum charge_type { | 241 | enum charge_type { |
| 190 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, | 242 | MEM_CGROUP_CHARGE_TYPE_CACHE = 0, |
| 191 | MEM_CGROUP_CHARGE_TYPE_MAPPED, | 243 | MEM_CGROUP_CHARGE_TYPE_MAPPED, |
| @@ -200,13 +252,8 @@ enum charge_type { | |||
| 200 | #define PCGF_CACHE (1UL << PCG_CACHE) | 252 | #define PCGF_CACHE (1UL << PCG_CACHE) |
| 201 | #define PCGF_USED (1UL << PCG_USED) | 253 | #define PCGF_USED (1UL << PCG_USED) |
| 202 | #define PCGF_LOCK (1UL << PCG_LOCK) | 254 | #define PCGF_LOCK (1UL << PCG_LOCK) |
| 203 | static const unsigned long | 255 | /* Not used, but added here for completeness */ |
| 204 | pcg_default_flags[NR_CHARGE_TYPE] = { | 256 | #define PCGF_ACCT (1UL << PCG_ACCT) |
| 205 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ | ||
| 206 | PCGF_USED | PCGF_LOCK, /* Anon */ | ||
| 207 | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ | ||
| 208 | 0, /* FORCE */ | ||
| 209 | }; | ||
| 210 | 257 | ||
| 211 | /* for encoding cft->private value on file */ | 258 | /* for encoding cft->private value on file */ |
| 212 | #define _MEM (0) | 259 | #define _MEM (0) |
| @@ -215,15 +262,241 @@ pcg_default_flags[NR_CHARGE_TYPE] = { | |||
| 215 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 262 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
| 216 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 263 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
| 217 | 264 | ||
| 265 | /* | ||
| 266 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | ||
| 267 | */ | ||
| 268 | #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 | ||
| 269 | #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) | ||
| 270 | #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 | ||
| 271 | #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) | ||
| 272 | #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 | ||
| 273 | #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) | ||
| 274 | |||
| 218 | static void mem_cgroup_get(struct mem_cgroup *mem); | 275 | static void mem_cgroup_get(struct mem_cgroup *mem); |
| 219 | static void mem_cgroup_put(struct mem_cgroup *mem); | 276 | static void mem_cgroup_put(struct mem_cgroup *mem); |
| 220 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); | 277 | static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); |
| 221 | 278 | ||
| 279 | static struct mem_cgroup_per_zone * | ||
| 280 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
| 281 | { | ||
| 282 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
| 283 | } | ||
| 284 | |||
| 285 | static struct mem_cgroup_per_zone * | ||
| 286 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
| 287 | { | ||
| 288 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
| 289 | int nid = page_cgroup_nid(pc); | ||
| 290 | int zid = page_cgroup_zid(pc); | ||
| 291 | |||
| 292 | if (!mem) | ||
| 293 | return NULL; | ||
| 294 | |||
| 295 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 296 | } | ||
| 297 | |||
| 298 | static struct mem_cgroup_tree_per_zone * | ||
| 299 | soft_limit_tree_node_zone(int nid, int zid) | ||
| 300 | { | ||
| 301 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 302 | } | ||
| 303 | |||
| 304 | static struct mem_cgroup_tree_per_zone * | ||
| 305 | soft_limit_tree_from_page(struct page *page) | ||
| 306 | { | ||
| 307 | int nid = page_to_nid(page); | ||
| 308 | int zid = page_zonenum(page); | ||
| 309 | |||
| 310 | return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; | ||
| 311 | } | ||
| 312 | |||
| 313 | static void | ||
| 314 | __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
| 315 | struct mem_cgroup_per_zone *mz, | ||
| 316 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 317 | { | ||
| 318 | struct rb_node **p = &mctz->rb_root.rb_node; | ||
| 319 | struct rb_node *parent = NULL; | ||
| 320 | struct mem_cgroup_per_zone *mz_node; | ||
| 321 | |||
| 322 | if (mz->on_tree) | ||
| 323 | return; | ||
| 324 | |||
| 325 | mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
| 326 | while (*p) { | ||
| 327 | parent = *p; | ||
| 328 | mz_node = rb_entry(parent, struct mem_cgroup_per_zone, | ||
| 329 | tree_node); | ||
| 330 | if (mz->usage_in_excess < mz_node->usage_in_excess) | ||
| 331 | p = &(*p)->rb_left; | ||
| 332 | /* | ||
| 333 | * We can't avoid mem cgroups that are over their soft | ||
| 334 | * limit by the same amount | ||
| 335 | */ | ||
| 336 | else if (mz->usage_in_excess >= mz_node->usage_in_excess) | ||
| 337 | p = &(*p)->rb_right; | ||
| 338 | } | ||
| 339 | rb_link_node(&mz->tree_node, parent, p); | ||
| 340 | rb_insert_color(&mz->tree_node, &mctz->rb_root); | ||
| 341 | mz->on_tree = true; | ||
| 342 | } | ||
| 343 | |||
| 344 | static void | ||
| 345 | __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
| 346 | struct mem_cgroup_per_zone *mz, | ||
| 347 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 348 | { | ||
| 349 | if (!mz->on_tree) | ||
| 350 | return; | ||
| 351 | rb_erase(&mz->tree_node, &mctz->rb_root); | ||
| 352 | mz->on_tree = false; | ||
| 353 | } | ||
| 354 | |||
| 355 | static void | ||
| 356 | mem_cgroup_insert_exceeded(struct mem_cgroup *mem, | ||
| 357 | struct mem_cgroup_per_zone *mz, | ||
| 358 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 359 | { | ||
| 360 | spin_lock(&mctz->lock); | ||
| 361 | __mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
| 362 | spin_unlock(&mctz->lock); | ||
| 363 | } | ||
| 364 | |||
| 365 | static void | ||
| 366 | mem_cgroup_remove_exceeded(struct mem_cgroup *mem, | ||
| 367 | struct mem_cgroup_per_zone *mz, | ||
| 368 | struct mem_cgroup_tree_per_zone *mctz) | ||
| 369 | { | ||
| 370 | spin_lock(&mctz->lock); | ||
| 371 | __mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 372 | spin_unlock(&mctz->lock); | ||
| 373 | } | ||
| 374 | |||
| 375 | static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) | ||
| 376 | { | ||
| 377 | bool ret = false; | ||
| 378 | int cpu; | ||
| 379 | s64 val; | ||
| 380 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 381 | |||
| 382 | cpu = get_cpu(); | ||
| 383 | cpustat = &mem->stat.cpustat[cpu]; | ||
| 384 | val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 385 | if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { | ||
| 386 | __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); | ||
| 387 | ret = true; | ||
| 388 | } | ||
| 389 | put_cpu(); | ||
| 390 | return ret; | ||
| 391 | } | ||
| 392 | |||
| 393 | static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) | ||
| 394 | { | ||
| 395 | unsigned long long prev_usage_in_excess, new_usage_in_excess; | ||
| 396 | bool updated_tree = false; | ||
| 397 | struct mem_cgroup_per_zone *mz; | ||
| 398 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 399 | |||
| 400 | mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page)); | ||
| 401 | mctz = soft_limit_tree_from_page(page); | ||
| 402 | |||
| 403 | /* | ||
| 404 | * We do updates in lazy mode, mem's are removed | ||
| 405 | * lazily from the per-zone, per-node rb tree | ||
| 406 | */ | ||
| 407 | prev_usage_in_excess = mz->usage_in_excess; | ||
| 408 | |||
| 409 | new_usage_in_excess = res_counter_soft_limit_excess(&mem->res); | ||
| 410 | if (prev_usage_in_excess) { | ||
| 411 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 412 | updated_tree = true; | ||
| 413 | } | ||
| 414 | if (!new_usage_in_excess) | ||
| 415 | goto done; | ||
| 416 | mem_cgroup_insert_exceeded(mem, mz, mctz); | ||
| 417 | |||
| 418 | done: | ||
| 419 | if (updated_tree) { | ||
| 420 | spin_lock(&mctz->lock); | ||
| 421 | mz->usage_in_excess = new_usage_in_excess; | ||
| 422 | spin_unlock(&mctz->lock); | ||
| 423 | } | ||
| 424 | } | ||
| 425 | |||
| 426 | static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) | ||
| 427 | { | ||
| 428 | int node, zone; | ||
| 429 | struct mem_cgroup_per_zone *mz; | ||
| 430 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 431 | |||
| 432 | for_each_node_state(node, N_POSSIBLE) { | ||
| 433 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 434 | mz = mem_cgroup_zoneinfo(mem, node, zone); | ||
| 435 | mctz = soft_limit_tree_node_zone(node, zone); | ||
| 436 | mem_cgroup_remove_exceeded(mem, mz, mctz); | ||
| 437 | } | ||
| 438 | } | ||
| 439 | } | ||
| 440 | |||
| 441 | static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) | ||
| 442 | { | ||
| 443 | return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; | ||
| 444 | } | ||
| 445 | |||
| 446 | static struct mem_cgroup_per_zone * | ||
| 447 | __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 448 | { | ||
| 449 | struct rb_node *rightmost = NULL; | ||
| 450 | struct mem_cgroup_per_zone *mz = NULL; | ||
| 451 | |||
| 452 | retry: | ||
| 453 | rightmost = rb_last(&mctz->rb_root); | ||
| 454 | if (!rightmost) | ||
| 455 | goto done; /* Nothing to reclaim from */ | ||
| 456 | |||
| 457 | mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); | ||
| 458 | /* | ||
| 459 | * Remove the node now but someone else can add it back, | ||
| 460 | * we will add it back at the end of reclaim to its correct | ||
| 461 | * position in the tree. | ||
| 462 | */ | ||
| 463 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
| 464 | if (!res_counter_soft_limit_excess(&mz->mem->res) || | ||
| 465 | !css_tryget(&mz->mem->css)) | ||
| 466 | goto retry; | ||
| 467 | done: | ||
| 468 | return mz; | ||
| 469 | } | ||
| 470 | |||
| 471 | static struct mem_cgroup_per_zone * | ||
| 472 | mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) | ||
| 473 | { | ||
| 474 | struct mem_cgroup_per_zone *mz; | ||
| 475 | |||
| 476 | spin_lock(&mctz->lock); | ||
| 477 | mz = __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 478 | spin_unlock(&mctz->lock); | ||
| 479 | return mz; | ||
| 480 | } | ||
| 481 | |||
| 482 | static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, | ||
| 483 | bool charge) | ||
| 484 | { | ||
| 485 | int val = (charge) ? 1 : -1; | ||
| 486 | struct mem_cgroup_stat *stat = &mem->stat; | ||
| 487 | struct mem_cgroup_stat_cpu *cpustat; | ||
| 488 | int cpu = get_cpu(); | ||
| 489 | |||
| 490 | cpustat = &stat->cpustat[cpu]; | ||
| 491 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); | ||
| 492 | put_cpu(); | ||
| 493 | } | ||
| 494 | |||
| 222 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | 495 | static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, |
| 223 | struct page_cgroup *pc, | 496 | struct page_cgroup *pc, |
| 224 | bool charge) | 497 | bool charge) |
| 225 | { | 498 | { |
| 226 | int val = (charge)? 1 : -1; | 499 | int val = (charge) ? 1 : -1; |
| 227 | struct mem_cgroup_stat *stat = &mem->stat; | 500 | struct mem_cgroup_stat *stat = &mem->stat; |
| 228 | struct mem_cgroup_stat_cpu *cpustat; | 501 | struct mem_cgroup_stat_cpu *cpustat; |
| 229 | int cpu = get_cpu(); | 502 | int cpu = get_cpu(); |
| @@ -240,28 +513,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, | |||
| 240 | else | 513 | else |
| 241 | __mem_cgroup_stat_add_safe(cpustat, | 514 | __mem_cgroup_stat_add_safe(cpustat, |
| 242 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); | 515 | MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); |
| 516 | __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); | ||
| 243 | put_cpu(); | 517 | put_cpu(); |
| 244 | } | 518 | } |
| 245 | 519 | ||
| 246 | static struct mem_cgroup_per_zone * | ||
| 247 | mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) | ||
| 248 | { | ||
| 249 | return &mem->info.nodeinfo[nid]->zoneinfo[zid]; | ||
| 250 | } | ||
| 251 | |||
| 252 | static struct mem_cgroup_per_zone * | ||
| 253 | page_cgroup_zoneinfo(struct page_cgroup *pc) | ||
| 254 | { | ||
| 255 | struct mem_cgroup *mem = pc->mem_cgroup; | ||
| 256 | int nid = page_cgroup_nid(pc); | ||
| 257 | int zid = page_cgroup_zid(pc); | ||
| 258 | |||
| 259 | if (!mem) | ||
| 260 | return NULL; | ||
| 261 | |||
| 262 | return mem_cgroup_zoneinfo(mem, nid, zid); | ||
| 263 | } | ||
| 264 | |||
| 265 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, | 520 | static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, |
| 266 | enum lru_list idx) | 521 | enum lru_list idx) |
| 267 | { | 522 | { |
| @@ -354,6 +609,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
| 354 | return ret; | 609 | return ret; |
| 355 | } | 610 | } |
| 356 | 611 | ||
| 612 | static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) | ||
| 613 | { | ||
| 614 | return (mem == root_mem_cgroup); | ||
| 615 | } | ||
| 616 | |||
| 357 | /* | 617 | /* |
| 358 | * Following LRU functions are allowed to be used without PCG_LOCK. | 618 | * Following LRU functions are allowed to be used without PCG_LOCK. |
| 359 | * Operations are called by routine of global LRU independently from memcg. | 619 | * Operations are called by routine of global LRU independently from memcg. |
| @@ -371,22 +631,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, | |||
| 371 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) | 631 | void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) |
| 372 | { | 632 | { |
| 373 | struct page_cgroup *pc; | 633 | struct page_cgroup *pc; |
| 374 | struct mem_cgroup *mem; | ||
| 375 | struct mem_cgroup_per_zone *mz; | 634 | struct mem_cgroup_per_zone *mz; |
| 376 | 635 | ||
| 377 | if (mem_cgroup_disabled()) | 636 | if (mem_cgroup_disabled()) |
| 378 | return; | 637 | return; |
| 379 | pc = lookup_page_cgroup(page); | 638 | pc = lookup_page_cgroup(page); |
| 380 | /* can happen while we handle swapcache. */ | 639 | /* can happen while we handle swapcache. */ |
| 381 | if (list_empty(&pc->lru) || !pc->mem_cgroup) | 640 | if (!TestClearPageCgroupAcctLRU(pc)) |
| 382 | return; | 641 | return; |
| 642 | VM_BUG_ON(!pc->mem_cgroup); | ||
| 383 | /* | 643 | /* |
| 384 | * We don't check PCG_USED bit. It's cleared when the "page" is finally | 644 | * We don't check PCG_USED bit. It's cleared when the "page" is finally |
| 385 | * removed from global LRU. | 645 | * removed from global LRU. |
| 386 | */ | 646 | */ |
| 387 | mz = page_cgroup_zoneinfo(pc); | 647 | mz = page_cgroup_zoneinfo(pc); |
| 388 | mem = pc->mem_cgroup; | ||
| 389 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; | 648 | MEM_CGROUP_ZSTAT(mz, lru) -= 1; |
| 649 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
| 650 | return; | ||
| 651 | VM_BUG_ON(list_empty(&pc->lru)); | ||
| 390 | list_del_init(&pc->lru); | 652 | list_del_init(&pc->lru); |
| 391 | return; | 653 | return; |
| 392 | } | 654 | } |
| @@ -410,8 +672,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) | |||
| 410 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 672 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
| 411 | */ | 673 | */ |
| 412 | smp_rmb(); | 674 | smp_rmb(); |
| 413 | /* unused page is not rotated. */ | 675 | /* unused or root page is not rotated. */ |
| 414 | if (!PageCgroupUsed(pc)) | 676 | if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) |
| 415 | return; | 677 | return; |
| 416 | mz = page_cgroup_zoneinfo(pc); | 678 | mz = page_cgroup_zoneinfo(pc); |
| 417 | list_move(&pc->lru, &mz->lists[lru]); | 679 | list_move(&pc->lru, &mz->lists[lru]); |
| @@ -425,6 +687,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
| 425 | if (mem_cgroup_disabled()) | 687 | if (mem_cgroup_disabled()) |
| 426 | return; | 688 | return; |
| 427 | pc = lookup_page_cgroup(page); | 689 | pc = lookup_page_cgroup(page); |
| 690 | VM_BUG_ON(PageCgroupAcctLRU(pc)); | ||
| 428 | /* | 691 | /* |
| 429 | * Used bit is set without atomic ops but after smp_wmb(). | 692 | * Used bit is set without atomic ops but after smp_wmb(). |
| 430 | * For making pc->mem_cgroup visible, insert smp_rmb() here. | 693 | * For making pc->mem_cgroup visible, insert smp_rmb() here. |
| @@ -435,6 +698,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) | |||
| 435 | 698 | ||
| 436 | mz = page_cgroup_zoneinfo(pc); | 699 | mz = page_cgroup_zoneinfo(pc); |
| 437 | MEM_CGROUP_ZSTAT(mz, lru) += 1; | 700 | MEM_CGROUP_ZSTAT(mz, lru) += 1; |
| 701 | SetPageCgroupAcctLRU(pc); | ||
| 702 | if (mem_cgroup_is_root(pc->mem_cgroup)) | ||
| 703 | return; | ||
| 438 | list_add(&pc->lru, &mz->lists[lru]); | 704 | list_add(&pc->lru, &mz->lists[lru]); |
| 439 | } | 705 | } |
| 440 | 706 | ||
| @@ -469,7 +735,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) | |||
| 469 | 735 | ||
| 470 | spin_lock_irqsave(&zone->lru_lock, flags); | 736 | spin_lock_irqsave(&zone->lru_lock, flags); |
| 471 | /* link when the page is linked to LRU but page_cgroup isn't */ | 737 | /* link when the page is linked to LRU but page_cgroup isn't */ |
| 472 | if (PageLRU(page) && list_empty(&pc->lru)) | 738 | if (PageLRU(page) && !PageCgroupAcctLRU(pc)) |
| 473 | mem_cgroup_add_lru_list(page, page_lru(page)); | 739 | mem_cgroup_add_lru_list(page, page_lru(page)); |
| 474 | spin_unlock_irqrestore(&zone->lru_lock, flags); | 740 | spin_unlock_irqrestore(&zone->lru_lock, flags); |
| 475 | } | 741 | } |
| @@ -648,7 +914,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, | |||
| 648 | int nid = z->zone_pgdat->node_id; | 914 | int nid = z->zone_pgdat->node_id; |
| 649 | int zid = zone_idx(z); | 915 | int zid = zone_idx(z); |
| 650 | struct mem_cgroup_per_zone *mz; | 916 | struct mem_cgroup_per_zone *mz; |
| 651 | int lru = LRU_FILE * !!file + !!active; | 917 | int lru = LRU_FILE * file + active; |
| 652 | int ret; | 918 | int ret; |
| 653 | 919 | ||
| 654 | BUG_ON(!mem_cont); | 920 | BUG_ON(!mem_cont); |
| @@ -855,28 +1121,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) | |||
| 855 | * If shrink==true, for avoiding to free too much, this returns immedieately. | 1121 | * If shrink==true, for avoiding to free too much, this returns immedieately. |
| 856 | */ | 1122 | */ |
| 857 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | 1123 | static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, |
| 858 | gfp_t gfp_mask, bool noswap, bool shrink) | 1124 | struct zone *zone, |
| 1125 | gfp_t gfp_mask, | ||
| 1126 | unsigned long reclaim_options) | ||
| 859 | { | 1127 | { |
| 860 | struct mem_cgroup *victim; | 1128 | struct mem_cgroup *victim; |
| 861 | int ret, total = 0; | 1129 | int ret, total = 0; |
| 862 | int loop = 0; | 1130 | int loop = 0; |
| 1131 | bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; | ||
| 1132 | bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; | ||
| 1133 | bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; | ||
| 1134 | unsigned long excess = mem_cgroup_get_excess(root_mem); | ||
| 863 | 1135 | ||
| 864 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ | 1136 | /* If memsw_is_minimum==1, swap-out is of-no-use. */ |
| 865 | if (root_mem->memsw_is_minimum) | 1137 | if (root_mem->memsw_is_minimum) |
| 866 | noswap = true; | 1138 | noswap = true; |
| 867 | 1139 | ||
| 868 | while (loop < 2) { | 1140 | while (1) { |
| 869 | victim = mem_cgroup_select_victim(root_mem); | 1141 | victim = mem_cgroup_select_victim(root_mem); |
| 870 | if (victim == root_mem) | 1142 | if (victim == root_mem) { |
| 871 | loop++; | 1143 | loop++; |
| 1144 | if (loop >= 2) { | ||
| 1145 | /* | ||
| 1146 | * If we have not been able to reclaim | ||
| 1147 | * anything, it might be because there are | ||
| 1148 | * no reclaimable pages under this hierarchy | ||
| 1149 | */ | ||
| 1150 | if (!check_soft || !total) { | ||
| 1151 | css_put(&victim->css); | ||
| 1152 | break; | ||
| 1153 | } | ||
| 1154 | /* | ||
| 1155 | * We want to do more targeted reclaim. | ||
| 1156 | * excess >> 2 is not too large, so we do not | ||
| 1157 | * reclaim too much, nor too small, so we do not keep | ||
| 1158 | * coming back to reclaim from this cgroup | ||
| 1159 | */ | ||
| 1160 | if (total >= (excess >> 2) || | ||
| 1161 | (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { | ||
| 1162 | css_put(&victim->css); | ||
| 1163 | break; | ||
| 1164 | } | ||
| 1165 | } | ||
| 1166 | } | ||
| 872 | if (!mem_cgroup_local_usage(&victim->stat)) { | 1167 | if (!mem_cgroup_local_usage(&victim->stat)) { |
| 873 | /* this cgroup's local usage == 0 */ | 1168 | /* this cgroup's local usage == 0 */ |
| 874 | css_put(&victim->css); | 1169 | css_put(&victim->css); |
| 875 | continue; | 1170 | continue; |
| 876 | } | 1171 | } |
| 877 | /* we use swappiness of local cgroup */ | 1172 | /* we use swappiness of local cgroup */ |
| 878 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, | 1173 | if (check_soft) |
| 879 | get_swappiness(victim)); | 1174 | ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, |
| 1175 | noswap, get_swappiness(victim), zone, | ||
| 1176 | zone->zone_pgdat->node_id); | ||
| 1177 | else | ||
| 1178 | ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, | ||
| 1179 | noswap, get_swappiness(victim)); | ||
| 880 | css_put(&victim->css); | 1180 | css_put(&victim->css); |
| 881 | /* | 1181 | /* |
| 882 | * At shrinking usage, we can't check we should stop here or | 1182 | * At shrinking usage, we can't check we should stop here or |
| @@ -886,7 +1186,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, | |||
| 886 | if (shrink) | 1186 | if (shrink) |
| 887 | return ret; | 1187 | return ret; |
| 888 | total += ret; | 1188 | total += ret; |
| 889 | if (mem_cgroup_check_under_limit(root_mem)) | 1189 | if (check_soft) { |
| 1190 | if (res_counter_check_under_soft_limit(&root_mem->res)) | ||
| 1191 | return total; | ||
| 1192 | } else if (mem_cgroup_check_under_limit(root_mem)) | ||
| 890 | return 1 + total; | 1193 | return 1 + total; |
| 891 | } | 1194 | } |
| 892 | return total; | 1195 | return total; |
| @@ -965,11 +1268,11 @@ done: | |||
| 965 | */ | 1268 | */ |
| 966 | static int __mem_cgroup_try_charge(struct mm_struct *mm, | 1269 | static int __mem_cgroup_try_charge(struct mm_struct *mm, |
| 967 | gfp_t gfp_mask, struct mem_cgroup **memcg, | 1270 | gfp_t gfp_mask, struct mem_cgroup **memcg, |
| 968 | bool oom) | 1271 | bool oom, struct page *page) |
| 969 | { | 1272 | { |
| 970 | struct mem_cgroup *mem, *mem_over_limit; | 1273 | struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit; |
| 971 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; | 1274 | int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; |
| 972 | struct res_counter *fail_res; | 1275 | struct res_counter *fail_res, *soft_fail_res = NULL; |
| 973 | 1276 | ||
| 974 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { | 1277 | if (unlikely(test_thread_flag(TIF_MEMDIE))) { |
| 975 | /* Don't account this! */ | 1278 | /* Don't account this! */ |
| @@ -996,20 +1299,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 996 | VM_BUG_ON(css_is_removed(&mem->css)); | 1299 | VM_BUG_ON(css_is_removed(&mem->css)); |
| 997 | 1300 | ||
| 998 | while (1) { | 1301 | while (1) { |
| 999 | int ret; | 1302 | int ret = 0; |
| 1000 | bool noswap = false; | 1303 | unsigned long flags = 0; |
| 1001 | 1304 | ||
| 1002 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); | 1305 | if (mem_cgroup_is_root(mem)) |
| 1306 | goto done; | ||
| 1307 | ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res, | ||
| 1308 | &soft_fail_res); | ||
| 1003 | if (likely(!ret)) { | 1309 | if (likely(!ret)) { |
| 1004 | if (!do_swap_account) | 1310 | if (!do_swap_account) |
| 1005 | break; | 1311 | break; |
| 1006 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, | 1312 | ret = res_counter_charge(&mem->memsw, PAGE_SIZE, |
| 1007 | &fail_res); | 1313 | &fail_res, NULL); |
| 1008 | if (likely(!ret)) | 1314 | if (likely(!ret)) |
| 1009 | break; | 1315 | break; |
| 1010 | /* mem+swap counter fails */ | 1316 | /* mem+swap counter fails */ |
| 1011 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1317 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
| 1012 | noswap = true; | 1318 | flags |= MEM_CGROUP_RECLAIM_NOSWAP; |
| 1013 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, | 1319 | mem_over_limit = mem_cgroup_from_res_counter(fail_res, |
| 1014 | memsw); | 1320 | memsw); |
| 1015 | } else | 1321 | } else |
| @@ -1020,8 +1326,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1020 | if (!(gfp_mask & __GFP_WAIT)) | 1326 | if (!(gfp_mask & __GFP_WAIT)) |
| 1021 | goto nomem; | 1327 | goto nomem; |
| 1022 | 1328 | ||
| 1023 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, | 1329 | ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, |
| 1024 | noswap, false); | 1330 | gfp_mask, flags); |
| 1025 | if (ret) | 1331 | if (ret) |
| 1026 | continue; | 1332 | continue; |
| 1027 | 1333 | ||
| @@ -1046,13 +1352,24 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 1046 | goto nomem; | 1352 | goto nomem; |
| 1047 | } | 1353 | } |
| 1048 | } | 1354 | } |
| 1355 | /* | ||
| 1356 | * Insert just the ancestor, we should trickle down to the correct | ||
| 1357 | * cgroup for reclaim, since the other nodes will be below their | ||
| 1358 | * soft limit | ||
| 1359 | */ | ||
| 1360 | if (soft_fail_res) { | ||
| 1361 | mem_over_soft_limit = | ||
| 1362 | mem_cgroup_from_res_counter(soft_fail_res, res); | ||
| 1363 | if (mem_cgroup_soft_limit_check(mem_over_soft_limit)) | ||
| 1364 | mem_cgroup_update_tree(mem_over_soft_limit, page); | ||
| 1365 | } | ||
| 1366 | done: | ||
| 1049 | return 0; | 1367 | return 0; |
| 1050 | nomem: | 1368 | nomem: |
| 1051 | css_put(&mem->css); | 1369 | css_put(&mem->css); |
| 1052 | return -ENOMEM; | 1370 | return -ENOMEM; |
| 1053 | } | 1371 | } |
| 1054 | 1372 | ||
| 1055 | |||
| 1056 | /* | 1373 | /* |
| 1057 | * A helper function to get mem_cgroup from ID. must be called under | 1374 | * A helper function to get mem_cgroup from ID. must be called under |
| 1058 | * rcu_read_lock(). The caller must check css_is_removed() or some if | 1375 | * rcu_read_lock(). The caller must check css_is_removed() or some if |
| @@ -1119,15 +1436,38 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, | |||
| 1119 | lock_page_cgroup(pc); | 1436 | lock_page_cgroup(pc); |
| 1120 | if (unlikely(PageCgroupUsed(pc))) { | 1437 | if (unlikely(PageCgroupUsed(pc))) { |
| 1121 | unlock_page_cgroup(pc); | 1438 | unlock_page_cgroup(pc); |
| 1122 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1439 | if (!mem_cgroup_is_root(mem)) { |
| 1123 | if (do_swap_account) | 1440 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
| 1124 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1441 | if (do_swap_account) |
| 1442 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, | ||
| 1443 | NULL); | ||
| 1444 | } | ||
| 1125 | css_put(&mem->css); | 1445 | css_put(&mem->css); |
| 1126 | return; | 1446 | return; |
| 1127 | } | 1447 | } |
| 1448 | |||
| 1128 | pc->mem_cgroup = mem; | 1449 | pc->mem_cgroup = mem; |
| 1450 | /* | ||
| 1451 | * We access a page_cgroup asynchronously without lock_page_cgroup(). | ||
| 1452 | * Especially when a page_cgroup is taken from a page, pc->mem_cgroup | ||
| 1453 | * is accessed after testing USED bit. To make pc->mem_cgroup visible | ||
| 1454 | * before USED bit, we need memory barrier here. | ||
| 1455 | * See mem_cgroup_add_lru_list(), etc. | ||
| 1456 | */ | ||
| 1129 | smp_wmb(); | 1457 | smp_wmb(); |
| 1130 | pc->flags = pcg_default_flags[ctype]; | 1458 | switch (ctype) { |
| 1459 | case MEM_CGROUP_CHARGE_TYPE_CACHE: | ||
| 1460 | case MEM_CGROUP_CHARGE_TYPE_SHMEM: | ||
| 1461 | SetPageCgroupCache(pc); | ||
| 1462 | SetPageCgroupUsed(pc); | ||
| 1463 | break; | ||
| 1464 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | ||
| 1465 | ClearPageCgroupCache(pc); | ||
| 1466 | SetPageCgroupUsed(pc); | ||
| 1467 | break; | ||
| 1468 | default: | ||
| 1469 | break; | ||
| 1470 | } | ||
| 1131 | 1471 | ||
| 1132 | mem_cgroup_charge_statistics(mem, pc, true); | 1472 | mem_cgroup_charge_statistics(mem, pc, true); |
| 1133 | 1473 | ||
| @@ -1178,7 +1518,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1178 | if (pc->mem_cgroup != from) | 1518 | if (pc->mem_cgroup != from) |
| 1179 | goto out; | 1519 | goto out; |
| 1180 | 1520 | ||
| 1181 | res_counter_uncharge(&from->res, PAGE_SIZE); | 1521 | if (!mem_cgroup_is_root(from)) |
| 1522 | res_counter_uncharge(&from->res, PAGE_SIZE, NULL); | ||
| 1182 | mem_cgroup_charge_statistics(from, pc, false); | 1523 | mem_cgroup_charge_statistics(from, pc, false); |
| 1183 | 1524 | ||
| 1184 | page = pc->page; | 1525 | page = pc->page; |
| @@ -1197,8 +1538,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, | |||
| 1197 | 1); | 1538 | 1); |
| 1198 | } | 1539 | } |
| 1199 | 1540 | ||
| 1200 | if (do_swap_account) | 1541 | if (do_swap_account && !mem_cgroup_is_root(from)) |
| 1201 | res_counter_uncharge(&from->memsw, PAGE_SIZE); | 1542 | res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL); |
| 1202 | css_put(&from->css); | 1543 | css_put(&from->css); |
| 1203 | 1544 | ||
| 1204 | css_get(&to->css); | 1545 | css_get(&to->css); |
| @@ -1238,7 +1579,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, | |||
| 1238 | parent = mem_cgroup_from_cont(pcg); | 1579 | parent = mem_cgroup_from_cont(pcg); |
| 1239 | 1580 | ||
| 1240 | 1581 | ||
| 1241 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); | 1582 | ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); |
| 1242 | if (ret || !parent) | 1583 | if (ret || !parent) |
| 1243 | return ret; | 1584 | return ret; |
| 1244 | 1585 | ||
| @@ -1268,9 +1609,11 @@ uncharge: | |||
| 1268 | /* drop extra refcnt by try_charge() */ | 1609 | /* drop extra refcnt by try_charge() */ |
| 1269 | css_put(&parent->css); | 1610 | css_put(&parent->css); |
| 1270 | /* uncharge if move fails */ | 1611 | /* uncharge if move fails */ |
| 1271 | res_counter_uncharge(&parent->res, PAGE_SIZE); | 1612 | if (!mem_cgroup_is_root(parent)) { |
| 1272 | if (do_swap_account) | 1613 | res_counter_uncharge(&parent->res, PAGE_SIZE, NULL); |
| 1273 | res_counter_uncharge(&parent->memsw, PAGE_SIZE); | 1614 | if (do_swap_account) |
| 1615 | res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL); | ||
| 1616 | } | ||
| 1274 | return ret; | 1617 | return ret; |
| 1275 | } | 1618 | } |
| 1276 | 1619 | ||
| @@ -1295,7 +1638,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, | |||
| 1295 | prefetchw(pc); | 1638 | prefetchw(pc); |
| 1296 | 1639 | ||
| 1297 | mem = memcg; | 1640 | mem = memcg; |
| 1298 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); | 1641 | ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); |
| 1299 | if (ret || !mem) | 1642 | if (ret || !mem) |
| 1300 | return ret; | 1643 | return ret; |
| 1301 | 1644 | ||
| @@ -1414,14 +1757,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, | |||
| 1414 | if (!mem) | 1757 | if (!mem) |
| 1415 | goto charge_cur_mm; | 1758 | goto charge_cur_mm; |
| 1416 | *ptr = mem; | 1759 | *ptr = mem; |
| 1417 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); | 1760 | ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); |
| 1418 | /* drop extra refcnt from tryget */ | 1761 | /* drop extra refcnt from tryget */ |
| 1419 | css_put(&mem->css); | 1762 | css_put(&mem->css); |
| 1420 | return ret; | 1763 | return ret; |
| 1421 | charge_cur_mm: | 1764 | charge_cur_mm: |
| 1422 | if (unlikely(!mm)) | 1765 | if (unlikely(!mm)) |
| 1423 | mm = &init_mm; | 1766 | mm = &init_mm; |
| 1424 | return __mem_cgroup_try_charge(mm, mask, ptr, true); | 1767 | return __mem_cgroup_try_charge(mm, mask, ptr, true, page); |
| 1425 | } | 1768 | } |
| 1426 | 1769 | ||
| 1427 | static void | 1770 | static void |
| @@ -1459,7 +1802,10 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, | |||
| 1459 | * This recorded memcg can be obsolete one. So, avoid | 1802 | * This recorded memcg can be obsolete one. So, avoid |
| 1460 | * calling css_tryget | 1803 | * calling css_tryget |
| 1461 | */ | 1804 | */ |
| 1462 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1805 | if (!mem_cgroup_is_root(memcg)) |
| 1806 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, | ||
| 1807 | NULL); | ||
| 1808 | mem_cgroup_swap_statistics(memcg, false); | ||
| 1463 | mem_cgroup_put(memcg); | 1809 | mem_cgroup_put(memcg); |
| 1464 | } | 1810 | } |
| 1465 | rcu_read_unlock(); | 1811 | rcu_read_unlock(); |
| @@ -1484,9 +1830,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) | |||
| 1484 | return; | 1830 | return; |
| 1485 | if (!mem) | 1831 | if (!mem) |
| 1486 | return; | 1832 | return; |
| 1487 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1833 | if (!mem_cgroup_is_root(mem)) { |
| 1488 | if (do_swap_account) | 1834 | res_counter_uncharge(&mem->res, PAGE_SIZE, NULL); |
| 1489 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1835 | if (do_swap_account) |
| 1836 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
| 1837 | } | ||
| 1490 | css_put(&mem->css); | 1838 | css_put(&mem->css); |
| 1491 | } | 1839 | } |
| 1492 | 1840 | ||
| @@ -1500,6 +1848,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1500 | struct page_cgroup *pc; | 1848 | struct page_cgroup *pc; |
| 1501 | struct mem_cgroup *mem = NULL; | 1849 | struct mem_cgroup *mem = NULL; |
| 1502 | struct mem_cgroup_per_zone *mz; | 1850 | struct mem_cgroup_per_zone *mz; |
| 1851 | bool soft_limit_excess = false; | ||
| 1503 | 1852 | ||
| 1504 | if (mem_cgroup_disabled()) | 1853 | if (mem_cgroup_disabled()) |
| 1505 | return NULL; | 1854 | return NULL; |
| @@ -1538,9 +1887,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1538 | break; | 1887 | break; |
| 1539 | } | 1888 | } |
| 1540 | 1889 | ||
| 1541 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 1890 | if (!mem_cgroup_is_root(mem)) { |
| 1542 | if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | 1891 | res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess); |
| 1543 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 1892 | if (do_swap_account && |
| 1893 | (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) | ||
| 1894 | res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL); | ||
| 1895 | } | ||
| 1896 | if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | ||
| 1897 | mem_cgroup_swap_statistics(mem, true); | ||
| 1544 | mem_cgroup_charge_statistics(mem, pc, false); | 1898 | mem_cgroup_charge_statistics(mem, pc, false); |
| 1545 | 1899 | ||
| 1546 | ClearPageCgroupUsed(pc); | 1900 | ClearPageCgroupUsed(pc); |
| @@ -1554,6 +1908,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
| 1554 | mz = page_cgroup_zoneinfo(pc); | 1908 | mz = page_cgroup_zoneinfo(pc); |
| 1555 | unlock_page_cgroup(pc); | 1909 | unlock_page_cgroup(pc); |
| 1556 | 1910 | ||
| 1911 | if (soft_limit_excess && mem_cgroup_soft_limit_check(mem)) | ||
| 1912 | mem_cgroup_update_tree(mem, page); | ||
| 1557 | /* at swapout, this memcg will be accessed to record to swap */ | 1913 | /* at swapout, this memcg will be accessed to record to swap */ |
| 1558 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 1914 | if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
| 1559 | css_put(&mem->css); | 1915 | css_put(&mem->css); |
| @@ -1629,7 +1985,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) | |||
| 1629 | * We uncharge this because swap is freed. | 1985 | * We uncharge this because swap is freed. |
| 1630 | * This memcg can be obsolete one. We avoid calling css_tryget | 1986 | * This memcg can be obsolete one. We avoid calling css_tryget |
| 1631 | */ | 1987 | */ |
| 1632 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE); | 1988 | if (!mem_cgroup_is_root(memcg)) |
| 1989 | res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL); | ||
| 1990 | mem_cgroup_swap_statistics(memcg, false); | ||
| 1633 | mem_cgroup_put(memcg); | 1991 | mem_cgroup_put(memcg); |
| 1634 | } | 1992 | } |
| 1635 | rcu_read_unlock(); | 1993 | rcu_read_unlock(); |
| @@ -1658,7 +2016,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
| 1658 | unlock_page_cgroup(pc); | 2016 | unlock_page_cgroup(pc); |
| 1659 | 2017 | ||
| 1660 | if (mem) { | 2018 | if (mem) { |
| 1661 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); | 2019 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, |
| 2020 | page); | ||
| 1662 | css_put(&mem->css); | 2021 | css_put(&mem->css); |
| 1663 | } | 2022 | } |
| 1664 | *ptr = mem; | 2023 | *ptr = mem; |
| @@ -1798,8 +2157,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
| 1798 | if (!ret) | 2157 | if (!ret) |
| 1799 | break; | 2158 | break; |
| 1800 | 2159 | ||
| 1801 | progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, | 2160 | progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, |
| 1802 | false, true); | 2161 | GFP_KERNEL, |
| 2162 | MEM_CGROUP_RECLAIM_SHRINK); | ||
| 1803 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2163 | curusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
| 1804 | /* Usage is reduced ? */ | 2164 | /* Usage is reduced ? */ |
| 1805 | if (curusage >= oldusage) | 2165 | if (curusage >= oldusage) |
| @@ -1851,7 +2211,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 1851 | if (!ret) | 2211 | if (!ret) |
| 1852 | break; | 2212 | break; |
| 1853 | 2213 | ||
| 1854 | mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); | 2214 | mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, |
| 2215 | MEM_CGROUP_RECLAIM_NOSWAP | | ||
| 2216 | MEM_CGROUP_RECLAIM_SHRINK); | ||
| 1855 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); | 2217 | curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); |
| 1856 | /* Usage is reduced ? */ | 2218 | /* Usage is reduced ? */ |
| 1857 | if (curusage >= oldusage) | 2219 | if (curusage >= oldusage) |
| @@ -1862,6 +2224,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
| 1862 | return ret; | 2224 | return ret; |
| 1863 | } | 2225 | } |
| 1864 | 2226 | ||
| 2227 | unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, | ||
| 2228 | gfp_t gfp_mask, int nid, | ||
| 2229 | int zid) | ||
| 2230 | { | ||
| 2231 | unsigned long nr_reclaimed = 0; | ||
| 2232 | struct mem_cgroup_per_zone *mz, *next_mz = NULL; | ||
| 2233 | unsigned long reclaimed; | ||
| 2234 | int loop = 0; | ||
| 2235 | struct mem_cgroup_tree_per_zone *mctz; | ||
| 2236 | |||
| 2237 | if (order > 0) | ||
| 2238 | return 0; | ||
| 2239 | |||
| 2240 | mctz = soft_limit_tree_node_zone(nid, zid); | ||
| 2241 | /* | ||
| 2242 | * This loop can run for a while, especially if mem_cgroups continuously | ||
| 2243 | * keep exceeding their soft limit and putting the system under | ||
| 2244 | * pressure | ||
| 2245 | */ | ||
| 2246 | do { | ||
| 2247 | if (next_mz) | ||
| 2248 | mz = next_mz; | ||
| 2249 | else | ||
| 2250 | mz = mem_cgroup_largest_soft_limit_node(mctz); | ||
| 2251 | if (!mz) | ||
| 2252 | break; | ||
| 2253 | |||
| 2254 | reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, | ||
| 2255 | gfp_mask, | ||
| 2256 | MEM_CGROUP_RECLAIM_SOFT); | ||
| 2257 | nr_reclaimed += reclaimed; | ||
| 2258 | spin_lock(&mctz->lock); | ||
| 2259 | |||
| 2260 | /* | ||
| 2261 | * If we failed to reclaim anything from this memory cgroup | ||
| 2262 | * it is time to move on to the next cgroup | ||
| 2263 | */ | ||
| 2264 | next_mz = NULL; | ||
| 2265 | if (!reclaimed) { | ||
| 2266 | do { | ||
| 2267 | /* | ||
| 2268 | * Loop until we find yet another one. | ||
| 2269 | * | ||
| 2270 | * By the time we get the soft_limit lock | ||
| 2271 | * again, someone might have added the | ||
| 2272 | * group back on the RB tree. Iterate to | ||
| 2273 | * make sure we get a different mem. | ||
| 2274 | * mem_cgroup_largest_soft_limit_node returns | ||
| 2275 | * NULL if no other cgroup is present on | ||
| 2276 | * the tree | ||
| 2277 | */ | ||
| 2278 | next_mz = | ||
| 2279 | __mem_cgroup_largest_soft_limit_node(mctz); | ||
| 2280 | if (next_mz == mz) { | ||
| 2281 | css_put(&next_mz->mem->css); | ||
| 2282 | next_mz = NULL; | ||
| 2283 | } else /* next_mz == NULL or other memcg */ | ||
| 2284 | break; | ||
| 2285 | } while (1); | ||
| 2286 | } | ||
| 2287 | mz->usage_in_excess = | ||
| 2288 | res_counter_soft_limit_excess(&mz->mem->res); | ||
| 2289 | __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); | ||
| 2290 | /* | ||
| 2291 | * One school of thought says that we should not add | ||
| 2292 | * back the node to the tree if reclaim returns 0. | ||
| 2293 | * But our reclaim could return 0 simply because, due | ||
| 2294 | * to priority, we are exposing a smaller subset of | ||
| 2295 | * memory to reclaim from. Consider this as a longer | ||
| 2296 | * term TODO. | ||
| 2297 | */ | ||
| 2298 | if (mz->usage_in_excess) | ||
| 2299 | __mem_cgroup_insert_exceeded(mz->mem, mz, mctz); | ||
| 2300 | spin_unlock(&mctz->lock); | ||
| 2301 | css_put(&mz->mem->css); | ||
| 2302 | loop++; | ||
| 2303 | /* | ||
| 2304 | * Could not reclaim anything and there are no more | ||
| 2305 | * mem cgroups to try or we seem to be looping without | ||
| 2306 | * reclaiming anything. | ||
| 2307 | */ | ||
| 2308 | if (!nr_reclaimed && | ||
| 2309 | (next_mz == NULL || | ||
| 2310 | loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) | ||
| 2311 | break; | ||
| 2312 | } while (!nr_reclaimed); | ||
| 2313 | if (next_mz) | ||
| 2314 | css_put(&next_mz->mem->css); | ||
| 2315 | return nr_reclaimed; | ||
| 2316 | } | ||
| 2317 | |||
| 1865 | /* | 2318 | /* |
| 1866 | * This routine traverse page_cgroup in given list and drop them all. | 2319 | * This routine traverse page_cgroup in given list and drop them all. |
| 1867 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. | 2320 | * *And* this routine doesn't reclaim page itself, just removes page_cgroup. |
| @@ -2046,20 +2499,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, | |||
| 2046 | return retval; | 2499 | return retval; |
| 2047 | } | 2500 | } |
| 2048 | 2501 | ||
| 2502 | struct mem_cgroup_idx_data { | ||
| 2503 | s64 val; | ||
| 2504 | enum mem_cgroup_stat_index idx; | ||
| 2505 | }; | ||
| 2506 | |||
| 2507 | static int | ||
| 2508 | mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) | ||
| 2509 | { | ||
| 2510 | struct mem_cgroup_idx_data *d = data; | ||
| 2511 | d->val += mem_cgroup_read_stat(&mem->stat, d->idx); | ||
| 2512 | return 0; | ||
| 2513 | } | ||
| 2514 | |||
| 2515 | static void | ||
| 2516 | mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, | ||
| 2517 | enum mem_cgroup_stat_index idx, s64 *val) | ||
| 2518 | { | ||
| 2519 | struct mem_cgroup_idx_data d; | ||
| 2520 | d.idx = idx; | ||
| 2521 | d.val = 0; | ||
| 2522 | mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); | ||
| 2523 | *val = d.val; | ||
| 2524 | } | ||
| 2525 | |||
| 2049 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) | 2526 | static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) |
| 2050 | { | 2527 | { |
| 2051 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); | 2528 | struct mem_cgroup *mem = mem_cgroup_from_cont(cont); |
| 2052 | u64 val = 0; | 2529 | u64 idx_val, val; |
| 2053 | int type, name; | 2530 | int type, name; |
| 2054 | 2531 | ||
| 2055 | type = MEMFILE_TYPE(cft->private); | 2532 | type = MEMFILE_TYPE(cft->private); |
| 2056 | name = MEMFILE_ATTR(cft->private); | 2533 | name = MEMFILE_ATTR(cft->private); |
| 2057 | switch (type) { | 2534 | switch (type) { |
| 2058 | case _MEM: | 2535 | case _MEM: |
| 2059 | val = res_counter_read_u64(&mem->res, name); | 2536 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
| 2537 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2538 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
| 2539 | val = idx_val; | ||
| 2540 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2541 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2542 | val += idx_val; | ||
| 2543 | val <<= PAGE_SHIFT; | ||
| 2544 | } else | ||
| 2545 | val = res_counter_read_u64(&mem->res, name); | ||
| 2060 | break; | 2546 | break; |
| 2061 | case _MEMSWAP: | 2547 | case _MEMSWAP: |
| 2062 | val = res_counter_read_u64(&mem->memsw, name); | 2548 | if (name == RES_USAGE && mem_cgroup_is_root(mem)) { |
| 2549 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2550 | MEM_CGROUP_STAT_CACHE, &idx_val); | ||
| 2551 | val = idx_val; | ||
| 2552 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2553 | MEM_CGROUP_STAT_RSS, &idx_val); | ||
| 2554 | val += idx_val; | ||
| 2555 | mem_cgroup_get_recursive_idx_stat(mem, | ||
| 2556 | MEM_CGROUP_STAT_SWAPOUT, &idx_val); | ||
| 2557 | val <<= PAGE_SHIFT; | ||
| 2558 | } else | ||
| 2559 | val = res_counter_read_u64(&mem->memsw, name); | ||
| 2063 | break; | 2560 | break; |
| 2064 | default: | 2561 | default: |
| 2065 | BUG(); | 2562 | BUG(); |
| @@ -2083,6 +2580,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
| 2083 | name = MEMFILE_ATTR(cft->private); | 2580 | name = MEMFILE_ATTR(cft->private); |
| 2084 | switch (name) { | 2581 | switch (name) { |
| 2085 | case RES_LIMIT: | 2582 | case RES_LIMIT: |
| 2583 | if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ | ||
| 2584 | ret = -EINVAL; | ||
| 2585 | break; | ||
| 2586 | } | ||
| 2086 | /* This function does all necessary parse...reuse it */ | 2587 | /* This function does all necessary parse...reuse it */ |
| 2087 | ret = res_counter_memparse_write_strategy(buffer, &val); | 2588 | ret = res_counter_memparse_write_strategy(buffer, &val); |
| 2088 | if (ret) | 2589 | if (ret) |
| @@ -2092,6 +2593,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, | |||
| 2092 | else | 2593 | else |
| 2093 | ret = mem_cgroup_resize_memsw_limit(memcg, val); | 2594 | ret = mem_cgroup_resize_memsw_limit(memcg, val); |
| 2094 | break; | 2595 | break; |
| 2596 | case RES_SOFT_LIMIT: | ||
| 2597 | ret = res_counter_memparse_write_strategy(buffer, &val); | ||
| 2598 | if (ret) | ||
| 2599 | break; | ||
| 2600 | /* | ||
| 2601 | * For memsw, soft limits are hard to implement in terms | ||
| 2602 | * of semantics; for now, we support soft limits for | ||
| 2603 | * control without swap | ||
| 2604 | */ | ||
| 2605 | if (type == _MEM) | ||
| 2606 | ret = res_counter_set_soft_limit(&memcg->res, val); | ||
| 2607 | else | ||
| 2608 | ret = -EINVAL; | ||
| 2609 | break; | ||
| 2095 | default: | 2610 | default: |
| 2096 | ret = -EINVAL; /* should be BUG() ? */ | 2611 | ret = -EINVAL; /* should be BUG() ? */ |
| 2097 | break; | 2612 | break; |
| @@ -2149,6 +2664,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) | |||
| 2149 | res_counter_reset_failcnt(&mem->memsw); | 2664 | res_counter_reset_failcnt(&mem->memsw); |
| 2150 | break; | 2665 | break; |
| 2151 | } | 2666 | } |
| 2667 | |||
| 2152 | return 0; | 2668 | return 0; |
| 2153 | } | 2669 | } |
| 2154 | 2670 | ||
| @@ -2160,6 +2676,7 @@ enum { | |||
| 2160 | MCS_MAPPED_FILE, | 2676 | MCS_MAPPED_FILE, |
| 2161 | MCS_PGPGIN, | 2677 | MCS_PGPGIN, |
| 2162 | MCS_PGPGOUT, | 2678 | MCS_PGPGOUT, |
| 2679 | MCS_SWAP, | ||
| 2163 | MCS_INACTIVE_ANON, | 2680 | MCS_INACTIVE_ANON, |
| 2164 | MCS_ACTIVE_ANON, | 2681 | MCS_ACTIVE_ANON, |
| 2165 | MCS_INACTIVE_FILE, | 2682 | MCS_INACTIVE_FILE, |
| @@ -2181,6 +2698,7 @@ struct { | |||
| 2181 | {"mapped_file", "total_mapped_file"}, | 2698 | {"mapped_file", "total_mapped_file"}, |
| 2182 | {"pgpgin", "total_pgpgin"}, | 2699 | {"pgpgin", "total_pgpgin"}, |
| 2183 | {"pgpgout", "total_pgpgout"}, | 2700 | {"pgpgout", "total_pgpgout"}, |
| 2701 | {"swap", "total_swap"}, | ||
| 2184 | {"inactive_anon", "total_inactive_anon"}, | 2702 | {"inactive_anon", "total_inactive_anon"}, |
| 2185 | {"active_anon", "total_active_anon"}, | 2703 | {"active_anon", "total_active_anon"}, |
| 2186 | {"inactive_file", "total_inactive_file"}, | 2704 | {"inactive_file", "total_inactive_file"}, |
| @@ -2205,6 +2723,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) | |||
| 2205 | s->stat[MCS_PGPGIN] += val; | 2723 | s->stat[MCS_PGPGIN] += val; |
| 2206 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); | 2724 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); |
| 2207 | s->stat[MCS_PGPGOUT] += val; | 2725 | s->stat[MCS_PGPGOUT] += val; |
| 2726 | if (do_swap_account) { | ||
| 2727 | val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); | ||
| 2728 | s->stat[MCS_SWAP] += val * PAGE_SIZE; | ||
| 2729 | } | ||
| 2208 | 2730 | ||
| 2209 | /* per zone stat */ | 2731 | /* per zone stat */ |
| 2210 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); | 2732 | val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); |
| @@ -2236,8 +2758,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 2236 | memset(&mystat, 0, sizeof(mystat)); | 2758 | memset(&mystat, 0, sizeof(mystat)); |
| 2237 | mem_cgroup_get_local_stat(mem_cont, &mystat); | 2759 | mem_cgroup_get_local_stat(mem_cont, &mystat); |
| 2238 | 2760 | ||
| 2239 | for (i = 0; i < NR_MCS_STAT; i++) | 2761 | for (i = 0; i < NR_MCS_STAT; i++) { |
| 2762 | if (i == MCS_SWAP && !do_swap_account) | ||
| 2763 | continue; | ||
| 2240 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); | 2764 | cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); |
| 2765 | } | ||
| 2241 | 2766 | ||
| 2242 | /* Hierarchical information */ | 2767 | /* Hierarchical information */ |
| 2243 | { | 2768 | { |
| @@ -2250,9 +2775,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, | |||
| 2250 | 2775 | ||
| 2251 | memset(&mystat, 0, sizeof(mystat)); | 2776 | memset(&mystat, 0, sizeof(mystat)); |
| 2252 | mem_cgroup_get_total_stat(mem_cont, &mystat); | 2777 | mem_cgroup_get_total_stat(mem_cont, &mystat); |
| 2253 | for (i = 0; i < NR_MCS_STAT; i++) | 2778 | for (i = 0; i < NR_MCS_STAT; i++) { |
| 2779 | if (i == MCS_SWAP && !do_swap_account) | ||
| 2780 | continue; | ||
| 2254 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); | 2781 | cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); |
| 2255 | 2782 | } | |
| 2256 | 2783 | ||
| 2257 | #ifdef CONFIG_DEBUG_VM | 2784 | #ifdef CONFIG_DEBUG_VM |
| 2258 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); | 2785 | cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); |
| @@ -2345,6 +2872,12 @@ static struct cftype mem_cgroup_files[] = { | |||
| 2345 | .read_u64 = mem_cgroup_read, | 2872 | .read_u64 = mem_cgroup_read, |
| 2346 | }, | 2873 | }, |
| 2347 | { | 2874 | { |
| 2875 | .name = "soft_limit_in_bytes", | ||
| 2876 | .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), | ||
| 2877 | .write_string = mem_cgroup_write, | ||
| 2878 | .read_u64 = mem_cgroup_read, | ||
| 2879 | }, | ||
| 2880 | { | ||
| 2348 | .name = "failcnt", | 2881 | .name = "failcnt", |
| 2349 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), | 2882 | .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), |
| 2350 | .trigger = mem_cgroup_reset, | 2883 | .trigger = mem_cgroup_reset, |
| @@ -2438,6 +2971,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) | |||
| 2438 | mz = &pn->zoneinfo[zone]; | 2971 | mz = &pn->zoneinfo[zone]; |
| 2439 | for_each_lru(l) | 2972 | for_each_lru(l) |
| 2440 | INIT_LIST_HEAD(&mz->lists[l]); | 2973 | INIT_LIST_HEAD(&mz->lists[l]); |
| 2974 | mz->usage_in_excess = 0; | ||
| 2975 | mz->on_tree = false; | ||
| 2976 | mz->mem = mem; | ||
| 2441 | } | 2977 | } |
| 2442 | return 0; | 2978 | return 0; |
| 2443 | } | 2979 | } |
| @@ -2483,6 +3019,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) | |||
| 2483 | { | 3019 | { |
| 2484 | int node; | 3020 | int node; |
| 2485 | 3021 | ||
| 3022 | mem_cgroup_remove_from_trees(mem); | ||
| 2486 | free_css_id(&mem_cgroup_subsys, &mem->css); | 3023 | free_css_id(&mem_cgroup_subsys, &mem->css); |
| 2487 | 3024 | ||
| 2488 | for_each_node_state(node, N_POSSIBLE) | 3025 | for_each_node_state(node, N_POSSIBLE) |
| @@ -2531,6 +3068,31 @@ static void __init enable_swap_cgroup(void) | |||
| 2531 | } | 3068 | } |
| 2532 | #endif | 3069 | #endif |
| 2533 | 3070 | ||
| 3071 | static int mem_cgroup_soft_limit_tree_init(void) | ||
| 3072 | { | ||
| 3073 | struct mem_cgroup_tree_per_node *rtpn; | ||
| 3074 | struct mem_cgroup_tree_per_zone *rtpz; | ||
| 3075 | int tmp, node, zone; | ||
| 3076 | |||
| 3077 | for_each_node_state(node, N_POSSIBLE) { | ||
| 3078 | tmp = node; | ||
| 3079 | if (!node_state(node, N_NORMAL_MEMORY)) | ||
| 3080 | tmp = -1; | ||
| 3081 | rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); | ||
| 3082 | if (!rtpn) | ||
| 3083 | return 1; | ||
| 3084 | |||
| 3085 | soft_limit_tree.rb_tree_per_node[node] = rtpn; | ||
| 3086 | |||
| 3087 | for (zone = 0; zone < MAX_NR_ZONES; zone++) { | ||
| 3088 | rtpz = &rtpn->rb_tree_per_zone[zone]; | ||
| 3089 | rtpz->rb_root = RB_ROOT; | ||
| 3090 | spin_lock_init(&rtpz->lock); | ||
| 3091 | } | ||
| 3092 | } | ||
| 3093 | return 0; | ||
| 3094 | } | ||
| 3095 | |||
| 2534 | static struct cgroup_subsys_state * __ref | 3096 | static struct cgroup_subsys_state * __ref |
| 2535 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | 3097 | mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) |
| 2536 | { | 3098 | { |
| @@ -2545,10 +3107,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 2545 | for_each_node_state(node, N_POSSIBLE) | 3107 | for_each_node_state(node, N_POSSIBLE) |
| 2546 | if (alloc_mem_cgroup_per_zone_info(mem, node)) | 3108 | if (alloc_mem_cgroup_per_zone_info(mem, node)) |
| 2547 | goto free_out; | 3109 | goto free_out; |
| 3110 | |||
| 2548 | /* root ? */ | 3111 | /* root ? */ |
| 2549 | if (cont->parent == NULL) { | 3112 | if (cont->parent == NULL) { |
| 2550 | enable_swap_cgroup(); | 3113 | enable_swap_cgroup(); |
| 2551 | parent = NULL; | 3114 | parent = NULL; |
| 3115 | root_mem_cgroup = mem; | ||
| 3116 | if (mem_cgroup_soft_limit_tree_init()) | ||
| 3117 | goto free_out; | ||
| 3118 | |||
| 2552 | } else { | 3119 | } else { |
| 2553 | parent = mem_cgroup_from_cont(cont->parent); | 3120 | parent = mem_cgroup_from_cont(cont->parent); |
| 2554 | mem->use_hierarchy = parent->use_hierarchy; | 3121 | mem->use_hierarchy = parent->use_hierarchy; |
| @@ -2577,6 +3144,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 2577 | return &mem->css; | 3144 | return &mem->css; |
| 2578 | free_out: | 3145 | free_out: |
| 2579 | __mem_cgroup_free(mem); | 3146 | __mem_cgroup_free(mem); |
| 3147 | root_mem_cgroup = NULL; | ||
| 2580 | return ERR_PTR(error); | 3148 | return ERR_PTR(error); |
| 2581 | } | 3149 | } |
| 2582 | 3150 | ||
| @@ -2612,7 +3180,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, | |||
| 2612 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, | 3180 | static void mem_cgroup_move_task(struct cgroup_subsys *ss, |
| 2613 | struct cgroup *cont, | 3181 | struct cgroup *cont, |
| 2614 | struct cgroup *old_cont, | 3182 | struct cgroup *old_cont, |
| 2615 | struct task_struct *p) | 3183 | struct task_struct *p, |
| 3184 | bool threadgroup) | ||
| 2616 | { | 3185 | { |
| 2617 | mutex_lock(&memcg_tasklist); | 3186 | mutex_lock(&memcg_tasklist); |
| 2618 | /* | 3187 | /* |
