 include/linux/res_counter.h |   6 +-
 kernel/res_counter.c        |  18 ++-
 mm/memcontrol.c             | 300 +++++++++++++++++++++++++++++++++++--------
 3 files changed, 277 insertions(+), 47 deletions(-)
diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index fcb9884df618..731af71cddc9 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,7 +114,8 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
         unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-        unsigned long val, struct res_counter **limit_fail_at);
+        unsigned long val, struct res_counter **limit_fail_at,
+        struct res_counter **soft_limit_at);

 /*
  * uncharge - tell that some portion of the resource is released
@@ -127,7 +128,8 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */

 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+        bool *was_soft_limit_excess);

 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index bcdabf37c40b..88faec23e833 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,17 +37,27 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }

 int res_counter_charge(struct res_counter *counter, unsigned long val,
-            struct res_counter **limit_fail_at)
+            struct res_counter **limit_fail_at,
+            struct res_counter **soft_limit_fail_at)
 {
         int ret;
         unsigned long flags;
         struct res_counter *c, *u;

         *limit_fail_at = NULL;
+        if (soft_limit_fail_at)
+                *soft_limit_fail_at = NULL;
         local_irq_save(flags);
         for (c = counter; c != NULL; c = c->parent) {
                 spin_lock(&c->lock);
                 ret = res_counter_charge_locked(c, val);
+                /*
+                 * With soft limits, we return the highest ancestor
+                 * that exceeds its soft limit
+                 */
+                if (soft_limit_fail_at &&
+                        !res_counter_soft_limit_check_locked(c))
+                        *soft_limit_fail_at = c;
                 spin_unlock(&c->lock);
                 if (ret < 0) {
                         *limit_fail_at = c;
@@ -75,7 +85,8 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
         counter->usage -= val;
 }

-void res_counter_uncharge(struct res_counter *counter, unsigned long val)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val,
+                bool *was_soft_limit_excess)
 {
         unsigned long flags;
         struct res_counter *c;
@@ -83,6 +94,9 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val)
         local_irq_save(flags);
         for (c = counter; c != NULL; c = c->parent) {
                 spin_lock(&c->lock);
+                if (was_soft_limit_excess)
+                        *was_soft_limit_excess =
+                                !res_counter_soft_limit_check_locked(c);
                 res_counter_uncharge_locked(c, val);
                 spin_unlock(&c->lock);
         }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4ad3e6be045d..0ed325943cd1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/limits.h>
 #include <linux/mutex.h>
+#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/spinlock.h>
@@ -54,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #endif

 static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
+#define SOFTLIMIT_EVENTS_THRESH (1000)

 /*
  * Statistics for memory cgroup.
@@ -67,6 +69,7 @@ enum mem_cgroup_stat_index {
         MEM_CGROUP_STAT_MAPPED_FILE,	/* # of pages charged as file rss */
         MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
         MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
+        MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */

         MEM_CGROUP_STAT_NSTATS,
 };
@@ -79,6 +82,20 @@ struct mem_cgroup_stat {
         struct mem_cgroup_stat_cpu cpustat[0];
 };

+static inline void
+__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat,
+                enum mem_cgroup_stat_index idx)
+{
+        stat->count[idx] = 0;
+}
+
+static inline s64
+__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
+                enum mem_cgroup_stat_index idx)
+{
+        return stat->count[idx];
+}
+
 /*
  * For accounting under irq disable, no need for increment preempt count.
  */
@@ -118,6 +135,10 @@ struct mem_cgroup_per_zone {
         unsigned long		count[NR_LRU_LISTS];

         struct zone_reclaim_stat reclaim_stat;
+        struct rb_node		tree_node;	/* RB tree node */
+        unsigned long long	usage_in_excess;/* Set to the value by which */
+                                                /* the soft limit is exceeded*/
+        bool			on_tree;
 };
 /* Macro for accessing counter */
 #define MEM_CGROUP_ZSTAT(mz, idx)	((mz)->count[(idx)])
@@ -131,6 +152,26 @@ struct mem_cgroup_lru_info {
 };

 /*
+ * Cgroups above their limits are maintained in a RB-Tree, independent of
+ * their hierarchy representation
+ */
+
+struct mem_cgroup_tree_per_zone {
+        struct rb_root rb_root;
+        spinlock_t lock;
+};
+
+struct mem_cgroup_tree_per_node {
+        struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
+};
+
+struct mem_cgroup_tree {
+        struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
+};
+
+static struct mem_cgroup_tree soft_limit_tree __read_mostly;
+
+/*
  * The memory controller data structure. The memory controller controls both
  * page cache and RSS per cgroup. We would eventually like to provide
  * statistics based on the statistics developed by Rik Van Riel for clock-pro,
@@ -215,6 +256,150 @@ static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);

+static struct mem_cgroup_per_zone *
+mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
+{
+        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
+}
+
+static struct mem_cgroup_per_zone *
+page_cgroup_zoneinfo(struct page_cgroup *pc)
+{
+        struct mem_cgroup *mem = pc->mem_cgroup;
+        int nid = page_cgroup_nid(pc);
+        int zid = page_cgroup_zid(pc);
+
+        if (!mem)
+                return NULL;
+
+        return mem_cgroup_zoneinfo(mem, nid, zid);
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_node_zone(int nid, int zid)
+{
+        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static struct mem_cgroup_tree_per_zone *
+soft_limit_tree_from_page(struct page *page)
+{
+        int nid = page_to_nid(page);
+        int zid = page_zonenum(page);
+
+        return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
+}
+
+static void
+mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
+                                struct mem_cgroup_per_zone *mz,
+                                struct mem_cgroup_tree_per_zone *mctz)
+{
+        struct rb_node **p = &mctz->rb_root.rb_node;
+        struct rb_node *parent = NULL;
+        struct mem_cgroup_per_zone *mz_node;
+
+        if (mz->on_tree)
+                return;
+
+        mz->usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+        spin_lock(&mctz->lock);
+        while (*p) {
+                parent = *p;
+                mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
+                                        tree_node);
+                if (mz->usage_in_excess < mz_node->usage_in_excess)
+                        p = &(*p)->rb_left;
+                /*
+                 * We can't avoid mem cgroups that are over their soft
+                 * limit by the same amount
+                 */
+                else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+                        p = &(*p)->rb_right;
+        }
+        rb_link_node(&mz->tree_node, parent, p);
+        rb_insert_color(&mz->tree_node, &mctz->rb_root);
+        mz->on_tree = true;
+        spin_unlock(&mctz->lock);
+}
+
+static void
+mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
+                                struct mem_cgroup_per_zone *mz,
+                                struct mem_cgroup_tree_per_zone *mctz)
+{
+        spin_lock(&mctz->lock);
+        rb_erase(&mz->tree_node, &mctz->rb_root);
+        mz->on_tree = false;
+        spin_unlock(&mctz->lock);
+}
+
+static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
+{
+        bool ret = false;
+        int cpu;
+        s64 val;
+        struct mem_cgroup_stat_cpu *cpustat;
+
+        cpu = get_cpu();
+        cpustat = &mem->stat.cpustat[cpu];
+        val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS);
+        if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) {
+                __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS);
+                ret = true;
+        }
+        put_cpu();
+        return ret;
+}
+
+static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
+{
+        unsigned long long prev_usage_in_excess, new_usage_in_excess;
+        bool updated_tree = false;
+        struct mem_cgroup_per_zone *mz;
+        struct mem_cgroup_tree_per_zone *mctz;
+
+        mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+        mctz = soft_limit_tree_from_page(page);
+
+        /*
+         * We do updates in lazy mode, mem's are removed
+         * lazily from the per-zone, per-node rb tree
+         */
+        prev_usage_in_excess = mz->usage_in_excess;
+
+        new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
+        if (prev_usage_in_excess) {
+                mem_cgroup_remove_exceeded(mem, mz, mctz);
+                updated_tree = true;
+        }
+        if (!new_usage_in_excess)
+                goto done;
+        mem_cgroup_insert_exceeded(mem, mz, mctz);
+
+done:
+        if (updated_tree) {
+                spin_lock(&mctz->lock);
+                mz->usage_in_excess = new_usage_in_excess;
+                spin_unlock(&mctz->lock);
+        }
+}
+
+static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem)
+{
+        int node, zone;
+        struct mem_cgroup_per_zone *mz;
+        struct mem_cgroup_tree_per_zone *mctz;
+
+        for_each_node_state(node, N_POSSIBLE) {
+                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                        mz = mem_cgroup_zoneinfo(mem, node, zone);
+                        mctz = soft_limit_tree_node_zone(node, zone);
+                        mem_cgroup_remove_exceeded(mem, mz, mctz);
+                }
+        }
+}
+
 static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                          struct page_cgroup *pc,
                                          bool charge)
@@ -236,28 +421,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
         else
                 __mem_cgroup_stat_add_safe(cpustat,
                                 MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
+        __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1);
         put_cpu();
 }

-static struct mem_cgroup_per_zone *
-mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
-{
-        return &mem->info.nodeinfo[nid]->zoneinfo[zid];
-}
-
-static struct mem_cgroup_per_zone *
-page_cgroup_zoneinfo(struct page_cgroup *pc)
-{
-        struct mem_cgroup *mem = pc->mem_cgroup;
-        int nid = page_cgroup_nid(pc);
-        int zid = page_cgroup_zid(pc);
-
-        if (!mem)
-                return NULL;
-
-        return mem_cgroup_zoneinfo(mem, nid, zid);
-}
-
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                         enum lru_list idx)
 {
@@ -972,11 +1139,11 @@ done:
  */
 static int __mem_cgroup_try_charge(struct mm_struct *mm,
                         gfp_t gfp_mask, struct mem_cgroup **memcg,
-                        bool oom)
+                        bool oom, struct page *page)
 {
-        struct mem_cgroup *mem, *mem_over_limit;
+        struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-        struct res_counter *fail_res;
+        struct res_counter *fail_res, *soft_fail_res = NULL;

         if (unlikely(test_thread_flag(TIF_MEMDIE))) {
                 /* Don't account this! */
@@ -1006,16 +1173,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                 int ret;
                 bool noswap = false;

-                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+                ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
+                                                &soft_fail_res);
                 if (likely(!ret)) {
                         if (!do_swap_account)
                                 break;
                         ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-                                                        &fail_res);
+                                                        &fail_res, NULL);
                         if (likely(!ret))
                                 break;
                         /* mem+swap counter fails */
-                        res_counter_uncharge(&mem->res, PAGE_SIZE);
+                        res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                         noswap = true;
                         mem_over_limit = mem_cgroup_from_res_counter(fail_res,
                                                                         memsw);
@@ -1053,13 +1221,23 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                         goto nomem;
                 }
         }
+        /*
+         * Insert just the ancestor, we should trickle down to the correct
+         * cgroup for reclaim, since the other nodes will be below their
+         * soft limit
+         */
+        if (soft_fail_res) {
+                mem_over_soft_limit =
+                        mem_cgroup_from_res_counter(soft_fail_res, res);
+                if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
+                        mem_cgroup_update_tree(mem_over_soft_limit, page);
+        }
         return 0;
 nomem:
         css_put(&mem->css);
         return -ENOMEM;
 }

-
 /*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
@@ -1126,9 +1304,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
         lock_page_cgroup(pc);
         if (unlikely(PageCgroupUsed(pc))) {
                 unlock_page_cgroup(pc);
-                res_counter_uncharge(&mem->res, PAGE_SIZE);
+                res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
                 if (do_swap_account)
-                        res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                        res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
                 css_put(&mem->css);
                 return;
         }
@@ -1205,7 +1383,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
         if (pc->mem_cgroup != from)
                 goto out;

-        res_counter_uncharge(&from->res, PAGE_SIZE);
+        res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
         mem_cgroup_charge_statistics(from, pc, false);

         page = pc->page;
@@ -1225,7 +1403,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
         }

         if (do_swap_account)
-                res_counter_uncharge(&from->memsw, PAGE_SIZE);
+                res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
         css_put(&from->css);

         css_get(&to->css);
@@ -1265,7 +1443,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
         parent = mem_cgroup_from_cont(pcg);


-        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
+        ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
         if (ret || !parent)
                 return ret;

@@ -1295,9 +1473,9 @@ uncharge:
         /* drop extra refcnt by try_charge() */
         css_put(&parent->css);
         /* uncharge if move fails */
-        res_counter_uncharge(&parent->res, PAGE_SIZE);
+        res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
         if (do_swap_account)
-                res_counter_uncharge(&parent->memsw, PAGE_SIZE);
+                res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
         return ret;
 }

@@ -1322,7 +1500,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
         prefetchw(pc);

         mem = memcg;
-        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
+        ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
         if (ret || !mem)
                 return ret;

@@ -1441,14 +1619,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
         if (!mem)
                 goto charge_cur_mm;
         *ptr = mem;
-        ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
+        ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
         /* drop extra refcnt from tryget */
         css_put(&mem->css);
         return ret;
 charge_cur_mm:
         if (unlikely(!mm))
                 mm = &init_mm;
-        return __mem_cgroup_try_charge(mm, mask, ptr, true);
+        return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
 }

 static void
@@ -1486,7 +1664,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
                  * This recorded memcg can be obsolete one. So, avoid
                  * calling css_tryget
                  */
-                res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
                 mem_cgroup_put(memcg);
         }
         rcu_read_unlock();
@@ -1511,9 +1689,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
                 return;
         if (!mem)
                 return;
-        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
         if (do_swap_account)
-                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
         css_put(&mem->css);
 }

@@ -1527,6 +1705,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         struct page_cgroup *pc;
         struct mem_cgroup *mem = NULL;
         struct mem_cgroup_per_zone *mz;
+        bool soft_limit_excess = false;

         if (mem_cgroup_disabled())
                 return NULL;
@@ -1565,9 +1744,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                 break;
         }

-        res_counter_uncharge(&mem->res, PAGE_SIZE);
+        res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
         if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-                res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+                res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
         mem_cgroup_charge_statistics(mem, pc, false);

         ClearPageCgroupUsed(pc);
@@ -1581,6 +1760,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         mz = page_cgroup_zoneinfo(pc);
         unlock_page_cgroup(pc);

+        if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+                mem_cgroup_update_tree(mem, page);
         /* at swapout, this memcg will be accessed to record to swap */
         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                 css_put(&mem->css);
@@ -1656,7 +1837,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
                  * We uncharge this because swap is freed.
                  * This memcg can be obsolete one. We avoid calling css_tryget
                  */
-                res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
+                res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
                 mem_cgroup_put(memcg);
         }
         rcu_read_unlock();
@@ -1685,7 +1866,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
         unlock_page_cgroup(pc);

         if (mem) {
-                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
+                ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
+                                                page);
                 css_put(&mem->css);
         }
         *ptr = mem;
@@ -2194,6 +2376,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
                 res_counter_reset_failcnt(&mem->memsw);
                 break;
         }
+
         return 0;
 }

@@ -2489,6 +2672,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
                 mz = &pn->zoneinfo[zone];
                 for_each_lru(l)
                         INIT_LIST_HEAD(&mz->lists[l]);
+                mz->usage_in_excess = 0;
         }
         return 0;
 }
@@ -2534,6 +2718,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
 {
         int node;

+        mem_cgroup_remove_from_trees(mem);
         free_css_id(&mem_cgroup_subsys, &mem->css);

         for_each_node_state(node, N_POSSIBLE)
@@ -2582,6 +2767,31 @@ static void __init enable_swap_cgroup(void)
 }
 #endif

+static int mem_cgroup_soft_limit_tree_init(void)
+{
+        struct mem_cgroup_tree_per_node *rtpn;
+        struct mem_cgroup_tree_per_zone *rtpz;
+        int tmp, node, zone;
+
+        for_each_node_state(node, N_POSSIBLE) {
+                tmp = node;
+                if (!node_state(node, N_NORMAL_MEMORY))
+                        tmp = -1;
+                rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
+                if (!rtpn)
+                        return 1;
+
+                soft_limit_tree.rb_tree_per_node[node] = rtpn;
+
+                for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+                        rtpz = &rtpn->rb_tree_per_zone[zone];
+                        rtpz->rb_root = RB_ROOT;
+                        spin_lock_init(&rtpz->lock);
+                }
+        }
+        return 0;
+}
+
 static struct cgroup_subsys_state * __ref
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
@@ -2596,11 +2806,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
         for_each_node_state(node, N_POSSIBLE)
                 if (alloc_mem_cgroup_per_zone_info(mem, node))
                         goto free_out;
+
         /* root ? */
         if (cont->parent == NULL) {
                 enable_swap_cgroup();
                 parent = NULL;
                 root_mem_cgroup = mem;
+                if (mem_cgroup_soft_limit_tree_init())
+                        goto free_out;
+
         } else {
                 parent = mem_cgroup_from_cont(cont->parent);
                 mem->use_hierarchy = parent->use_hierarchy;
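
---

For readers following the control flow rather than the individual hunks: the charge path now hands res_counter_charge() an extra soft_limit_fail_at pointer, and when the returned ancestor is over its soft limit and the per-cpu MEM_CGROUP_STAT_EVENTS counter has crossed SOFTLIMIT_EVENTS_THRESH, mem_cgroup_update_tree() lazily re-files that group's per-zone node in an RB-tree keyed by usage_in_excess; the uncharge path does the symmetric check via was_soft_limit_excess. The stand-alone sketch below is illustrative only, not part of the patch: it models that bookkeeping in user space, a sorted array stands in for the kernel's per-zone rb-tree, and every name in it (group, update_tree, account, EVENTS_THRESH, ...) is made up for the example rather than taken from the kernel.

/*
 * Toy user-space model of the soft-limit bookkeeping added by this patch.
 * A sorted array stands in for the per-zone rb-tree; "events" plays the
 * role of MEM_CGROUP_STAT_EVENTS and EVENTS_THRESH of
 * SOFTLIMIT_EVENTS_THRESH. All names here are illustrative assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

#define EVENTS_THRESH 4		/* the kernel patch uses 1000 */
#define MAX_GROUPS 8

struct group {
	const char *name;
	unsigned long usage;
	unsigned long soft_limit;
	unsigned long usage_in_excess;	/* sort key, like mem_cgroup_per_zone */
	bool on_tree;
	unsigned long events;
};

static struct group *tree[MAX_GROUPS];	/* kept sorted by usage_in_excess */
static int tree_len;

static unsigned long soft_limit_excess(struct group *g)
{
	return g->usage > g->soft_limit ? g->usage - g->soft_limit : 0;
}

static void remove_exceeded(struct group *g)
{
	int i, j;

	for (i = 0; i < tree_len; i++) {
		if (tree[i] != g)
			continue;
		for (j = i; j < tree_len - 1; j++)
			tree[j] = tree[j + 1];
		tree_len--;
		g->on_tree = false;
		return;
	}
}

static void insert_exceeded(struct group *g)
{
	int i, j;

	if (g->on_tree || tree_len == MAX_GROUPS)
		return;
	g->usage_in_excess = soft_limit_excess(g);
	/* equal excess goes to the right, as in the patch's rb-tree insert */
	for (i = 0; i < tree_len; i++)
		if (g->usage_in_excess < tree[i]->usage_in_excess)
			break;
	for (j = tree_len; j > i; j--)
		tree[j] = tree[j - 1];
	tree[i] = g;
	tree_len++;
	g->on_tree = true;
}

/* Mirrors the lazy remove/re-insert done by mem_cgroup_update_tree(). */
static void update_tree(struct group *g)
{
	if (g->on_tree)
		remove_exceeded(g);
	if (soft_limit_excess(g))
		insert_exceeded(g);
}

/* Charge/uncharge paths only refile a group every EVENTS_THRESH events. */
static void account(struct group *g, long pages)
{
	g->usage += pages;
	if (++g->events > EVENTS_THRESH) {
		g->events = 0;
		if (soft_limit_excess(g) || g->on_tree)
			update_tree(g);
	}
}

int main(void)
{
	struct group a = { .name = "a", .soft_limit = 10 };
	struct group b = { .name = "b", .soft_limit = 5 };
	int i;

	for (i = 0; i < 20; i++) {
		account(&a, 1);
		account(&b, 1);
	}
	/* Prints groups from smallest to largest excess, like an in-order walk. */
	for (i = 0; i < tree_len; i++)
		printf("%s exceeds its soft limit by %lu pages\n",
		       tree[i]->name, tree[i]->usage_in_excess);
	return 0;
}

Running the model, "a" ends up roughly 10 pages and "b" roughly 15 pages over their soft limits, and the walk reports them in that order; a later consumer (in the kernel, the reclaim side) can then pick the worst offenders first.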
