Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--  mm/memcontrol.c  365
1 file changed, 138 insertions, 227 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 631002d085d1..8b9f6cae938e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -137,14 +137,21 @@ struct mem_cgroup {
137 */ 137 */
138 struct mem_cgroup_stat stat; 138 struct mem_cgroup_stat stat;
139}; 139};
140static struct mem_cgroup init_mem_cgroup;
140 141
141/* 142/*
142 * We use the lower bit of the page->page_cgroup pointer as a bit spin 143 * We use the lower bit of the page->page_cgroup pointer as a bit spin
143 * lock. We need to ensure that page->page_cgroup is atleast two 144 * lock. We need to ensure that page->page_cgroup is at least two
144 * byte aligned (based on comments from Nick Piggin) 145 * byte aligned (based on comments from Nick Piggin). But since
146 * bit_spin_lock doesn't actually set that lock bit in a non-debug
147 * uniprocessor kernel, we should avoid setting it here too.
145 */ 148 */
146#define PAGE_CGROUP_LOCK_BIT 0x0 149#define PAGE_CGROUP_LOCK_BIT 0x0
147#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) 150#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
151#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT)
152#else
153#define PAGE_CGROUP_LOCK 0x0
154#endif
148 155
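The hunk above relies on page_cgroup pointers being at least two-byte aligned, so bit 0 of page->page_cgroup is free to serve as a bit spinlock; on a non-debug UP kernel bit_spin_lock() never sets that bit, so PAGE_CGROUP_LOCK collapses to 0 and the assignment path can OR it in unconditionally. A minimal user-space sketch of the same pointer-packing idea, with illustrative names only (nothing below is kernel API):

    /*
     * Sketch: pack a lock bit into the low bit of an aligned pointer,
     * the way page->page_cgroup carries both the page_cgroup pointer
     * and PAGE_CGROUP_LOCK. Assumes the pointee is >= 2-byte aligned.
     */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    #define LOCK_BIT   0
    #define LOCK_MASK  (1UL << LOCK_BIT)

    struct payload { int value; };      /* naturally aligned, bit 0 unused */

    static uintptr_t pack(struct payload *p, int locked)
    {
        assert(((uintptr_t)p & LOCK_MASK) == 0);    /* alignment guarantee */
        return (uintptr_t)p | (locked ? LOCK_MASK : 0);
    }

    static struct payload *unpack(uintptr_t word)
    {
        return (struct payload *)(word & ~LOCK_MASK);
    }

    int main(void)
    {
        struct payload pl = { .value = 42 };
        uintptr_t word = pack(&pl, 1);

        printf("locked=%lu value=%d\n",
               (unsigned long)(word & LOCK_MASK), unpack(word)->value);
        return 0;
    }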
149/* 156/*
150 * A page_cgroup page is associated with every page descriptor. The 157 * A page_cgroup page is associated with every page descriptor. The
@@ -154,37 +161,27 @@ struct page_cgroup {
154 struct list_head lru; /* per cgroup LRU list */ 161 struct list_head lru; /* per cgroup LRU list */
155 struct page *page; 162 struct page *page;
156 struct mem_cgroup *mem_cgroup; 163 struct mem_cgroup *mem_cgroup;
157 atomic_t ref_cnt; /* Helpful when pages move b/w */ 164 int ref_cnt; /* cached, mapped, migrating */
158 /* mapped and cached states */ 165 int flags;
159 int flags;
160}; 166};
161#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ 167#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
162#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ 168#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */
163 169
164static inline int page_cgroup_nid(struct page_cgroup *pc) 170static int page_cgroup_nid(struct page_cgroup *pc)
165{ 171{
166 return page_to_nid(pc->page); 172 return page_to_nid(pc->page);
167} 173}
168 174
169static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) 175static enum zone_type page_cgroup_zid(struct page_cgroup *pc)
170{ 176{
171 return page_zonenum(pc->page); 177 return page_zonenum(pc->page);
172} 178}
173 179
174enum {
175 MEM_CGROUP_TYPE_UNSPEC = 0,
176 MEM_CGROUP_TYPE_MAPPED,
177 MEM_CGROUP_TYPE_CACHED,
178 MEM_CGROUP_TYPE_ALL,
179 MEM_CGROUP_TYPE_MAX,
180};
181
182enum charge_type { 180enum charge_type {
183 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 181 MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
184 MEM_CGROUP_CHARGE_TYPE_MAPPED, 182 MEM_CGROUP_CHARGE_TYPE_MAPPED,
185}; 183};
186 184
187
188/* 185/*
189 * Always modified under lru lock. Then, not necessary to preempt_disable() 186 * Always modified under lru lock. Then, not necessary to preempt_disable()
190 */ 187 */
@@ -193,23 +190,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags,
193{ 190{
194 int val = (charge)? 1 : -1; 191 int val = (charge)? 1 : -1;
195 struct mem_cgroup_stat *stat = &mem->stat; 192 struct mem_cgroup_stat *stat = &mem->stat;
196 VM_BUG_ON(!irqs_disabled());
197 193
194 VM_BUG_ON(!irqs_disabled());
198 if (flags & PAGE_CGROUP_FLAG_CACHE) 195 if (flags & PAGE_CGROUP_FLAG_CACHE)
199 __mem_cgroup_stat_add_safe(stat, 196 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val);
200 MEM_CGROUP_STAT_CACHE, val);
201 else 197 else
202 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); 198 __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val);
203} 199}
204 200
205static inline struct mem_cgroup_per_zone * 201static struct mem_cgroup_per_zone *
206mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) 202mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
207{ 203{
208 BUG_ON(!mem->info.nodeinfo[nid]);
209 return &mem->info.nodeinfo[nid]->zoneinfo[zid]; 204 return &mem->info.nodeinfo[nid]->zoneinfo[zid];
210} 205}
211 206
212static inline struct mem_cgroup_per_zone * 207static struct mem_cgroup_per_zone *
213page_cgroup_zoneinfo(struct page_cgroup *pc) 208page_cgroup_zoneinfo(struct page_cgroup *pc)
214{ 209{
215 struct mem_cgroup *mem = pc->mem_cgroup; 210 struct mem_cgroup *mem = pc->mem_cgroup;
@@ -234,18 +229,14 @@ static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem,
234 return total; 229 return total;
235} 230}
236 231
237static struct mem_cgroup init_mem_cgroup; 232static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
238
239static inline
240struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
241{ 233{
242 return container_of(cgroup_subsys_state(cont, 234 return container_of(cgroup_subsys_state(cont,
243 mem_cgroup_subsys_id), struct mem_cgroup, 235 mem_cgroup_subsys_id), struct mem_cgroup,
244 css); 236 css);
245} 237}
246 238
247static inline 239static struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
248struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
249{ 240{
250 return container_of(task_subsys_state(p, mem_cgroup_subsys_id), 241 return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
251 struct mem_cgroup, css); 242 struct mem_cgroup, css);
@@ -267,81 +258,33 @@ void mm_free_cgroup(struct mm_struct *mm)
267 258
268static inline int page_cgroup_locked(struct page *page) 259static inline int page_cgroup_locked(struct page *page)
269{ 260{
270 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, 261 return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
271 &page->page_cgroup);
272} 262}
273 263
274void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) 264static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
275{ 265{
276 int locked; 266 VM_BUG_ON(!page_cgroup_locked(page));
277 267 page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK);
278 /*
279 * While resetting the page_cgroup we might not hold the
280 * page_cgroup lock. free_hot_cold_page() is an example
281 * of such a scenario
282 */
283 if (pc)
284 VM_BUG_ON(!page_cgroup_locked(page));
285 locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
286 page->page_cgroup = ((unsigned long)pc | locked);
287} 268}
288 269
289struct page_cgroup *page_get_page_cgroup(struct page *page) 270struct page_cgroup *page_get_page_cgroup(struct page *page)
290{ 271{
291 return (struct page_cgroup *) 272 return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK);
292 (page->page_cgroup & ~PAGE_CGROUP_LOCK);
293} 273}
294 274
295static void __always_inline lock_page_cgroup(struct page *page) 275static void lock_page_cgroup(struct page *page)
296{ 276{
297 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); 277 bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
298 VM_BUG_ON(!page_cgroup_locked(page));
299}
300
301static void __always_inline unlock_page_cgroup(struct page *page)
302{
303 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
304} 278}
305 279
306/* 280static int try_lock_page_cgroup(struct page *page)
307 * Tie new page_cgroup to struct page under lock_page_cgroup()
308 * This can fail if the page has been tied to a page_cgroup.
309 * If success, returns 0.
310 */
311static int page_cgroup_assign_new_page_cgroup(struct page *page,
312 struct page_cgroup *pc)
313{ 281{
314 int ret = 0; 282 return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
315
316 lock_page_cgroup(page);
317 if (!page_get_page_cgroup(page))
318 page_assign_page_cgroup(page, pc);
319 else /* A page is tied to other pc. */
320 ret = 1;
321 unlock_page_cgroup(page);
322 return ret;
323} 283}
324 284
325/* 285static void unlock_page_cgroup(struct page *page)
326 * Clear page->page_cgroup member under lock_page_cgroup().
327 * If given "pc" value is different from one page->page_cgroup,
328 * page->cgroup is not cleared.
329 * Returns a value of page->page_cgroup at lock taken.
330 * A can can detect failure of clearing by following
331 * clear_page_cgroup(page, pc) == pc
332 */
333
334static struct page_cgroup *clear_page_cgroup(struct page *page,
335 struct page_cgroup *pc)
336{ 286{
337 struct page_cgroup *ret; 287 bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
338 /* lock and clear */
339 lock_page_cgroup(page);
340 ret = page_get_page_cgroup(page);
341 if (likely(ret == pc))
342 page_assign_page_cgroup(page, NULL);
343 unlock_page_cgroup(page);
344 return ret;
345} 288}
346 289
347static void __mem_cgroup_remove_list(struct page_cgroup *pc) 290static void __mem_cgroup_remove_list(struct page_cgroup *pc)
@@ -399,7 +342,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
399 int ret; 342 int ret;
400 343
401 task_lock(task); 344 task_lock(task);
402 ret = task->mm && vm_match_cgroup(task->mm, mem); 345 ret = task->mm && mm_match_cgroup(task->mm, mem);
403 task_unlock(task); 346 task_unlock(task);
404 return ret; 347 return ret;
405} 348}
@@ -407,18 +350,30 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
407/* 350/*
408 * This routine assumes that the appropriate zone's lru lock is already held 351 * This routine assumes that the appropriate zone's lru lock is already held
409 */ 352 */
410void mem_cgroup_move_lists(struct page_cgroup *pc, bool active) 353void mem_cgroup_move_lists(struct page *page, bool active)
411{ 354{
355 struct page_cgroup *pc;
412 struct mem_cgroup_per_zone *mz; 356 struct mem_cgroup_per_zone *mz;
413 unsigned long flags; 357 unsigned long flags;
414 358
415 if (!pc) 359 /*
360 * We cannot lock_page_cgroup while holding zone's lru_lock,
361 * because other holders of lock_page_cgroup can be interrupted
362 * with an attempt to rotate_reclaimable_page. But we cannot
363 * safely get to page_cgroup without it, so just try_lock it:
364 * mem_cgroup_isolate_pages allows for page left on wrong list.
365 */
366 if (!try_lock_page_cgroup(page))
416 return; 367 return;
417 368
418 mz = page_cgroup_zoneinfo(pc); 369 pc = page_get_page_cgroup(page);
419 spin_lock_irqsave(&mz->lru_lock, flags); 370 if (pc) {
420 __mem_cgroup_move_lists(pc, active); 371 mz = page_cgroup_zoneinfo(pc);
421 spin_unlock_irqrestore(&mz->lru_lock, flags); 372 spin_lock_irqsave(&mz->lru_lock, flags);
373 __mem_cgroup_move_lists(pc, active);
374 spin_unlock_irqrestore(&mz->lru_lock, flags);
375 }
376 unlock_page_cgroup(page);
422} 377}
423 378
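The rewritten mem_cgroup_move_lists() above takes the page itself and uses try_lock_page_cgroup() instead of a blocking lock, because a caller already holding a zone lru lock must not wait on a lock whose holder can be interrupted into rotate_reclaimable_page() and need that same lru lock; losing the trylock simply leaves the page on the wrong list until mem_cgroup_isolate_pages() sorts it out. A user-space sketch of the try-and-back-off pattern, with illustrative lock and variable names:

    /*
     * Sketch: while holding one lock (lru_lock analogue), never block on
     * the other (page_cgroup lock analogue); try it and back off instead.
     */
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t page_cgroup_lock = PTHREAD_MUTEX_INITIALIZER;
    static int on_active_list;

    static void move_lists(int active)
    {
        /* Caller already holds lru_lock; blocking here could deadlock
         * against a path that needs lru_lock while holding the other lock. */
        if (pthread_mutex_trylock(&page_cgroup_lock) != 0)
            return;                     /* leave the page on the wrong list */

        on_active_list = active;        /* the "move", under both locks */
        pthread_mutex_unlock(&page_cgroup_lock);
    }

    int main(void)
    {
        pthread_mutex_lock(&lru_lock);
        move_lists(1);
        pthread_mutex_unlock(&lru_lock);
        printf("active=%d\n", on_active_list);
        return 0;
    }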
424/* 379/*
@@ -437,6 +392,7 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem)
437 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); 392 rss = (long)mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
438 return (int)((rss * 100L) / total); 393 return (int)((rss * 100L) / total);
439} 394}
395
440/* 396/*
441 * This function is called from vmscan.c. In page reclaiming loop. balance 397 * This function is called from vmscan.c. In page reclaiming loop. balance
442 * between active and inactive list is calculated. For memory controller 398 * between active and inactive list is calculated. For memory controller
@@ -500,7 +456,6 @@ long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem,
500 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); 456 struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid);
501 457
502 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); 458 nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE);
503
504 return (nr_inactive >> priority); 459 return (nr_inactive >> priority);
505} 460}
506 461
@@ -586,26 +541,21 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
586 * with it 541 * with it
587 */ 542 */
588retry: 543retry:
589 if (page) { 544 lock_page_cgroup(page);
590 lock_page_cgroup(page); 545 pc = page_get_page_cgroup(page);
591 pc = page_get_page_cgroup(page); 546 /*
592 /* 547 * The page_cgroup exists and
593 * The page_cgroup exists and 548 * the page has already been accounted.
594 * the page has already been accounted. 549 */
595 */ 550 if (pc) {
596 if (pc) { 551 VM_BUG_ON(pc->page != page);
597 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) { 552 VM_BUG_ON(pc->ref_cnt <= 0);
598 /* this page is under being uncharged ? */ 553
599 unlock_page_cgroup(page); 554 pc->ref_cnt++;
600 cpu_relax();
601 goto retry;
602 } else {
603 unlock_page_cgroup(page);
604 goto done;
605 }
606 }
607 unlock_page_cgroup(page); 555 unlock_page_cgroup(page);
556 goto done;
608 } 557 }
558 unlock_page_cgroup(page);
609 559
610 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask); 560 pc = kzalloc(sizeof(struct page_cgroup), gfp_mask);
611 if (pc == NULL) 561 if (pc == NULL)
@@ -623,16 +573,11 @@ retry:
623 rcu_read_lock(); 573 rcu_read_lock();
624 mem = rcu_dereference(mm->mem_cgroup); 574 mem = rcu_dereference(mm->mem_cgroup);
625 /* 575 /*
626 * For every charge from the cgroup, increment reference 576 * For every charge from the cgroup, increment reference count
627 * count
628 */ 577 */
629 css_get(&mem->css); 578 css_get(&mem->css);
630 rcu_read_unlock(); 579 rcu_read_unlock();
631 580
632 /*
633 * If we created the page_cgroup, we should free it on exceeding
634 * the cgroup limit.
635 */
636 while (res_counter_charge(&mem->res, PAGE_SIZE)) { 581 while (res_counter_charge(&mem->res, PAGE_SIZE)) {
637 if (!(gfp_mask & __GFP_WAIT)) 582 if (!(gfp_mask & __GFP_WAIT))
638 goto out; 583 goto out;
@@ -641,12 +586,12 @@ retry:
641 continue; 586 continue;
642 587
643 /* 588 /*
644 * try_to_free_mem_cgroup_pages() might not give us a full 589 * try_to_free_mem_cgroup_pages() might not give us a full
645 * picture of reclaim. Some pages are reclaimed and might be 590 * picture of reclaim. Some pages are reclaimed and might be
646 * moved to swap cache or just unmapped from the cgroup. 591 * moved to swap cache or just unmapped from the cgroup.
647 * Check the limit again to see if the reclaim reduced the 592 * Check the limit again to see if the reclaim reduced the
648 * current usage of the cgroup before giving up 593 * current usage of the cgroup before giving up
649 */ 594 */
650 if (res_counter_check_under_limit(&mem->res)) 595 if (res_counter_check_under_limit(&mem->res))
651 continue; 596 continue;
652 597
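The loop above keeps retrying the charge, reclaiming between attempts and re-checking the limit, because reclaim can lower the cgroup's usage even when it reports little direct progress. A self-contained user-space sketch of that retry shape, with an invented counter and reclaim stub standing in for res_counter_charge() and try_to_free_mem_cgroup_pages():

    /* Sketch: charge one page against a limit, reclaiming and retrying. */
    #include <stdbool.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define MAX_RETRIES 3

    static unsigned long usage, limit = 2 * PAGE_SIZE;

    static bool counter_charge(unsigned long sz)
    {
        if (usage + sz > limit)
            return false;
        usage += sz;
        return true;
    }

    static bool try_reclaim(void)
    {
        if (usage >= PAGE_SIZE) {       /* pretend we freed one page */
            usage -= PAGE_SIZE;
            return true;
        }
        return false;
    }

    static int charge_one_page(void)
    {
        int retries = MAX_RETRIES;

        while (!counter_charge(PAGE_SIZE)) {
            if (try_reclaim())
                continue;
            /* no reclaim progress reported, but usage may still have dropped */
            if (usage < limit)
                continue;
            if (--retries == 0)
                return -1;              /* -ENOMEM in the kernel */
        }
        return 0;
    }

    int main(void)
    {
        usage = limit;                  /* start fully charged */
        printf("charge -> %d, usage=%lu\n", charge_one_page(), usage);
        return 0;
    }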
@@ -657,14 +602,16 @@ retry:
657 congestion_wait(WRITE, HZ/10); 602 congestion_wait(WRITE, HZ/10);
658 } 603 }
659 604
660 atomic_set(&pc->ref_cnt, 1); 605 pc->ref_cnt = 1;
661 pc->mem_cgroup = mem; 606 pc->mem_cgroup = mem;
662 pc->page = page; 607 pc->page = page;
663 pc->flags = PAGE_CGROUP_FLAG_ACTIVE; 608 pc->flags = PAGE_CGROUP_FLAG_ACTIVE;
664 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) 609 if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
665 pc->flags |= PAGE_CGROUP_FLAG_CACHE; 610 pc->flags |= PAGE_CGROUP_FLAG_CACHE;
666 611
667 if (!page || page_cgroup_assign_new_page_cgroup(page, pc)) { 612 lock_page_cgroup(page);
613 if (page_get_page_cgroup(page)) {
614 unlock_page_cgroup(page);
668 /* 615 /*
669 * Another charge has been added to this page already. 616 * Another charge has been added to this page already.
670 * We take lock_page_cgroup(page) again and read 617 * We take lock_page_cgroup(page) again and read
@@ -673,17 +620,16 @@ retry:
673 res_counter_uncharge(&mem->res, PAGE_SIZE); 620 res_counter_uncharge(&mem->res, PAGE_SIZE);
674 css_put(&mem->css); 621 css_put(&mem->css);
675 kfree(pc); 622 kfree(pc);
676 if (!page)
677 goto done;
678 goto retry; 623 goto retry;
679 } 624 }
625 page_assign_page_cgroup(page, pc);
680 626
681 mz = page_cgroup_zoneinfo(pc); 627 mz = page_cgroup_zoneinfo(pc);
682 spin_lock_irqsave(&mz->lru_lock, flags); 628 spin_lock_irqsave(&mz->lru_lock, flags);
683 /* Update statistics vector */
684 __mem_cgroup_add_list(pc); 629 __mem_cgroup_add_list(pc);
685 spin_unlock_irqrestore(&mz->lru_lock, flags); 630 spin_unlock_irqrestore(&mz->lru_lock, flags);
686 631
632 unlock_page_cgroup(page);
687done: 633done:
688 return 0; 634 return 0;
689out: 635out:
@@ -693,70 +639,61 @@ err:
693 return -ENOMEM; 639 return -ENOMEM;
694} 640}
695 641
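mem_cgroup_charge_common() now allocates the page_cgroup with no page lock held, then re-takes lock_page_cgroup() and only installs the new structure if nobody raced in first; on a lost race it frees its copy and retries from the top. A user-space sketch of that allocate-then-recheck-under-lock pattern (struct pc, page_lock and charge() below are illustrative stand-ins, not kernel code):

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct pc { int ref_cnt; };

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct pc *page_pc;          /* the page->page_cgroup analogue */

    static int charge(void)
    {
    retry:
        pthread_mutex_lock(&page_lock);
        if (page_pc) {                  /* already charged: just take a ref */
            page_pc->ref_cnt++;
            pthread_mutex_unlock(&page_lock);
            return 0;
        }
        pthread_mutex_unlock(&page_lock);

        struct pc *pc = calloc(1, sizeof(*pc));   /* may sleep; no lock held */
        if (!pc)
            return -1;
        pc->ref_cnt = 1;

        pthread_mutex_lock(&page_lock);
        if (page_pc) {                  /* lost the race: undo and retry */
            pthread_mutex_unlock(&page_lock);
            free(pc);
            goto retry;
        }
        page_pc = pc;                   /* commit while still holding the lock */
        pthread_mutex_unlock(&page_lock);
        return 0;
    }

    int main(void)
    {
        printf("%d %d ref=%d\n", charge(), charge(), page_pc->ref_cnt);
        return 0;
    }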
696int mem_cgroup_charge(struct page *page, struct mm_struct *mm, 642int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
697 gfp_t gfp_mask)
698{ 643{
699 return mem_cgroup_charge_common(page, mm, gfp_mask, 644 return mem_cgroup_charge_common(page, mm, gfp_mask,
700 MEM_CGROUP_CHARGE_TYPE_MAPPED); 645 MEM_CGROUP_CHARGE_TYPE_MAPPED);
701} 646}
702 647
703/*
704 * See if the cached pages should be charged at all?
705 */
706int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 648int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
707 gfp_t gfp_mask) 649 gfp_t gfp_mask)
708{ 650{
709 int ret = 0;
710 if (!mm) 651 if (!mm)
711 mm = &init_mm; 652 mm = &init_mm;
712 653 return mem_cgroup_charge_common(page, mm, gfp_mask,
713 ret = mem_cgroup_charge_common(page, mm, gfp_mask,
714 MEM_CGROUP_CHARGE_TYPE_CACHE); 654 MEM_CGROUP_CHARGE_TYPE_CACHE);
715 return ret;
716} 655}
717 656
718/* 657/*
719 * Uncharging is always a welcome operation, we never complain, simply 658 * Uncharging is always a welcome operation, we never complain, simply
720 * uncharge. This routine should be called with lock_page_cgroup held 659 * uncharge.
721 */ 660 */
722void mem_cgroup_uncharge(struct page_cgroup *pc) 661void mem_cgroup_uncharge_page(struct page *page)
723{ 662{
663 struct page_cgroup *pc;
724 struct mem_cgroup *mem; 664 struct mem_cgroup *mem;
725 struct mem_cgroup_per_zone *mz; 665 struct mem_cgroup_per_zone *mz;
726 struct page *page;
727 unsigned long flags; 666 unsigned long flags;
728 667
729 /* 668 /*
730 * Check if our page_cgroup is valid 669 * Check if our page_cgroup is valid
731 */ 670 */
671 lock_page_cgroup(page);
672 pc = page_get_page_cgroup(page);
732 if (!pc) 673 if (!pc)
733 return; 674 goto unlock;
734 675
735 if (atomic_dec_and_test(&pc->ref_cnt)) { 676 VM_BUG_ON(pc->page != page);
736 page = pc->page; 677 VM_BUG_ON(pc->ref_cnt <= 0);
678
679 if (--(pc->ref_cnt) == 0) {
737 mz = page_cgroup_zoneinfo(pc); 680 mz = page_cgroup_zoneinfo(pc);
738 /* 681 spin_lock_irqsave(&mz->lru_lock, flags);
739 * get page->cgroup and clear it under lock. 682 __mem_cgroup_remove_list(pc);
740 * force_empty can drop page->cgroup without checking refcnt. 683 spin_unlock_irqrestore(&mz->lru_lock, flags);
741 */ 684
685 page_assign_page_cgroup(page, NULL);
742 unlock_page_cgroup(page); 686 unlock_page_cgroup(page);
743 if (clear_page_cgroup(page, pc) == pc) { 687
744 mem = pc->mem_cgroup; 688 mem = pc->mem_cgroup;
745 css_put(&mem->css); 689 res_counter_uncharge(&mem->res, PAGE_SIZE);
746 res_counter_uncharge(&mem->res, PAGE_SIZE); 690 css_put(&mem->css);
747 spin_lock_irqsave(&mz->lru_lock, flags); 691
748 __mem_cgroup_remove_list(pc); 692 kfree(pc);
749 spin_unlock_irqrestore(&mz->lru_lock, flags); 693 return;
750 kfree(pc);
751 }
752 lock_page_cgroup(page);
753 } 694 }
754}
755 695
756void mem_cgroup_uncharge_page(struct page *page) 696unlock:
757{
758 lock_page_cgroup(page);
759 mem_cgroup_uncharge(page_get_page_cgroup(page));
760 unlock_page_cgroup(page); 697 unlock_page_cgroup(page);
761} 698}
762 699
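With every ref_cnt access now made under lock_page_cgroup(), mem_cgroup_uncharge_page() can use a plain int: drop one reference under the lock, and only when it reaches zero detach the page_cgroup, uncharge the counter and free it. A user-space sketch of that flow, again with illustrative names:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct pc { int ref_cnt; };

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct pc *page_pc;

    static void uncharge(void)
    {
        pthread_mutex_lock(&page_lock);
        struct pc *pc = page_pc;
        if (!pc) {                      /* never charged: nothing to do */
            pthread_mutex_unlock(&page_lock);
            return;
        }
        if (--pc->ref_cnt == 0) {
            page_pc = NULL;             /* detach while still locked */
            pthread_mutex_unlock(&page_lock);
            free(pc);                   /* "uncharge" and free outside the lock */
            return;
        }
        pthread_mutex_unlock(&page_lock);
    }

    int main(void)
    {
        page_pc = calloc(1, sizeof(*page_pc));
        page_pc->ref_cnt = 2;
        uncharge();
        printf("after first uncharge: pc %s\n", page_pc ? "kept" : "freed");
        uncharge();
        printf("after second uncharge: pc %s\n", page_pc ? "kept" : "freed");
        return 0;
    }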
@@ -764,63 +701,59 @@ void mem_cgroup_uncharge_page(struct page *page)
764 * Returns non-zero if a page (under migration) has valid page_cgroup member. 701 * Returns non-zero if a page (under migration) has valid page_cgroup member.
765 * Refcnt of page_cgroup is incremented. 702 * Refcnt of page_cgroup is incremented.
766 */ 703 */
767
768int mem_cgroup_prepare_migration(struct page *page) 704int mem_cgroup_prepare_migration(struct page *page)
769{ 705{
770 struct page_cgroup *pc; 706 struct page_cgroup *pc;
771 int ret = 0; 707
772 lock_page_cgroup(page); 708 lock_page_cgroup(page);
773 pc = page_get_page_cgroup(page); 709 pc = page_get_page_cgroup(page);
774 if (pc && atomic_inc_not_zero(&pc->ref_cnt)) 710 if (pc)
775 ret = 1; 711 pc->ref_cnt++;
776 unlock_page_cgroup(page); 712 unlock_page_cgroup(page);
777 return ret; 713 return pc != NULL;
778} 714}
779 715
780void mem_cgroup_end_migration(struct page *page) 716void mem_cgroup_end_migration(struct page *page)
781{ 717{
782 struct page_cgroup *pc; 718 mem_cgroup_uncharge_page(page);
783
784 lock_page_cgroup(page);
785 pc = page_get_page_cgroup(page);
786 mem_cgroup_uncharge(pc);
787 unlock_page_cgroup(page);
788} 719}
720
789/* 721/*
790 * We know both *page* and *newpage* are now not-on-LRU and Pg_locked. 722 * We know both *page* and *newpage* are now not-on-LRU and PG_locked.
791 * And no race with uncharge() routines because page_cgroup for *page* 723 * And no race with uncharge() routines because page_cgroup for *page*
792 * has extra one reference by mem_cgroup_prepare_migration. 724 * has extra one reference by mem_cgroup_prepare_migration.
793 */ 725 */
794
795void mem_cgroup_page_migration(struct page *page, struct page *newpage) 726void mem_cgroup_page_migration(struct page *page, struct page *newpage)
796{ 727{
797 struct page_cgroup *pc; 728 struct page_cgroup *pc;
798 struct mem_cgroup *mem;
799 unsigned long flags;
800 struct mem_cgroup_per_zone *mz; 729 struct mem_cgroup_per_zone *mz;
801retry: 730 unsigned long flags;
731
732 lock_page_cgroup(page);
802 pc = page_get_page_cgroup(page); 733 pc = page_get_page_cgroup(page);
803 if (!pc) 734 if (!pc) {
735 unlock_page_cgroup(page);
804 return; 736 return;
805 mem = pc->mem_cgroup; 737 }
738
806 mz = page_cgroup_zoneinfo(pc); 739 mz = page_cgroup_zoneinfo(pc);
807 if (clear_page_cgroup(page, pc) != pc)
808 goto retry;
809 spin_lock_irqsave(&mz->lru_lock, flags); 740 spin_lock_irqsave(&mz->lru_lock, flags);
810
811 __mem_cgroup_remove_list(pc); 741 __mem_cgroup_remove_list(pc);
812 spin_unlock_irqrestore(&mz->lru_lock, flags); 742 spin_unlock_irqrestore(&mz->lru_lock, flags);
813 743
744 page_assign_page_cgroup(page, NULL);
745 unlock_page_cgroup(page);
746
814 pc->page = newpage; 747 pc->page = newpage;
815 lock_page_cgroup(newpage); 748 lock_page_cgroup(newpage);
816 page_assign_page_cgroup(newpage, pc); 749 page_assign_page_cgroup(newpage, pc);
817 unlock_page_cgroup(newpage);
818 750
819 mz = page_cgroup_zoneinfo(pc); 751 mz = page_cgroup_zoneinfo(pc);
820 spin_lock_irqsave(&mz->lru_lock, flags); 752 spin_lock_irqsave(&mz->lru_lock, flags);
821 __mem_cgroup_add_list(pc); 753 __mem_cgroup_add_list(pc);
822 spin_unlock_irqrestore(&mz->lru_lock, flags); 754 spin_unlock_irqrestore(&mz->lru_lock, flags);
823 return; 755
756 unlock_page_cgroup(newpage);
824} 757}
825 758
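The migration pair works because mem_cgroup_prepare_migration() takes an extra reference, so the page_cgroup stays alive while mem_cgroup_page_migration() detaches it from the old page under that page's lock, repoints pc->page, and attaches it to the new page under the new page's lock. A user-space sketch of moving an accounting object between two per-page locks (the structures below are invented for illustration):

    #include <pthread.h>
    #include <stdio.h>

    struct pc;
    struct page {
        pthread_mutex_t lock;
        struct pc *pc;
    };
    struct pc {
        struct page *page;
        int ref_cnt;
    };

    static struct page oldpage = { PTHREAD_MUTEX_INITIALIZER, NULL };
    static struct page newpage = { PTHREAD_MUTEX_INITIALIZER, NULL };

    static void migrate_accounting(struct page *oldp, struct page *newp)
    {
        pthread_mutex_lock(&oldp->lock);
        struct pc *pc = oldp->pc;
        if (!pc) {
            pthread_mutex_unlock(&oldp->lock);
            return;
        }
        oldp->pc = NULL;                /* detach from the old page */
        pthread_mutex_unlock(&oldp->lock);

        pc->page = newp;                /* safe: we hold an extra reference */

        pthread_mutex_lock(&newp->lock);
        newp->pc = pc;                  /* attach to the new page */
        pthread_mutex_unlock(&newp->lock);
    }

    int main(void)
    {
        struct pc pc = { .page = &oldpage, .ref_cnt = 2 };  /* charge + migration ref */

        oldpage.pc = &pc;
        migrate_accounting(&oldpage, &newpage);
        printf("old has pc: %d, new has pc: %d\n",
               oldpage.pc != NULL, newpage.pc != NULL);
        return 0;
    }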
826/* 759/*
@@ -829,14 +762,13 @@ retry:
829 * *And* this routine doesn't reclaim page itself, just removes page_cgroup. 762 * *And* this routine doesn't reclaim page itself, just removes page_cgroup.
830 */ 763 */
831#define FORCE_UNCHARGE_BATCH (128) 764#define FORCE_UNCHARGE_BATCH (128)
832static void 765static void mem_cgroup_force_empty_list(struct mem_cgroup *mem,
833mem_cgroup_force_empty_list(struct mem_cgroup *mem,
834 struct mem_cgroup_per_zone *mz, 766 struct mem_cgroup_per_zone *mz,
835 int active) 767 int active)
836{ 768{
837 struct page_cgroup *pc; 769 struct page_cgroup *pc;
838 struct page *page; 770 struct page *page;
839 int count; 771 int count = FORCE_UNCHARGE_BATCH;
840 unsigned long flags; 772 unsigned long flags;
841 struct list_head *list; 773 struct list_head *list;
842 774
@@ -845,46 +777,36 @@ mem_cgroup_force_empty_list(struct mem_cgroup *mem,
845 else 777 else
846 list = &mz->inactive_list; 778 list = &mz->inactive_list;
847 779
848 if (list_empty(list))
849 return;
850retry:
851 count = FORCE_UNCHARGE_BATCH;
852 spin_lock_irqsave(&mz->lru_lock, flags); 780 spin_lock_irqsave(&mz->lru_lock, flags);
853 781 while (!list_empty(list)) {
854 while (--count && !list_empty(list)) {
855 pc = list_entry(list->prev, struct page_cgroup, lru); 782 pc = list_entry(list->prev, struct page_cgroup, lru);
856 page = pc->page; 783 page = pc->page;
857 /* Avoid race with charge */ 784 get_page(page);
858 atomic_set(&pc->ref_cnt, 0); 785 spin_unlock_irqrestore(&mz->lru_lock, flags);
859 if (clear_page_cgroup(page, pc) == pc) { 786 mem_cgroup_uncharge_page(page);
860 css_put(&mem->css); 787 put_page(page);
861 res_counter_uncharge(&mem->res, PAGE_SIZE); 788 if (--count <= 0) {
862 __mem_cgroup_remove_list(pc); 789 count = FORCE_UNCHARGE_BATCH;
863 kfree(pc); 790 cond_resched();
864 } else /* being uncharged ? ...do relax */ 791 }
865 break; 792 spin_lock_irqsave(&mz->lru_lock, flags);
866 } 793 }
867 spin_unlock_irqrestore(&mz->lru_lock, flags); 794 spin_unlock_irqrestore(&mz->lru_lock, flags);
868 if (!list_empty(list)) {
869 cond_resched();
870 goto retry;
871 }
872 return;
873} 795}
874 796
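mem_cgroup_force_empty_list() above drains a potentially huge LRU list, so it drops the lru lock around each uncharge and calls cond_resched() every FORCE_UNCHARGE_BATCH pages to stay preemption-friendly. A user-space sketch of that batched drain, with sched_yield() standing in for cond_resched() and a trivial linked list for the LRU:

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define FORCE_UNCHARGE_BATCH 128

    struct item { struct item *next; };

    static pthread_mutex_t lru_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *list_head;

    static void drain_list(void)
    {
        int count = FORCE_UNCHARGE_BATCH;

        pthread_mutex_lock(&lru_lock);
        while (list_head) {
            struct item *it = list_head;
            list_head = it->next;       /* "uncharge" = unlink */
            pthread_mutex_unlock(&lru_lock);
            free(it);                   /* the expensive part, unlocked */
            if (--count <= 0) {
                count = FORCE_UNCHARGE_BATCH;
                sched_yield();          /* cond_resched() analogue */
            }
            pthread_mutex_lock(&lru_lock);
        }
        pthread_mutex_unlock(&lru_lock);
    }

    int main(void)
    {
        for (int i = 0; i < 1000; i++) {
            struct item *it = malloc(sizeof(*it));
            it->next = list_head;
            list_head = it;
        }
        drain_list();
        printf("list empty: %d\n", list_head == NULL);
        return 0;
    }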
875/* 797/*
876 * make mem_cgroup's charge to be 0 if there is no task. 798 * make mem_cgroup's charge to be 0 if there is no task.
877 * This enables deleting this mem_cgroup. 799 * This enables deleting this mem_cgroup.
878 */ 800 */
879 801static int mem_cgroup_force_empty(struct mem_cgroup *mem)
880int mem_cgroup_force_empty(struct mem_cgroup *mem)
881{ 802{
882 int ret = -EBUSY; 803 int ret = -EBUSY;
883 int node, zid; 804 int node, zid;
805
884 css_get(&mem->css); 806 css_get(&mem->css);
885 /* 807 /*
886 * page reclaim code (kswapd etc..) will move pages between 808 * page reclaim code (kswapd etc..) will move pages between
887 * active_list <-> inactive_list while we don't take a lock. 809 * active_list <-> inactive_list while we don't take a lock.
888 * So, we have to do loop here until all lists are empty. 810 * So, we have to do loop here until all lists are empty.
889 */ 811 */
890 while (mem->res.usage > 0) { 812 while (mem->res.usage > 0) {
@@ -906,9 +828,7 @@ out:
906 return ret; 828 return ret;
907} 829}
908 830
909 831static int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
910
911int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
912{ 832{
913 *tmp = memparse(buf, &buf); 833 *tmp = memparse(buf, &buf);
914 if (*buf != '\0') 834 if (*buf != '\0')
@@ -945,8 +865,7 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
945 size_t nbytes, loff_t *ppos) 865 size_t nbytes, loff_t *ppos)
946{ 866{
947 struct mem_cgroup *mem = mem_cgroup_from_cont(cont); 867 struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
948 int ret; 868 int ret = mem_cgroup_force_empty(mem);
949 ret = mem_cgroup_force_empty(mem);
950 if (!ret) 869 if (!ret)
951 ret = nbytes; 870 ret = nbytes;
952 return ret; 871 return ret;
@@ -955,7 +874,6 @@ static ssize_t mem_force_empty_write(struct cgroup *cont,
955/* 874/*
956 * Note: This should be removed if cgroup supports write-only file. 875 * Note: This should be removed if cgroup supports write-only file.
957 */ 876 */
958
959static ssize_t mem_force_empty_read(struct cgroup *cont, 877static ssize_t mem_force_empty_read(struct cgroup *cont,
960 struct cftype *cft, 878 struct cftype *cft,
961 struct file *file, char __user *userbuf, 879 struct file *file, char __user *userbuf,
@@ -964,7 +882,6 @@ static ssize_t mem_force_empty_read(struct cgroup *cont,
964 return -EINVAL; 882 return -EINVAL;
965} 883}
966 884
967
968static const struct mem_cgroup_stat_desc { 885static const struct mem_cgroup_stat_desc {
969 const char *msg; 886 const char *msg;
970 u64 unit; 887 u64 unit;
@@ -1017,8 +934,6 @@ static int mem_control_stat_open(struct inode *unused, struct file *file)
1017 return single_open(file, mem_control_stat_show, cont); 934 return single_open(file, mem_control_stat_show, cont);
1018} 935}
1019 936
1020
1021
1022static struct cftype mem_cgroup_files[] = { 937static struct cftype mem_cgroup_files[] = {
1023 { 938 {
1024 .name = "usage_in_bytes", 939 .name = "usage_in_bytes",
@@ -1084,9 +999,6 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
1084 kfree(mem->info.nodeinfo[node]); 999 kfree(mem->info.nodeinfo[node]);
1085} 1000}
1086 1001
1087
1088static struct mem_cgroup init_mem_cgroup;
1089
1090static struct cgroup_subsys_state * 1002static struct cgroup_subsys_state *
1091mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) 1003mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
1092{ 1004{
@@ -1176,7 +1088,6 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
1176 1088
1177out: 1089out:
1178 mmput(mm); 1090 mmput(mm);
1179 return;
1180} 1091}
1181 1092
1182struct cgroup_subsys mem_cgroup_subsys = { 1093struct cgroup_subsys mem_cgroup_subsys = {