-rw-r--r--  include/linux/memcontrol.h |  20
-rw-r--r--  mm/filemap.c               |  12
-rw-r--r--  mm/memcontrol.c            | 166
-rw-r--r--  mm/memory.c                |  47
-rw-r--r--  mm/migrate.c               |   6
-rw-r--r--  mm/page_alloc.c            |   3
-rw-r--r--  mm/rmap.c                  |  17
-rw-r--r--  mm/swap_state.c            |  10
-rw-r--r--  mm/swapfile.c              |  41
9 files changed, 295 insertions(+), 27 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index 7d1f119c796e..f5b47efab48b 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -30,6 +30,13 @@ extern void mm_free_cgroup(struct mm_struct *mm);
 extern void page_assign_page_cgroup(struct page *page,
 					struct page_cgroup *pc);
 extern struct page_cgroup *page_get_page_cgroup(struct page *page);
+extern int mem_cgroup_charge(struct page *page, struct mm_struct *mm);
+extern void mem_cgroup_uncharge(struct page_cgroup *pc);
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+	mem_cgroup_uncharge(page_get_page_cgroup(page));
+}
 
 #else /* CONFIG_CGROUP_MEM_CONT */
 static inline void mm_init_cgroup(struct mm_struct *mm,
@@ -51,6 +58,19 @@ static inline struct page_cgroup *page_get_page_cgroup(struct page *page)
 	return NULL;
 }
 
+static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+}
+
+static inline void mem_cgroup_uncharge_page(struct page *page)
+{
+}
+
 #endif /* CONFIG_CGROUP_MEM_CONT */
 
 #endif /* _LINUX_MEMCONTROL_H */
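
The header above gives every call site one pattern to follow: charge the page before it becomes visible, and uncharge it on any failure path, with the stubs compiling the calls away when CONFIG_CGROUP_MEM_CONT is off. Below is a stand-alone sketch of that caller pattern; the functions here are toy stand-ins with simplified signatures, not the kernel API, and insert_into_cache() merely stands in for the radix_tree_insert() step in add_to_page_cache().

#include <stdio.h>

struct page { int charged; };

static long charged_pages;
static const long limit = 2;	/* toy stand-in for the cgroup's res_counter limit */

static int mem_cgroup_charge(struct page *page)
{
	if (charged_pages >= limit)
		return -1;	/* over the cgroup limit */
	charged_pages++;
	page->charged = 1;
	return 0;
}

static void mem_cgroup_uncharge_page(struct page *page)
{
	if (page->charged) {
		charged_pages--;
		page->charged = 0;
	}
}

/* Placeholder for the real insertion step (e.g. radix_tree_insert()). */
static int insert_into_cache(struct page *page, int should_fail)
{
	(void)page;
	return should_fail ? -1 : 0;
}

/* Mirrors the add_to_page_cache() flow: charge, insert, roll back on error. */
static int add_page(struct page *page, int insert_fails)
{
	int error = mem_cgroup_charge(page);

	if (error)
		return error;		/* nothing to undo yet */
	error = insert_into_cache(page, insert_fails);
	if (error)
		mem_cgroup_uncharge_page(page);	/* undo the charge */
	return error;
}

int main(void)
{
	struct page a = { 0 }, b = { 0 };

	printf("successful insert: %d, pages charged: %ld\n",
	       add_page(&a, 0), charged_pages);
	printf("failed insert:     %d, pages charged: %ld\n",
	       add_page(&b, 1), charged_pages);
	return 0;
}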
diff --git a/mm/filemap.c b/mm/filemap.c
index 81fb9bff0d4f..b7a01e927953 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -33,6 +33,7 @@
 #include <linux/syscalls.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 /*
@@ -118,6 +119,7 @@ void __remove_from_page_cache(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&mapping->page_tree, page->index);
 	page->mapping = NULL;
 	mapping->nrpages--;
@@ -461,6 +463,11 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 	int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
 
 	if (error == 0) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&mapping->tree_lock);
 		error = radix_tree_insert(&mapping->page_tree, offset, page);
 		if (!error) {
@@ -470,10 +477,13 @@ int add_to_page_cache(struct page *page, struct address_space *mapping,
 			page->index = offset;
 			mapping->nrpages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
-		}
+		} else
+			mem_cgroup_uncharge_page(page);
+
 		write_unlock_irq(&mapping->tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 EXPORT_SYMBOL(add_to_page_cache);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4d4805eb37c7..ebca767292dc 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,9 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/page-flags.h>
+#include <linux/bit_spinlock.h>
+#include <linux/rcupdate.h>
 
 struct cgroup_subsys mem_cgroup_subsys;
 
@@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys;
  * to help the administrator determine what knobs to tune.
  *
  * TODO: Add a water mark for the memory controller. Reclaim will begin when
- * we hit the water mark.
+ * we hit the water mark. Maybe even add a low water mark, such that
+ * no reclaim occurs from a cgroup at its low water mark; this is
+ * a feature that will be implemented much later in the future.
  */
 struct mem_cgroup {
 	struct cgroup_subsys_state css;
@@ -49,6 +54,14 @@ struct mem_cgroup {
 };
 
 /*
+ * We use the lower bit of the page->page_cgroup pointer as a bit spin
+ * lock. We need to ensure that page->page_cgroup is at least two
+ * byte aligned (based on comments from Nick Piggin)
+ */
+#define PAGE_CGROUP_LOCK_BIT	0x0
+#define PAGE_CGROUP_LOCK	(1 << PAGE_CGROUP_LOCK_BIT)
+
+/*
  * A page_cgroup page is associated with every page descriptor. The
  * page_cgroup helps us identify information about the cgroup
  */
@@ -56,6 +69,8 @@ struct page_cgroup {
 	struct list_head lru;		/* per cgroup LRU list */
 	struct page *page;
 	struct mem_cgroup *mem_cgroup;
+	atomic_t ref_cnt;		/* Helpful when pages move b/w  */
+					/* mapped and cached states     */
 };
 
 
@@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm)
 	css_put(&mm->mem_cgroup->css);
 }
 
+static inline int page_cgroup_locked(struct page *page)
+{
+	return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
+					&page->page_cgroup);
+}
+
 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 {
-	page->page_cgroup = (unsigned long)pc;
+	int locked;
+
+	/*
+	 * While resetting the page_cgroup we might not hold the
+	 * page_cgroup lock. free_hot_cold_page() is an example
+	 * of such a scenario
+	 */
+	if (pc)
+		VM_BUG_ON(!page_cgroup_locked(page));
+	locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
+	page->page_cgroup = ((unsigned long)pc | locked);
 }
 
 struct page_cgroup *page_get_page_cgroup(struct page *page)
 {
-	return page->page_cgroup;
+	return (struct page_cgroup *)
+		(page->page_cgroup & ~PAGE_CGROUP_LOCK);
+}
+
+void __always_inline lock_page_cgroup(struct page *page)
+{
+	bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+	VM_BUG_ON(!page_cgroup_locked(page));
+}
+
+void __always_inline unlock_page_cgroup(struct page *page)
+{
+	bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
+}
+
+/*
+ * Charge the memory controller for page usage.
+ * Return
+ * 0 if the charge was successful
+ * < 0 if the cgroup is over its limit
+ */
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
+{
+	struct mem_cgroup *mem;
+	struct page_cgroup *pc, *race_pc;
+
+	/*
+	 * Should page_cgroup's go to their own slab?
+	 * One could optimize the performance of the charging routine
+	 * by saving a bit in the page_flags and using it as a lock
+	 * to see if the cgroup page already has a page_cgroup associated
+	 * with it
+	 */
+	lock_page_cgroup(page);
+	pc = page_get_page_cgroup(page);
+	/*
+	 * The page_cgroup exists and the page has already been accounted
+	 */
+	if (pc) {
+		atomic_inc(&pc->ref_cnt);
+		goto done;
+	}
+
+	unlock_page_cgroup(page);
+
+	pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
+	if (pc == NULL)
+		goto err;
+
+	rcu_read_lock();
+	/*
+	 * We always charge the cgroup the mm_struct belongs to;
+	 * the mm_struct's mem_cgroup changes on task migration if the
+	 * thread group leader migrates. It's possible that mm is not
+	 * set, if so charge the init_mm (happens for pagecache usage).
+	 */
+	if (!mm)
+		mm = &init_mm;
+
+	mem = rcu_dereference(mm->mem_cgroup);
+	/*
+	 * For every charge from the cgroup, increment reference
+	 * count
+	 */
+	css_get(&mem->css);
+	rcu_read_unlock();
+
+	/*
+	 * If we created the page_cgroup, we should free it on exceeding
+	 * the cgroup limit.
+	 */
+	if (res_counter_charge(&mem->res, 1)) {
+		css_put(&mem->css);
+		goto free_pc;
+	}
+
+	lock_page_cgroup(page);
+	/*
+	 * Check if somebody else beat us to allocating the page_cgroup
+	 */
+	race_pc = page_get_page_cgroup(page);
+	if (race_pc) {
+		kfree(pc);
+		pc = race_pc;
+		atomic_inc(&pc->ref_cnt);
+		res_counter_uncharge(&mem->res, 1);
+		css_put(&mem->css);
+		goto done;
+	}
+
+	atomic_set(&pc->ref_cnt, 1);
+	pc->mem_cgroup = mem;
+	pc->page = page;
+	page_assign_page_cgroup(page, pc);
+
+done:
+	unlock_page_cgroup(page);
+	return 0;
+free_pc:
+	kfree(pc);
+	return -ENOMEM;
+err:
+	unlock_page_cgroup(page);
+	return -ENOMEM;
+}
+
+/*
+ * Uncharging is always a welcome operation, we never complain, simply
+ * uncharge.
+ */
+void mem_cgroup_uncharge(struct page_cgroup *pc)
+{
+	struct mem_cgroup *mem;
+	struct page *page;
+
+	if (!pc)
+		return;
+
+	if (atomic_dec_and_test(&pc->ref_cnt)) {
+		page = pc->page;
+		lock_page_cgroup(page);
+		mem = pc->mem_cgroup;
+		css_put(&mem->css);
+		page_assign_page_cgroup(page, NULL);
+		unlock_page_cgroup(page);
+		res_counter_uncharge(&mem->res, 1);
+		kfree(pc);
+	}
 }
 
 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
@@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 		return NULL;
 
 	res_counter_init(&mem->res);
+	INIT_LIST_HEAD(&mem->active_list);
+	INIT_LIST_HEAD(&mem->inactive_list);
 	return &mem->css;
 }
 
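The charging code above keeps the lock and the pointer in one word: bit 0 of page->page_cgroup is taken by bit_spin_lock(), so readers must mask it off and writers must preserve it. Below is a user-space model of just that encoding, assuming (as the comment does) at least 2-byte-aligned allocations; the lock/unlock helpers here only flip the bit and do none of the real spinning, so they are illustrations rather than the kernel primitives.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_CGROUP_LOCK_BIT	0
#define PAGE_CGROUP_LOCK	(1UL << PAGE_CGROUP_LOCK_BIT)

struct page_cgroup { int ref_cnt; };
struct page { unsigned long page_cgroup; };

static struct page_cgroup *page_get_page_cgroup(struct page *page)
{
	/* Strip the lock bit before treating the word as a pointer. */
	return (struct page_cgroup *)(page->page_cgroup & ~PAGE_CGROUP_LOCK);
}

static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
{
	/* Preserve the lock bit across the assignment. */
	unsigned long locked = page->page_cgroup & PAGE_CGROUP_LOCK;

	page->page_cgroup = (unsigned long)pc | locked;
}

static void lock_page_cgroup(struct page *page)
{
	page->page_cgroup |= PAGE_CGROUP_LOCK;	/* bit_spin_lock() in the kernel */
}

static void unlock_page_cgroup(struct page *page)
{
	page->page_cgroup &= ~PAGE_CGROUP_LOCK;	/* bit_spin_unlock() in the kernel */
}

int main(void)
{
	struct page page = { .page_cgroup = 0 };
	struct page_cgroup *pc = malloc(sizeof(*pc));

	assert(((uintptr_t)pc & 1) == 0);	/* alignment keeps bit 0 free */

	lock_page_cgroup(&page);
	page_assign_page_cgroup(&page, pc);
	unlock_page_cgroup(&page);

	printf("stored pointer survives the lock bit: %d\n",
	       page_get_page_cgroup(&page) == pc);
	free(pc);
	return 0;
}
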
diff --git a/mm/memory.c b/mm/memory.c
index 9d073fa0a2d0..0ba224ea6ba4 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -50,6 +50,7 @@
 #include <linux/delayacct.h>
 #include <linux/init.h>
 #include <linux/writeback.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgalloc.h>
 #include <asm/uaccess.h>
@@ -1144,16 +1145,20 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 {
 	int retval;
 	pte_t *pte;
 	spinlock_t *ptl;
+
+	retval = mem_cgroup_charge(page, mm);
+	if (retval)
+		goto out;
 
 	retval = -EINVAL;
 	if (PageAnon(page))
-		goto out;
+		goto out_uncharge;
 	retval = -ENOMEM;
 	flush_dcache_page(page);
 	pte = get_locked_pte(mm, addr, &ptl);
 	if (!pte)
-		goto out;
+		goto out_uncharge;
 	retval = -EBUSY;
 	if (!pte_none(*pte))
 		goto out_unlock;
@@ -1165,8 +1170,12 @@ static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *pa
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
+	pte_unmap_unlock(pte, ptl);
+	return retval;
 out_unlock:
 	pte_unmap_unlock(pte, ptl);
+out_uncharge:
+	mem_cgroup_uncharge_page(page);
 out:
 	return retval;
 }
@@ -1641,6 +1650,9 @@ gotten:
 	cow_user_page(new_page, old_page, address, vma);
 	__SetPageUptodate(new_page);
 
+	if (mem_cgroup_charge(new_page, mm))
+		goto oom_free_new;
+
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
@@ -1672,7 +1684,9 @@ gotten:
 		/* Free the old page.. */
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
-	}
+	} else
+		mem_cgroup_uncharge_page(new_page);
+
 	if (new_page)
 		page_cache_release(new_page);
 	if (old_page)
@@ -1696,6 +1710,8 @@ unlock:
 		put_page(dirty_page);
 	}
 	return ret;
+oom_free_new:
+	__free_page(new_page);
 oom:
 	if (old_page)
 		page_cache_release(old_page);
@@ -2036,6 +2052,12 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		count_vm_event(PGMAJFAULT);
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	mark_page_accessed(page);
 	lock_page(page);
 	delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
@@ -2073,8 +2095,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (write_access) {
 		/* XXX: We could OR the do_wp_page code with this one? */
 		if (do_wp_page(mm, vma, address,
-				page_table, pmd, ptl, pte) & VM_FAULT_OOM)
+				page_table, pmd, ptl, pte) & VM_FAULT_OOM) {
+			mem_cgroup_uncharge_page(page);
 			ret = VM_FAULT_OOM;
+		}
 		goto out;
 	}
 
@@ -2085,6 +2109,7 @@ unlock:
 out:
 	return ret;
 out_nomap:
+	mem_cgroup_uncharge_page(page);
 	pte_unmap_unlock(page_table, ptl);
 	unlock_page(page);
 	page_cache_release(page);
@@ -2114,6 +2139,9 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto oom;
 	__SetPageUptodate(page);
 
+	if (mem_cgroup_charge(page, mm))
+		goto oom_free_page;
+
 	entry = mk_pte(page, vma->vm_page_prot);
 	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
@@ -2131,8 +2159,11 @@ unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
 release:
+	mem_cgroup_uncharge_page(page);
 	page_cache_release(page);
 	goto unlock;
+oom_free_page:
+	__free_page(page);
 oom:
 	return VM_FAULT_OOM;
 }
@@ -2246,6 +2277,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	}
 
+	if (mem_cgroup_charge(page, mm)) {
+		ret = VM_FAULT_OOM;
+		goto out;
+	}
+
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
 	/*
@@ -2281,6 +2317,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* no need to invalidate: a not-present page won't be cached */
 		update_mmu_cache(vma, address, entry);
 	} else {
+		mem_cgroup_uncharge_page(page);
 		if (anon)
 			page_cache_release(page);
 		else
diff --git a/mm/migrate.c b/mm/migrate.c
index 857a987e3690..417bbda14e5b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -29,6 +29,7 @@
 #include <linux/mempolicy.h>
 #include <linux/vmalloc.h>
 #include <linux/security.h>
+#include <linux/memcontrol.h>
 
 #include "internal.h"
 
@@ -152,6 +153,11 @@ static void remove_migration_pte(struct vm_area_struct *vma,
 		return;
 	}
 
+	if (mem_cgroup_charge(new, mm)) {
+		pte_unmap(ptep);
+		return;
+	}
+
 	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	pte = *ptep;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 37576b822f06..26a54a17dc9f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -43,6 +43,7 @@
 #include <linux/backing-dev.h>
 #include <linux/fault-inject.h>
 #include <linux/page-isolation.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -987,6 +988,7 @@ static void free_hot_cold_page(struct page *page, int cold)
 
 	if (!PageHighMem(page))
 		debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+	VM_BUG_ON(page_get_page_cgroup(page));
 	arch_free_page(page, 0);
 	kernel_map_pages(page, 1, 0);
 
@@ -2525,6 +2527,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
 		set_page_links(page, zone, nid, pfn);
 		init_page_count(page);
 		reset_page_mapcount(page);
+		page_assign_page_cgroup(page, NULL);
 		SetPageReserved(page);
 
 		/*
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -48,6 +48,7 @@
 #include <linux/rcupdate.h>
 #include <linux/module.h>
 #include <linux/kallsyms.h>
+#include <linux/memcontrol.h>
 
 #include <asm/tlbflush.h>
 
@@ -554,8 +555,14 @@ void page_add_anon_rmap(struct page *page,
 	VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end);
 	if (atomic_inc_and_test(&page->_mapcount))
 		__page_set_anon_rmap(page, vma, address);
-	else
+	else {
 		__page_check_anon_rmap(page, vma, address);
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
+	}
 }
 
 /*
@@ -586,6 +593,12 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount))
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
+	else
+		/*
+		 * We unconditionally charged during prepare, we uncharge here
+		 * This takes care of balancing the reference counts
+		 */
+		mem_cgroup_uncharge_page(page);
 }
 
 #ifdef CONFIG_DEBUG_VM
@@ -646,6 +659,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma)
 			page_clear_dirty(page);
 			set_page_dirty(page);
 		}
+		mem_cgroup_uncharge_page(page);
+
 		__dec_zone_page_state(page,
 			PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED);
 	}
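
The rmap hooks above are what keep the charge accounting balanced: every fault path charges before it knows whether the page is newly mapped, the already-mapped case drops the extra reference here, and the final unmap drops the last one. Below is a toy model of only that arithmetic, with no locking and no kernel types; charge() and uncharge() are stand-ins for mem_cgroup_charge()/mem_cgroup_uncharge_page().

#include <stdio.h>

static int ref_cnt;		/* stand-in for pc->ref_cnt        */
static int accounted_pages;	/* stand-in for the res_counter    */

static void charge(void)
{
	if (ref_cnt++ == 0)
		accounted_pages++;	/* first charge creates the accounting */
}

static void uncharge(void)
{
	if (--ref_cnt == 0)
		accounted_pages--;	/* last reference frees the accounting */
}

int main(void)
{
	charge();	/* first fault maps the page                          */
	charge();	/* a second mapping also charges up front             */
	uncharge();	/* page_add_*_rmap(): already mapped, drop the extra  */
	uncharge();	/* page_remove_rmap() on the last unmap               */
	printf("ref_cnt=%d accounted=%d\n", ref_cnt, accounted_pages);
	return 0;
}
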
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ec42f01a8d02..f96e3ff1e791 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -17,6 +17,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/migrate.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 
@@ -76,6 +77,11 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 	BUG_ON(PagePrivate(page));
 	error = radix_tree_preload(gfp_mask);
 	if (!error) {
+
+		error = mem_cgroup_charge(page, current->mm);
+		if (error)
+			goto out;
+
 		write_lock_irq(&swapper_space.tree_lock);
 		error = radix_tree_insert(&swapper_space.page_tree,
 						entry.val, page);
@@ -86,10 +92,13 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
 			total_swapcache_pages++;
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			INC_CACHE_INFO(add_total);
+		} else {
+			mem_cgroup_uncharge_page(page);
 		}
 		write_unlock_irq(&swapper_space.tree_lock);
 		radix_tree_preload_end();
 	}
+out:
 	return error;
 }
 
@@ -104,6 +113,7 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
 
+	mem_cgroup_uncharge_page(page);
 	radix_tree_delete(&swapper_space.page_tree, page_private(page));
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index afae7b1f680b..fddc4cc4149b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -27,6 +27,7 @@
 #include <linux/mutex.h>
 #include <linux/capability.h>
 #include <linux/syscalls.h>
+#include <linux/memcontrol.h>
 
 #include <asm/pgtable.h>
 #include <asm/tlbflush.h>
@@ -506,9 +507,12 @@ unsigned int count_swap_pages(int type, int free)
  * just let do_wp_page work it out if a write is requested later - to
  * force COW, vm_page_prot omits write permission from any private vma.
  */
-static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
+static int unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 		unsigned long addr, swp_entry_t entry, struct page *page)
 {
+	if (mem_cgroup_charge(page, vma->vm_mm))
+		return -ENOMEM;
+
 	inc_mm_counter(vma->vm_mm, anon_rss);
 	get_page(page);
 	set_pte_at(vma->vm_mm, addr, pte,
@@ -520,6 +524,7 @@ static void unuse_pte(struct vm_area_struct *vma, pte_t *pte,
 	 * immediately swapped out again after swapon.
 	 */
 	activate_page(page);
+	return 1;
 }
 
 static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
@@ -529,7 +534,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	pte_t swp_pte = swp_entry_to_pte(entry);
 	pte_t *pte;
 	spinlock_t *ptl;
-	int found = 0;
+	int ret = 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
@@ -538,13 +543,12 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		 * Test inline before going to call unuse_pte.
 		 */
 		if (unlikely(pte_same(*pte, swp_pte))) {
-			unuse_pte(vma, pte++, addr, entry, page);
-			found = 1;
+			ret = unuse_pte(vma, pte++, addr, entry, page);
 			break;
 		}
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 	pte_unmap_unlock(pte - 1, ptl);
-	return found;
+	return ret;
 }
 
 static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
@@ -553,14 +557,16 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 {
 	pmd_t *pmd;
 	unsigned long next;
+	int ret;
 
 	pmd = pmd_offset(pud, addr);
 	do {
 		next = pmd_addr_end(addr, end);
 		if (pmd_none_or_clear_bad(pmd))
 			continue;
-		if (unuse_pte_range(vma, pmd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pmd++, addr = next, addr != end);
 	return 0;
 }
@@ -571,14 +577,16 @@ static inline int unuse_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 {
 	pud_t *pud;
 	unsigned long next;
+	int ret;
 
 	pud = pud_offset(pgd, addr);
 	do {
 		next = pud_addr_end(addr, end);
 		if (pud_none_or_clear_bad(pud))
 			continue;
-		if (unuse_pmd_range(vma, pud, addr, next, entry, page))
-			return 1;
+		ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pud++, addr = next, addr != end);
 	return 0;
 }
@@ -588,6 +596,7 @@ static int unuse_vma(struct vm_area_struct *vma,
 {
 	pgd_t *pgd;
 	unsigned long addr, end, next;
+	int ret;
 
 	if (page->mapping) {
 		addr = page_address_in_vma(page, vma);
@@ -605,8 +614,9 @@ static int unuse_vma(struct vm_area_struct *vma,
 		next = pgd_addr_end(addr, end);
 		if (pgd_none_or_clear_bad(pgd))
 			continue;
-		if (unuse_pud_range(vma, pgd, addr, next, entry, page))
-			return 1;
+		ret = unuse_pud_range(vma, pgd, addr, next, entry, page);
+		if (ret)
+			return ret;
 	} while (pgd++, addr = next, addr != end);
 	return 0;
 }
@@ -615,6 +625,7 @@ static int unuse_mm(struct mm_struct *mm,
 	swp_entry_t entry, struct page *page)
 {
 	struct vm_area_struct *vma;
+	int ret = 0;
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		/*
@@ -627,15 +638,11 @@ static int unuse_mm(struct mm_struct *mm,
 		lock_page(page);
 	}
 	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-		if (vma->anon_vma && unuse_vma(vma, entry, page))
+		if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
 			break;
 	}
 	up_read(&mm->mmap_sem);
-	/*
-	 * Currently unuse_mm cannot fail, but leave error handling
-	 * at call sites for now, since we change it from time to time.
-	 */
-	return 0;
+	return (ret < 0)? ret: 0;
 }
 
 /*
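
The swapoff changes above make unuse_pte() the error source: it now returns 1 when it instantiated the page, -ENOMEM when the charge failed, and 0 when the entry was not found, while the page-table walkers pass any non-zero value straight up and unuse_mm() folds the positive "handled" value back to 0. Below is a minimal sketch of that plumbing with stand-in function names, not the kernel routines themselves.

#include <errno.h>
#include <stdio.h>

static int leaf(int outcome)
{
	return outcome;			/* 1, 0 or -ENOMEM, like unuse_pte() */
}

static int walk(int outcome)
{
	int ret = leaf(outcome);	/* like unuse_pte_range()/unuse_pmd_range() */

	if (ret)
		return ret;		/* stop the walk on success or error */
	return 0;
}

static int top(int outcome)
{
	int ret = walk(outcome);	/* like unuse_mm() */

	return ret < 0 ? ret : 0;	/* callers only care about errors */
}

int main(void)
{
	printf("handled:   %d\n", top(1));
	printf("not found: %d\n", top(0));
	printf("enomem:    %d\n", top(-ENOMEM));
	return 0;
}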
