diff options
Diffstat (limited to 'mm/memcontrol.c')
| -rw-r--r-- | mm/memcontrol.c | 166 |
1 files changed, 163 insertions, 3 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4d4805eb37c7..ebca767292dc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -21,6 +21,9 @@ | |||
| 21 | #include <linux/memcontrol.h> | 21 | #include <linux/memcontrol.h> |
| 22 | #include <linux/cgroup.h> | 22 | #include <linux/cgroup.h> |
| 23 | #include <linux/mm.h> | 23 | #include <linux/mm.h> |
| 24 | #include <linux/page-flags.h> | ||
| 25 | #include <linux/bit_spinlock.h> | ||
| 26 | #include <linux/rcupdate.h> | ||
| 24 | 27 | ||
| 25 | struct cgroup_subsys mem_cgroup_subsys; | 28 | struct cgroup_subsys mem_cgroup_subsys; |
| 26 | 29 | ||
| @@ -31,7 +34,9 @@ struct cgroup_subsys mem_cgroup_subsys; | |||
| 31 | * to help the administrator determine what knobs to tune. | 34 | * to help the administrator determine what knobs to tune. |
| 32 | * | 35 | * |
| 33 | * TODO: Add a water mark for the memory controller. Reclaim will begin when | 36 | * TODO: Add a water mark for the memory controller. Reclaim will begin when |
| 34 | * we hit the water mark. | 37 | * we hit the water mark. Maybe even add a low water mark, such that |
| 38 | * no reclaim occurs from a cgroup at its low water mark; this is | ||
| 39 | * a feature that will be implemented much later in the future. | ||
| 35 | */ | 40 | */ |
| 36 | struct mem_cgroup { | 41 | struct mem_cgroup { |
| 37 | struct cgroup_subsys_state css; | 42 | struct cgroup_subsys_state css; |
| @@ -49,6 +54,14 @@ struct mem_cgroup { | |||
| 49 | }; | 54 | }; |
| 50 | 55 | ||
| 51 | /* | 56 | /* |
| 57 | * We use the lower bit of the page->page_cgroup pointer as a bit spin | ||
| 58 | * lock. We need to ensure that page->page_cgroup is at least two | ||
| 59 | * byte aligned (based on comments from Nick Piggin) | ||
| 60 | */ | ||
| 61 | #define PAGE_CGROUP_LOCK_BIT 0x0 | ||
| 62 | #define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) | ||
| 63 | |||
| 64 | /* | ||
| 52 | * A page_cgroup page is associated with every page descriptor. The | 65 | * A page_cgroup page is associated with every page descriptor. The |
| 53 | * page_cgroup helps us identify information about the cgroup | 66 | * page_cgroup helps us identify information about the cgroup |
| 54 | */ | 67 | */ |
| @@ -56,6 +69,8 @@ struct page_cgroup { | |||
| 56 | struct list_head lru; /* per cgroup LRU list */ | 69 | struct list_head lru; /* per cgroup LRU list */ |
| 57 | struct page *page; | 70 | struct page *page; |
| 58 | struct mem_cgroup *mem_cgroup; | 71 | struct mem_cgroup *mem_cgroup; |
| 72 | atomic_t ref_cnt; /* Helpful when pages move b/w */ | ||
| 73 | /* mapped and cached states */ | ||
| 59 | }; | 74 | }; |
| 60 | 75 | ||
| 61 | 76 | ||
| @@ -88,14 +103,157 @@ void mm_free_cgroup(struct mm_struct *mm) | |||
| 88 | css_put(&mm->mem_cgroup->css); | 103 | css_put(&mm->mem_cgroup->css); |
| 89 | } | 104 | } |
| 90 | 105 | ||
| 106 | static inline int page_cgroup_locked(struct page *page) | ||
| 107 | { | ||
| 108 | return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, | ||
| 109 | &page->page_cgroup); | ||
| 110 | } | ||
| 111 | |||
| 91 | void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) | 112 | void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) |
| 92 | { | 113 | { |
| 93 | page->page_cgroup = (unsigned long)pc; | 114 | int locked; |
| 115 | |||
| 116 | /* | ||
| 117 | * While resetting the page_cgroup we might not hold the | ||
| 118 | * page_cgroup lock. free_hot_cold_page() is an example | ||
| 119 | * of such a scenario | ||
| 120 | */ | ||
| 121 | if (pc) | ||
| 122 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
| 123 | locked = (page->page_cgroup & PAGE_CGROUP_LOCK); | ||
| 124 | page->page_cgroup = ((unsigned long)pc | locked); | ||
| 94 | } | 125 | } |
| 95 | 126 | ||
| 96 | struct page_cgroup *page_get_page_cgroup(struct page *page) | 127 | struct page_cgroup *page_get_page_cgroup(struct page *page) |
| 97 | { | 128 | { |
| 98 | return page->page_cgroup; | 129 | return (struct page_cgroup *) |
| 130 | (page->page_cgroup & ~PAGE_CGROUP_LOCK); | ||
| 131 | } | ||
| 132 | |||
| 133 | void __always_inline lock_page_cgroup(struct page *page) | ||
| 134 | { | ||
| 135 | bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 136 | VM_BUG_ON(!page_cgroup_locked(page)); | ||
| 137 | } | ||
| 138 | |||
| 139 | void __always_inline unlock_page_cgroup(struct page *page) | ||
| 140 | { | ||
| 141 | bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); | ||
| 142 | } | ||
| 143 | |||
| 144 | /* | ||
| 145 | * Charge the memory controller for page usage. | ||
| 146 | * Return | ||
| 147 | * 0 if the charge was successful | ||
| 148 | * < 0 if the cgroup is over its limit | ||
| 149 | */ | ||
| 150 | int mem_cgroup_charge(struct page *page, struct mm_struct *mm) | ||
| 151 | { | ||
| 152 | struct mem_cgroup *mem; | ||
| 153 | struct page_cgroup *pc, *race_pc; | ||
| 154 | |||
| 155 | /* | ||
| 156 | * Should page_cgroup's go to their own slab? | ||
| 157 | * One could optimize the performance of the charging routine | ||
| 158 | * by saving a bit in the page_flags and using it as a lock | ||
| 159 | * to see if the cgroup page already has a page_cgroup associated | ||
| 160 | * with it | ||
| 161 | */ | ||
| 162 | lock_page_cgroup(page); | ||
| 163 | pc = page_get_page_cgroup(page); | ||
| 164 | /* | ||
| 165 | * The page_cgroup exists and the page has already been accounted | ||
| 166 | */ | ||
| 167 | if (pc) { | ||
| 168 | atomic_inc(&pc->ref_cnt); | ||
| 169 | goto done; | ||
| 170 | } | ||
| 171 | |||
| 172 | unlock_page_cgroup(page); | ||
| 173 | |||
| 174 | pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL); | ||
| 175 | if (pc == NULL) | ||
| 176 | goto err; | ||
| 177 | |||
| 178 | rcu_read_lock(); | ||
| 179 | /* | ||
| 180 | * We always charge the cgroup the mm_struct belongs to | ||
| 181 | * the mm_struct's mem_cgroup changes on task migration if the | ||
| 182 | * thread group leader migrates. It's possible that mm is not | ||
| 183 | * set, if so charge the init_mm (happens for pagecache usage). | ||
| 184 | */ | ||
| 185 | if (!mm) | ||
| 186 | mm = &init_mm; | ||
| 187 | |||
| 188 | mem = rcu_dereference(mm->mem_cgroup); | ||
| 189 | /* | ||
| 190 | * For every charge from the cgroup, increment reference | ||
| 191 | * count | ||
| 192 | */ | ||
| 193 | css_get(&mem->css); | ||
| 194 | rcu_read_unlock(); | ||
| 195 | |||
| 196 | /* | ||
| 197 | * If we created the page_cgroup, we should free it on exceeding | ||
| 198 | * the cgroup limit. | ||
| 199 | */ | ||
| 200 | if (res_counter_charge(&mem->res, 1)) { | ||
| 201 | css_put(&mem->css); | ||
| 202 | goto free_pc; | ||
| 203 | } | ||
| 204 | |||
| 205 | lock_page_cgroup(page); | ||
| 206 | /* | ||
| 207 | * Check if somebody else beat us to allocating the page_cgroup | ||
| 208 | */ | ||
| 209 | race_pc = page_get_page_cgroup(page); | ||
| 210 | if (race_pc) { | ||
| 211 | kfree(pc); | ||
| 212 | pc = race_pc; | ||
| 213 | atomic_inc(&pc->ref_cnt); | ||
| 214 | res_counter_uncharge(&mem->res, 1); | ||
| 215 | css_put(&mem->css); | ||
| 216 | goto done; | ||
| 217 | } | ||
| 218 | |||
| 219 | atomic_set(&pc->ref_cnt, 1); | ||
| 220 | pc->mem_cgroup = mem; | ||
| 221 | pc->page = page; | ||
| 222 | page_assign_page_cgroup(page, pc); | ||
| 223 | |||
| 224 | done: | ||
| 225 | unlock_page_cgroup(page); | ||
| 226 | return 0; | ||
| 227 | free_pc: | ||
| 228 | kfree(pc); | ||
| 229 | return -ENOMEM; | ||
| 230 | err: | ||
| 231 | unlock_page_cgroup(page); | ||
| 232 | return -ENOMEM; | ||
| 233 | } | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Uncharging is always a welcome operation, we never complain, simply | ||
| 237 | * uncharge. | ||
| 238 | */ | ||
| 239 | void mem_cgroup_uncharge(struct page_cgroup *pc) | ||
| 240 | { | ||
| 241 | struct mem_cgroup *mem; | ||
| 242 | struct page *page; | ||
| 243 | |||
| 244 | if (!pc) | ||
| 245 | return; | ||
| 246 | |||
| 247 | if (atomic_dec_and_test(&pc->ref_cnt)) { | ||
| 248 | page = pc->page; | ||
| 249 | lock_page_cgroup(page); | ||
| 250 | mem = pc->mem_cgroup; | ||
| 251 | css_put(&mem->css); | ||
| 252 | page_assign_page_cgroup(page, NULL); | ||
| 253 | unlock_page_cgroup(page); | ||
| 254 | res_counter_uncharge(&mem->res, 1); | ||
| 255 | kfree(pc); | ||
| 256 | } | ||
| 99 | } | 257 | } |
| 100 | 258 | ||
| 101 | static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, | 259 | static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft, |
| @@ -150,6 +308,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
| 150 | return NULL; | 308 | return NULL; |
| 151 | 309 | ||
| 152 | res_counter_init(&mem->res); | 310 | res_counter_init(&mem->res); |
| 311 | INIT_LIST_HEAD(&mem->active_list); | ||
| 312 | INIT_LIST_HEAD(&mem->inactive_list); | ||
| 153 | return &mem->css; | 313 | return &mem->css; |
| 154 | } | 314 | } |
| 155 | 315 | ||
