Diffstat (limited to 'mm')
-rw-r--r--  mm/filemap.c     |  37
-rw-r--r--  mm/memcontrol.c  | 689
-rw-r--r--  mm/migrate.c     |   2
-rw-r--r--  mm/oom_kill.c    |   5
-rw-r--r--  mm/page_alloc.c  |  50
-rw-r--r--  mm/shmem.c       | 109
-rw-r--r--  mm/slab.c        |  47
-rw-r--r--  mm/slub.c        |  33
-rw-r--r--  mm/swap.c        |   1
-rw-r--r--  mm/truncate.c    |  10
10 files changed, 705 insertions(+), 278 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 35e12d186566..20e5642e9f9f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -151,6 +151,7 @@ void remove_from_page_cache(struct page *page) | |||
151 | spin_unlock_irq(&mapping->tree_lock); | 151 | spin_unlock_irq(&mapping->tree_lock); |
152 | mem_cgroup_uncharge_cache_page(page); | 152 | mem_cgroup_uncharge_cache_page(page); |
153 | } | 153 | } |
154 | EXPORT_SYMBOL(remove_from_page_cache); | ||
154 | 155 | ||
155 | static int sync_page(void *word) | 156 | static int sync_page(void *word) |
156 | { | 157 | { |
@@ -1275,7 +1276,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1275 | { | 1276 | { |
1276 | struct file *filp = iocb->ki_filp; | 1277 | struct file *filp = iocb->ki_filp; |
1277 | ssize_t retval; | 1278 | ssize_t retval; |
1278 | unsigned long seg; | 1279 | unsigned long seg = 0; |
1279 | size_t count; | 1280 | size_t count; |
1280 | loff_t *ppos = &iocb->ki_pos; | 1281 | loff_t *ppos = &iocb->ki_pos; |
1281 | 1282 | ||
@@ -1302,21 +1303,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, | |||
1302 | retval = mapping->a_ops->direct_IO(READ, iocb, | 1303 | retval = mapping->a_ops->direct_IO(READ, iocb, |
1303 | iov, pos, nr_segs); | 1304 | iov, pos, nr_segs); |
1304 | } | 1305 | } |
1305 | if (retval > 0) | 1306 | if (retval > 0) { |
1306 | *ppos = pos + retval; | 1307 | *ppos = pos + retval; |
1307 | if (retval) { | 1308 | count -= retval; |
1309 | } | ||
1310 | |||
1311 | /* | ||
1312 | * Btrfs can have a short DIO read if we encounter | ||
1313 | * compressed extents, so if there was an error, or if | ||
1314 | * we've already read everything we wanted to, or if | ||
1315 | * there was a short read because we hit EOF, go ahead | ||
1316 | * and return. Otherwise fallthrough to buffered io for | ||
1317 | * the rest of the read. | ||
1318 | */ | ||
1319 | if (retval < 0 || !count || *ppos >= size) { | ||
1308 | file_accessed(filp); | 1320 | file_accessed(filp); |
1309 | goto out; | 1321 | goto out; |
1310 | } | 1322 | } |
1311 | } | 1323 | } |
1312 | } | 1324 | } |
1313 | 1325 | ||
1326 | count = retval; | ||
1314 | for (seg = 0; seg < nr_segs; seg++) { | 1327 | for (seg = 0; seg < nr_segs; seg++) { |
1315 | read_descriptor_t desc; | 1328 | read_descriptor_t desc; |
1329 | loff_t offset = 0; | ||
1330 | |||
1331 | /* | ||
1332 | * If we did a short DIO read we need to skip the section of the | ||
1333 | * iov that we've already read data into. | ||
1334 | */ | ||
1335 | if (count) { | ||
1336 | if (count > iov[seg].iov_len) { | ||
1337 | count -= iov[seg].iov_len; | ||
1338 | continue; | ||
1339 | } | ||
1340 | offset = count; | ||
1341 | count = 0; | ||
1342 | } | ||
1316 | 1343 | ||
1317 | desc.written = 0; | 1344 | desc.written = 0; |
1318 | desc.arg.buf = iov[seg].iov_base; | 1345 | desc.arg.buf = iov[seg].iov_base + offset; |
1319 | desc.count = iov[seg].iov_len; | 1346 | desc.count = iov[seg].iov_len - offset; |
1320 | if (desc.count == 0) | 1347 | if (desc.count == 0) |
1321 | continue; | 1348 | continue; |
1322 | desc.error = 0; | 1349 | desc.error = 0; |
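
Note: the generic_file_aio_read() change above lets a short direct-I/O read (btrfs returns short reads when it hits compressed extents) fall back to buffered I/O for the remaining bytes. The byte count already delivered by direct I/O is carried into the per-segment loop so that fully consumed iovec segments are skipped and a partially consumed one is resumed mid-buffer. A standalone sketch of that skip calculation, using hypothetical names and plain userspace types rather than the kernel's read_descriptor_t:

	#include <stddef.h>
	#include <sys/uio.h>

	/*
	 * Sketch only (not kernel code): given the iovec array and the number
	 * of bytes a short direct-I/O read already delivered, find the segment
	 * and intra-segment offset where buffered I/O should resume.  Mirrors
	 * the skip logic added to generic_file_aio_read() above.
	 */
	static void buffered_resume_point(const struct iovec *iov,
					  unsigned long nr_segs, size_t done,
					  unsigned long *seg_out, size_t *off_out)
	{
		unsigned long seg;
		size_t off = 0;

		for (seg = 0; seg < nr_segs; seg++) {
			if (done > iov[seg].iov_len) {
				done -= iov[seg].iov_len; /* segment fully read by DIO */
				continue;
			}
			off = done;                       /* resume inside this segment */
			break;
		}
		*seg_out = seg;
		*off_out = off;
	}
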
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index c8569bc298ff..c6ece0a57595 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -149,16 +149,35 @@ struct mem_cgroup_threshold { | |||
149 | u64 threshold; | 149 | u64 threshold; |
150 | }; | 150 | }; |
151 | 151 | ||
152 | /* For threshold */ | ||
152 | struct mem_cgroup_threshold_ary { | 153 | struct mem_cgroup_threshold_ary { |
153 | /* An array index points to threshold just below usage. */ | 154 | /* An array index points to threshold just below usage. */ |
154 | atomic_t current_threshold; | 155 | int current_threshold; |
155 | /* Size of entries[] */ | 156 | /* Size of entries[] */ |
156 | unsigned int size; | 157 | unsigned int size; |
157 | /* Array of thresholds */ | 158 | /* Array of thresholds */ |
158 | struct mem_cgroup_threshold entries[0]; | 159 | struct mem_cgroup_threshold entries[0]; |
159 | }; | 160 | }; |
160 | 161 | ||
162 | struct mem_cgroup_thresholds { | ||
163 | /* Primary thresholds array */ | ||
164 | struct mem_cgroup_threshold_ary *primary; | ||
165 | /* | ||
166 | * Spare threshold array. | ||
167 | * This is needed to make mem_cgroup_unregister_event() "never fail". | ||
168 | * It must be able to store at least primary->size - 1 entries. | ||
169 | */ | ||
170 | struct mem_cgroup_threshold_ary *spare; | ||
171 | }; | ||
172 | |||
173 | /* for OOM */ | ||
174 | struct mem_cgroup_eventfd_list { | ||
175 | struct list_head list; | ||
176 | struct eventfd_ctx *eventfd; | ||
177 | }; | ||
178 | |||
161 | static void mem_cgroup_threshold(struct mem_cgroup *mem); | 179 | static void mem_cgroup_threshold(struct mem_cgroup *mem); |
180 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem); | ||
162 | 181 | ||
163 | /* | 182 | /* |
164 | * The memory controller data structure. The memory controller controls both | 183 | * The memory controller data structure. The memory controller controls both |
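
Note: the new struct mem_cgroup_thresholds above keeps a pre-sized spare array next to the primary one so that mem_cgroup_unregister_event() can rebuild the threshold list without allocating, which is why it can "never fail". The register/unregister hunks later in this file build the new list in the spare buffer, publish it with rcu_assign_pointer(), and keep the old primary as the next spare. A simplified, non-RCU illustration of that double-buffer swap (illustrative types and names, not kernel code):

	/* Illustrative only: the primary/spare double-buffer trick. */
	struct thr_array {
		unsigned int size;
		unsigned long long entries[16];	/* fixed size just for the sketch */
	};

	struct thresholds {
		struct thr_array *primary;	/* readers see this (RCU-published in the kernel) */
		struct thr_array *spare;	/* big enough for primary->size - 1 entries */
	};

	/* Drop entries[victim] without allocating, so removal cannot fail. */
	static void remove_threshold(struct thresholds *t, unsigned int victim)
	{
		struct thr_array *new = t->spare;
		unsigned int i, j = 0;

		for (i = 0; i < t->primary->size; i++)
			if (i != victim)
				new->entries[j++] = t->primary->entries[i];
		new->size = j;

		t->spare = t->primary;	/* old primary becomes the next spare buffer */
		t->primary = new;	/* kernel: rcu_assign_pointer() + synchronize_rcu() */
	}
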
@@ -207,6 +226,8 @@ struct mem_cgroup { | |||
207 | atomic_t refcnt; | 226 | atomic_t refcnt; |
208 | 227 | ||
209 | unsigned int swappiness; | 228 | unsigned int swappiness; |
229 | /* OOM-Killer disable */ | ||
230 | int oom_kill_disable; | ||
210 | 231 | ||
211 | /* set when res.limit == memsw.limit */ | 232 | /* set when res.limit == memsw.limit */ |
212 | bool memsw_is_minimum; | 233 | bool memsw_is_minimum; |
@@ -215,17 +236,19 @@ struct mem_cgroup { | |||
215 | struct mutex thresholds_lock; | 236 | struct mutex thresholds_lock; |
216 | 237 | ||
217 | /* thresholds for memory usage. RCU-protected */ | 238 | /* thresholds for memory usage. RCU-protected */ |
218 | struct mem_cgroup_threshold_ary *thresholds; | 239 | struct mem_cgroup_thresholds thresholds; |
219 | 240 | ||
220 | /* thresholds for mem+swap usage. RCU-protected */ | 241 | /* thresholds for mem+swap usage. RCU-protected */ |
221 | struct mem_cgroup_threshold_ary *memsw_thresholds; | 242 | struct mem_cgroup_thresholds memsw_thresholds; |
243 | |||
244 | /* For oom notifier event fd */ | ||
245 | struct list_head oom_notify; | ||
222 | 246 | ||
223 | /* | 247 | /* |
224 | * Should we move charges of a task when a task is moved into this | 248 | * Should we move charges of a task when a task is moved into this |
225 | * mem_cgroup ? And what type of charges should we move ? | 249 | * mem_cgroup ? And what type of charges should we move ? |
226 | */ | 250 | */ |
227 | unsigned long move_charge_at_immigrate; | 251 | unsigned long move_charge_at_immigrate; |
228 | |||
229 | /* | 252 | /* |
230 | * percpu counter. | 253 | * percpu counter. |
231 | */ | 254 | */ |
@@ -239,6 +262,7 @@ struct mem_cgroup { | |||
239 | */ | 262 | */ |
240 | enum move_type { | 263 | enum move_type { |
241 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ | 264 | MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ |
265 | MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ | ||
242 | NR_MOVE_TYPE, | 266 | NR_MOVE_TYPE, |
243 | }; | 267 | }; |
244 | 268 | ||
@@ -255,6 +279,18 @@ static struct move_charge_struct { | |||
255 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), | 279 | .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), |
256 | }; | 280 | }; |
257 | 281 | ||
282 | static bool move_anon(void) | ||
283 | { | ||
284 | return test_bit(MOVE_CHARGE_TYPE_ANON, | ||
285 | &mc.to->move_charge_at_immigrate); | ||
286 | } | ||
287 | |||
288 | static bool move_file(void) | ||
289 | { | ||
290 | return test_bit(MOVE_CHARGE_TYPE_FILE, | ||
291 | &mc.to->move_charge_at_immigrate); | ||
292 | } | ||
293 | |||
258 | /* | 294 | /* |
259 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft | 295 | * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft |
260 | * limit reclaim to prevent infinite loops, if they ever occur. | 296 | * limit reclaim to prevent infinite loops, if they ever occur. |
@@ -282,9 +318,12 @@ enum charge_type { | |||
282 | /* for encoding cft->private value on file */ | 318 | /* for encoding cft->private value on file */ |
283 | #define _MEM (0) | 319 | #define _MEM (0) |
284 | #define _MEMSWAP (1) | 320 | #define _MEMSWAP (1) |
321 | #define _OOM_TYPE (2) | ||
285 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) | 322 | #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) |
286 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) | 323 | #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) |
287 | #define MEMFILE_ATTR(val) ((val) & 0xffff) | 324 | #define MEMFILE_ATTR(val) ((val) & 0xffff) |
325 | /* Used for OOM nofiier */ | ||
326 | #define OOM_CONTROL (0) | ||
288 | 327 | ||
289 | /* | 328 | /* |
290 | * Reclaim flags for mem_cgroup_hierarchical_reclaim | 329 | * Reclaim flags for mem_cgroup_hierarchical_reclaim |
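
Note: cft->private packs a counter type in the upper 16 bits and a resource attribute in the lower 16 bits; the new _OOM_TYPE value simply extends that encoding. Worked out for the oom_control file added later in this patch:

	/* MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL) = (2 << 16) | 0 = 0x20000       */
	/* MEMFILE_TYPE(0x20000) = (0x20000 >> 16) & 0xffff = 2   -> _OOM_TYPE    */
	/* MEMFILE_ATTR(0x20000) =  0x20000        & 0xffff = 0   -> OOM_CONTROL  */
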
@@ -1293,14 +1332,62 @@ static void mem_cgroup_oom_unlock(struct mem_cgroup *mem) | |||
1293 | static DEFINE_MUTEX(memcg_oom_mutex); | 1332 | static DEFINE_MUTEX(memcg_oom_mutex); |
1294 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); | 1333 | static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); |
1295 | 1334 | ||
1335 | struct oom_wait_info { | ||
1336 | struct mem_cgroup *mem; | ||
1337 | wait_queue_t wait; | ||
1338 | }; | ||
1339 | |||
1340 | static int memcg_oom_wake_function(wait_queue_t *wait, | ||
1341 | unsigned mode, int sync, void *arg) | ||
1342 | { | ||
1343 | struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; | ||
1344 | struct oom_wait_info *oom_wait_info; | ||
1345 | |||
1346 | oom_wait_info = container_of(wait, struct oom_wait_info, wait); | ||
1347 | |||
1348 | if (oom_wait_info->mem == wake_mem) | ||
1349 | goto wakeup; | ||
1350 | /* if no hierarchy, no match */ | ||
1351 | if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) | ||
1352 | return 0; | ||
1353 | /* | ||
1354 | * Both of oom_wait_info->mem and wake_mem are stable under us. | ||
1355 | * Then we can use css_is_ancestor without taking care of RCU. | ||
1356 | */ | ||
1357 | if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && | ||
1358 | !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) | ||
1359 | return 0; | ||
1360 | |||
1361 | wakeup: | ||
1362 | return autoremove_wake_function(wait, mode, sync, arg); | ||
1363 | } | ||
1364 | |||
1365 | static void memcg_wakeup_oom(struct mem_cgroup *mem) | ||
1366 | { | ||
1367 | /* for filtering, pass "mem" as argument. */ | ||
1368 | __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); | ||
1369 | } | ||
1370 | |||
1371 | static void memcg_oom_recover(struct mem_cgroup *mem) | ||
1372 | { | ||
1373 | if (mem->oom_kill_disable && atomic_read(&mem->oom_lock)) | ||
1374 | memcg_wakeup_oom(mem); | ||
1375 | } | ||
1376 | |||
1296 | /* | 1377 | /* |
1297 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. | 1378 | * try to call OOM killer. returns false if we should exit memory-reclaim loop. |
1298 | */ | 1379 | */ |
1299 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | 1380 | bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) |
1300 | { | 1381 | { |
1301 | DEFINE_WAIT(wait); | 1382 | struct oom_wait_info owait; |
1302 | bool locked; | 1383 | bool locked, need_to_kill; |
1303 | 1384 | ||
1385 | owait.mem = mem; | ||
1386 | owait.wait.flags = 0; | ||
1387 | owait.wait.func = memcg_oom_wake_function; | ||
1388 | owait.wait.private = current; | ||
1389 | INIT_LIST_HEAD(&owait.wait.task_list); | ||
1390 | need_to_kill = true; | ||
1304 | /* At first, try to OOM lock hierarchy under mem.*/ | 1391 | /* At first, try to OOM lock hierarchy under mem.*/ |
1305 | mutex_lock(&memcg_oom_mutex); | 1392 | mutex_lock(&memcg_oom_mutex); |
1306 | locked = mem_cgroup_oom_lock(mem); | 1393 | locked = mem_cgroup_oom_lock(mem); |
@@ -1309,32 +1396,23 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) | |||
1309 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL | 1396 | * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL |
1310 | * under OOM is always welcomed, use TASK_KILLABLE here. | 1397 | * under OOM is always welcomed, use TASK_KILLABLE here. |
1311 | */ | 1398 | */ |
1312 | if (!locked) | 1399 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
1313 | prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE); | 1400 | if (!locked || mem->oom_kill_disable) |
1401 | need_to_kill = false; | ||
1402 | if (locked) | ||
1403 | mem_cgroup_oom_notify(mem); | ||
1314 | mutex_unlock(&memcg_oom_mutex); | 1404 | mutex_unlock(&memcg_oom_mutex); |
1315 | 1405 | ||
1316 | if (locked) | 1406 | if (need_to_kill) { |
1407 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
1317 | mem_cgroup_out_of_memory(mem, mask); | 1408 | mem_cgroup_out_of_memory(mem, mask); |
1318 | else { | 1409 | } else { |
1319 | schedule(); | 1410 | schedule(); |
1320 | finish_wait(&memcg_oom_waitq, &wait); | 1411 | finish_wait(&memcg_oom_waitq, &owait.wait); |
1321 | } | 1412 | } |
1322 | mutex_lock(&memcg_oom_mutex); | 1413 | mutex_lock(&memcg_oom_mutex); |
1323 | mem_cgroup_oom_unlock(mem); | 1414 | mem_cgroup_oom_unlock(mem); |
1324 | /* | 1415 | memcg_wakeup_oom(mem); |
1325 | * Here, we use global waitq .....more fine grained waitq ? | ||
1326 | * Assume following hierarchy. | ||
1327 | * A/ | ||
1328 | * 01 | ||
1329 | * 02 | ||
1330 | * assume OOM happens both in A and 01 at the same time. Tthey are | ||
1331 | * mutually exclusive by lock. (kill in 01 helps A.) | ||
1332 | * When we use per memcg waitq, we have to wake up waiters on A and 02 | ||
1333 | * in addtion to waiters on 01. We use global waitq for avoiding mess. | ||
1334 | * It will not be a big problem. | ||
1335 | * (And a task may be moved to other groups while it's waiting for OOM.) | ||
1336 | */ | ||
1337 | wake_up_all(&memcg_oom_waitq); | ||
1338 | mutex_unlock(&memcg_oom_mutex); | 1416 | mutex_unlock(&memcg_oom_mutex); |
1339 | 1417 | ||
1340 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 1418 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) |
@@ -2118,15 +2196,6 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2118 | /* If swapout, usage of swap doesn't decrease */ | 2196 | /* If swapout, usage of swap doesn't decrease */ |
2119 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) | 2197 | if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) |
2120 | uncharge_memsw = false; | 2198 | uncharge_memsw = false; |
2121 | /* | ||
2122 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2123 | * In those cases, all pages freed continously can be expected to be in | ||
2124 | * the same cgroup and we have chance to coalesce uncharges. | ||
2125 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2126 | * because we want to do uncharge as soon as possible. | ||
2127 | */ | ||
2128 | if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2129 | goto direct_uncharge; | ||
2130 | 2199 | ||
2131 | batch = ¤t->memcg_batch; | 2200 | batch = ¤t->memcg_batch; |
2132 | /* | 2201 | /* |
@@ -2137,6 +2206,17 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) | |||
2137 | if (!batch->memcg) | 2206 | if (!batch->memcg) |
2138 | batch->memcg = mem; | 2207 | batch->memcg = mem; |
2139 | /* | 2208 | /* |
2209 | * do_batch > 0 when unmapping pages or inode invalidate/truncate. | ||
2210 | * In those cases, all pages freed continously can be expected to be in | ||
2211 | * the same cgroup and we have chance to coalesce uncharges. | ||
2212 | * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) | ||
2213 | * because we want to do uncharge as soon as possible. | ||
2214 | */ | ||
2215 | |||
2216 | if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) | ||
2217 | goto direct_uncharge; | ||
2218 | |||
2219 | /* | ||
2140 | * In typical case, batch->memcg == mem. This means we can | 2220 | * In typical case, batch->memcg == mem. This means we can |
2141 | * merge a series of uncharges to an uncharge of res_counter. | 2221 | * merge a series of uncharges to an uncharge of res_counter. |
2142 | * If not, we uncharge res_counter ony by one. | 2222 | * If not, we uncharge res_counter ony by one. |
@@ -2152,6 +2232,8 @@ direct_uncharge: | |||
2152 | res_counter_uncharge(&mem->res, PAGE_SIZE); | 2232 | res_counter_uncharge(&mem->res, PAGE_SIZE); |
2153 | if (uncharge_memsw) | 2233 | if (uncharge_memsw) |
2154 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); | 2234 | res_counter_uncharge(&mem->memsw, PAGE_SIZE); |
2235 | if (unlikely(batch->memcg != mem)) | ||
2236 | memcg_oom_recover(mem); | ||
2155 | return; | 2237 | return; |
2156 | } | 2238 | } |
2157 | 2239 | ||
@@ -2188,7 +2270,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) | |||
2188 | switch (ctype) { | 2270 | switch (ctype) { |
2189 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: | 2271 | case MEM_CGROUP_CHARGE_TYPE_MAPPED: |
2190 | case MEM_CGROUP_CHARGE_TYPE_DROP: | 2272 | case MEM_CGROUP_CHARGE_TYPE_DROP: |
2191 | if (page_mapped(page)) | 2273 | /* See mem_cgroup_prepare_migration() */ |
2274 | if (page_mapped(page) || PageCgroupMigration(pc)) | ||
2192 | goto unlock_out; | 2275 | goto unlock_out; |
2193 | break; | 2276 | break; |
2194 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: | 2277 | case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: |
@@ -2288,6 +2371,7 @@ void mem_cgroup_uncharge_end(void) | |||
2288 | res_counter_uncharge(&batch->memcg->res, batch->bytes); | 2371 | res_counter_uncharge(&batch->memcg->res, batch->bytes); |
2289 | if (batch->memsw_bytes) | 2372 | if (batch->memsw_bytes) |
2290 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); | 2373 | res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); |
2374 | memcg_oom_recover(batch->memcg); | ||
2291 | /* forget this pointer (for sanity check) */ | 2375 | /* forget this pointer (for sanity check) */ |
2292 | batch->memcg = NULL; | 2376 | batch->memcg = NULL; |
2293 | } | 2377 | } |
@@ -2410,10 +2494,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, | |||
2410 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old | 2494 | * Before starting migration, account PAGE_SIZE to mem_cgroup that the old |
2411 | * page belongs to. | 2495 | * page belongs to. |
2412 | */ | 2496 | */ |
2413 | int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | 2497 | int mem_cgroup_prepare_migration(struct page *page, |
2498 | struct page *newpage, struct mem_cgroup **ptr) | ||
2414 | { | 2499 | { |
2415 | struct page_cgroup *pc; | 2500 | struct page_cgroup *pc; |
2416 | struct mem_cgroup *mem = NULL; | 2501 | struct mem_cgroup *mem = NULL; |
2502 | enum charge_type ctype; | ||
2417 | int ret = 0; | 2503 | int ret = 0; |
2418 | 2504 | ||
2419 | if (mem_cgroup_disabled()) | 2505 | if (mem_cgroup_disabled()) |
@@ -2424,69 +2510,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) | |||
2424 | if (PageCgroupUsed(pc)) { | 2510 | if (PageCgroupUsed(pc)) { |
2425 | mem = pc->mem_cgroup; | 2511 | mem = pc->mem_cgroup; |
2426 | css_get(&mem->css); | 2512 | css_get(&mem->css); |
2513 | /* | ||
2514 | * At migrating an anonymous page, its mapcount goes down | ||
2515 | * to 0 and uncharge() will be called. But, even if it's fully | ||
2516 | * unmapped, migration may fail and this page has to be | ||
2517 | * charged again. We set MIGRATION flag here and delay uncharge | ||
2518 | * until end_migration() is called | ||
2519 | * | ||
2520 | * Corner Case Thinking | ||
2521 | * A) | ||
2522 | * When the old page was mapped as Anon and it's unmap-and-freed | ||
2523 | * while migration was ongoing. | ||
2524 | * If unmap finds the old page, uncharge() of it will be delayed | ||
2525 | * until end_migration(). If unmap finds a new page, it's | ||
2526 | * uncharged when it make mapcount to be 1->0. If unmap code | ||
2527 | * finds swap_migration_entry, the new page will not be mapped | ||
2528 | * and end_migration() will find it(mapcount==0). | ||
2529 | * | ||
2530 | * B) | ||
2531 | * When the old page was mapped but migraion fails, the kernel | ||
2532 | * remaps it. A charge for it is kept by MIGRATION flag even | ||
2533 | * if mapcount goes down to 0. We can do remap successfully | ||
2534 | * without charging it again. | ||
2535 | * | ||
2536 | * C) | ||
2537 | * The "old" page is under lock_page() until the end of | ||
2538 | * migration, so, the old page itself will not be swapped-out. | ||
2539 | * If the new page is swapped out before end_migraton, our | ||
2540 | * hook to usual swap-out path will catch the event. | ||
2541 | */ | ||
2542 | if (PageAnon(page)) | ||
2543 | SetPageCgroupMigration(pc); | ||
2427 | } | 2544 | } |
2428 | unlock_page_cgroup(pc); | 2545 | unlock_page_cgroup(pc); |
2546 | /* | ||
2547 | * If the page is not charged at this point, | ||
2548 | * we return here. | ||
2549 | */ | ||
2550 | if (!mem) | ||
2551 | return 0; | ||
2429 | 2552 | ||
2430 | *ptr = mem; | 2553 | *ptr = mem; |
2431 | if (mem) { | 2554 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); |
2432 | ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); | 2555 | css_put(&mem->css);/* drop extra refcnt */ |
2433 | css_put(&mem->css); | 2556 | if (ret || *ptr == NULL) { |
2557 | if (PageAnon(page)) { | ||
2558 | lock_page_cgroup(pc); | ||
2559 | ClearPageCgroupMigration(pc); | ||
2560 | unlock_page_cgroup(pc); | ||
2561 | /* | ||
2562 | * The old page may be fully unmapped while we kept it. | ||
2563 | */ | ||
2564 | mem_cgroup_uncharge_page(page); | ||
2565 | } | ||
2566 | return -ENOMEM; | ||
2434 | } | 2567 | } |
2568 | /* | ||
2569 | * We charge new page before it's used/mapped. So, even if unlock_page() | ||
2570 | * is called before end_migration, we can catch all events on this new | ||
2571 | * page. In the case new page is migrated but not remapped, new page's | ||
2572 | * mapcount will be finally 0 and we call uncharge in end_migration(). | ||
2573 | */ | ||
2574 | pc = lookup_page_cgroup(newpage); | ||
2575 | if (PageAnon(page)) | ||
2576 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2577 | else if (page_is_file_cache(page)) | ||
2578 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2579 | else | ||
2580 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2581 | __mem_cgroup_commit_charge(mem, pc, ctype); | ||
2435 | return ret; | 2582 | return ret; |
2436 | } | 2583 | } |
2437 | 2584 | ||
2438 | /* remove redundant charge if migration failed*/ | 2585 | /* remove redundant charge if migration failed*/ |
2439 | void mem_cgroup_end_migration(struct mem_cgroup *mem, | 2586 | void mem_cgroup_end_migration(struct mem_cgroup *mem, |
2440 | struct page *oldpage, struct page *newpage) | 2587 | struct page *oldpage, struct page *newpage) |
2441 | { | 2588 | { |
2442 | struct page *target, *unused; | 2589 | struct page *used, *unused; |
2443 | struct page_cgroup *pc; | 2590 | struct page_cgroup *pc; |
2444 | enum charge_type ctype; | ||
2445 | 2591 | ||
2446 | if (!mem) | 2592 | if (!mem) |
2447 | return; | 2593 | return; |
2594 | /* blocks rmdir() */ | ||
2448 | cgroup_exclude_rmdir(&mem->css); | 2595 | cgroup_exclude_rmdir(&mem->css); |
2449 | /* at migration success, oldpage->mapping is NULL. */ | 2596 | /* at migration success, oldpage->mapping is NULL. */ |
2450 | if (oldpage->mapping) { | 2597 | if (oldpage->mapping) { |
2451 | target = oldpage; | 2598 | used = oldpage; |
2452 | unused = NULL; | 2599 | unused = newpage; |
2453 | } else { | 2600 | } else { |
2454 | target = newpage; | 2601 | used = newpage; |
2455 | unused = oldpage; | 2602 | unused = oldpage; |
2456 | } | 2603 | } |
2457 | |||
2458 | if (PageAnon(target)) | ||
2459 | ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; | ||
2460 | else if (page_is_file_cache(target)) | ||
2461 | ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; | ||
2462 | else | ||
2463 | ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; | ||
2464 | |||
2465 | /* unused page is not on radix-tree now. */ | ||
2466 | if (unused) | ||
2467 | __mem_cgroup_uncharge_common(unused, ctype); | ||
2468 | |||
2469 | pc = lookup_page_cgroup(target); | ||
2470 | /* | 2604 | /* |
2471 | * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. | 2605 | * We disallowed uncharge of pages under migration because mapcount |
2472 | * So, double-counting is effectively avoided. | 2606 | * of the page goes down to zero, temporarly. |
2607 | * Clear the flag and check the page should be charged. | ||
2473 | */ | 2608 | */ |
2474 | __mem_cgroup_commit_charge(mem, pc, ctype); | 2609 | pc = lookup_page_cgroup(oldpage); |
2610 | lock_page_cgroup(pc); | ||
2611 | ClearPageCgroupMigration(pc); | ||
2612 | unlock_page_cgroup(pc); | ||
2613 | |||
2614 | if (unused != oldpage) | ||
2615 | pc = lookup_page_cgroup(unused); | ||
2616 | __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); | ||
2475 | 2617 | ||
2618 | pc = lookup_page_cgroup(used); | ||
2476 | /* | 2619 | /* |
2477 | * Both of oldpage and newpage are still under lock_page(). | 2620 | * If a page is a file cache, radix-tree replacement is very atomic |
2478 | * Then, we don't have to care about race in radix-tree. | 2621 | * and we can skip this check. When it was an Anon page, its mapcount |
2479 | * But we have to be careful that this page is unmapped or not. | 2622 | * goes down to 0. But because we added MIGRATION flage, it's not |
2480 | * | 2623 | * uncharged yet. There are several case but page->mapcount check |
2481 | * There is a case for !page_mapped(). At the start of | 2624 | * and USED bit check in mem_cgroup_uncharge_page() will do enough |
2482 | * migration, oldpage was mapped. But now, it's zapped. | 2625 | * check. (see prepare_charge() also) |
2483 | * But we know *target* page is not freed/reused under us. | ||
2484 | * mem_cgroup_uncharge_page() does all necessary checks. | ||
2485 | */ | 2626 | */ |
2486 | if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) | 2627 | if (PageAnon(used)) |
2487 | mem_cgroup_uncharge_page(target); | 2628 | mem_cgroup_uncharge_page(used); |
2488 | /* | 2629 | /* |
2489 | * At migration, we may charge account against cgroup which has no tasks | 2630 | * At migration, we may charge account against cgroup which has no |
2631 | * tasks. | ||
2490 | * So, rmdir()->pre_destroy() can be called while we do this charge. | 2632 | * So, rmdir()->pre_destroy() can be called while we do this charge. |
2491 | * In that case, we need to call pre_destroy() again. check it here. | 2633 | * In that case, we need to call pre_destroy() again. check it here. |
2492 | */ | 2634 | */ |
@@ -2524,10 +2666,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2524 | unsigned long long val) | 2666 | unsigned long long val) |
2525 | { | 2667 | { |
2526 | int retry_count; | 2668 | int retry_count; |
2527 | u64 memswlimit; | 2669 | u64 memswlimit, memlimit; |
2528 | int ret = 0; | 2670 | int ret = 0; |
2529 | int children = mem_cgroup_count_children(memcg); | 2671 | int children = mem_cgroup_count_children(memcg); |
2530 | u64 curusage, oldusage; | 2672 | u64 curusage, oldusage; |
2673 | int enlarge; | ||
2531 | 2674 | ||
2532 | /* | 2675 | /* |
2533 | * For keeping hierarchical_reclaim simple, how long we should retry | 2676 | * For keeping hierarchical_reclaim simple, how long we should retry |
@@ -2538,6 +2681,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2538 | 2681 | ||
2539 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); | 2682 | oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); |
2540 | 2683 | ||
2684 | enlarge = 0; | ||
2541 | while (retry_count) { | 2685 | while (retry_count) { |
2542 | if (signal_pending(current)) { | 2686 | if (signal_pending(current)) { |
2543 | ret = -EINTR; | 2687 | ret = -EINTR; |
@@ -2555,6 +2699,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2555 | mutex_unlock(&set_limit_mutex); | 2699 | mutex_unlock(&set_limit_mutex); |
2556 | break; | 2700 | break; |
2557 | } | 2701 | } |
2702 | |||
2703 | memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); | ||
2704 | if (memlimit < val) | ||
2705 | enlarge = 1; | ||
2706 | |||
2558 | ret = res_counter_set_limit(&memcg->res, val); | 2707 | ret = res_counter_set_limit(&memcg->res, val); |
2559 | if (!ret) { | 2708 | if (!ret) { |
2560 | if (memswlimit == val) | 2709 | if (memswlimit == val) |
@@ -2576,6 +2725,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, | |||
2576 | else | 2725 | else |
2577 | oldusage = curusage; | 2726 | oldusage = curusage; |
2578 | } | 2727 | } |
2728 | if (!ret && enlarge) | ||
2729 | memcg_oom_recover(memcg); | ||
2579 | 2730 | ||
2580 | return ret; | 2731 | return ret; |
2581 | } | 2732 | } |
@@ -2584,9 +2735,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2584 | unsigned long long val) | 2735 | unsigned long long val) |
2585 | { | 2736 | { |
2586 | int retry_count; | 2737 | int retry_count; |
2587 | u64 memlimit, oldusage, curusage; | 2738 | u64 memlimit, memswlimit, oldusage, curusage; |
2588 | int children = mem_cgroup_count_children(memcg); | 2739 | int children = mem_cgroup_count_children(memcg); |
2589 | int ret = -EBUSY; | 2740 | int ret = -EBUSY; |
2741 | int enlarge = 0; | ||
2590 | 2742 | ||
2591 | /* see mem_cgroup_resize_res_limit */ | 2743 | /* see mem_cgroup_resize_res_limit */ |
2592 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; | 2744 | retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; |
@@ -2608,6 +2760,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2608 | mutex_unlock(&set_limit_mutex); | 2760 | mutex_unlock(&set_limit_mutex); |
2609 | break; | 2761 | break; |
2610 | } | 2762 | } |
2763 | memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); | ||
2764 | if (memswlimit < val) | ||
2765 | enlarge = 1; | ||
2611 | ret = res_counter_set_limit(&memcg->memsw, val); | 2766 | ret = res_counter_set_limit(&memcg->memsw, val); |
2612 | if (!ret) { | 2767 | if (!ret) { |
2613 | if (memlimit == val) | 2768 | if (memlimit == val) |
@@ -2630,6 +2785,8 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, | |||
2630 | else | 2785 | else |
2631 | oldusage = curusage; | 2786 | oldusage = curusage; |
2632 | } | 2787 | } |
2788 | if (!ret && enlarge) | ||
2789 | memcg_oom_recover(memcg); | ||
2633 | return ret; | 2790 | return ret; |
2634 | } | 2791 | } |
2635 | 2792 | ||
@@ -2821,6 +2978,7 @@ move_account: | |||
2821 | if (ret) | 2978 | if (ret) |
2822 | break; | 2979 | break; |
2823 | } | 2980 | } |
2981 | memcg_oom_recover(mem); | ||
2824 | /* it seems parent cgroup doesn't have enough mem */ | 2982 | /* it seems parent cgroup doesn't have enough mem */ |
2825 | if (ret == -ENOMEM) | 2983 | if (ret == -ENOMEM) |
2826 | goto try_to_free; | 2984 | goto try_to_free; |
@@ -3311,9 +3469,9 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3311 | 3469 | ||
3312 | rcu_read_lock(); | 3470 | rcu_read_lock(); |
3313 | if (!swap) | 3471 | if (!swap) |
3314 | t = rcu_dereference(memcg->thresholds); | 3472 | t = rcu_dereference(memcg->thresholds.primary); |
3315 | else | 3473 | else |
3316 | t = rcu_dereference(memcg->memsw_thresholds); | 3474 | t = rcu_dereference(memcg->memsw_thresholds.primary); |
3317 | 3475 | ||
3318 | if (!t) | 3476 | if (!t) |
3319 | goto unlock; | 3477 | goto unlock; |
@@ -3325,7 +3483,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3325 | * If it's not true, a threshold was crossed after last | 3483 | * If it's not true, a threshold was crossed after last |
3326 | * call of __mem_cgroup_threshold(). | 3484 | * call of __mem_cgroup_threshold(). |
3327 | */ | 3485 | */ |
3328 | i = atomic_read(&t->current_threshold); | 3486 | i = t->current_threshold; |
3329 | 3487 | ||
3330 | /* | 3488 | /* |
3331 | * Iterate backward over array of thresholds starting from | 3489 | * Iterate backward over array of thresholds starting from |
@@ -3349,7 +3507,7 @@ static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) | |||
3349 | eventfd_signal(t->entries[i].eventfd, 1); | 3507 | eventfd_signal(t->entries[i].eventfd, 1); |
3350 | 3508 | ||
3351 | /* Update current_threshold */ | 3509 | /* Update current_threshold */ |
3352 | atomic_set(&t->current_threshold, i - 1); | 3510 | t->current_threshold = i - 1; |
3353 | unlock: | 3511 | unlock: |
3354 | rcu_read_unlock(); | 3512 | rcu_read_unlock(); |
3355 | } | 3513 | } |
@@ -3369,106 +3527,117 @@ static int compare_thresholds(const void *a, const void *b) | |||
3369 | return _a->threshold - _b->threshold; | 3527 | return _a->threshold - _b->threshold; |
3370 | } | 3528 | } |
3371 | 3529 | ||
3372 | static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft, | 3530 | static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem, void *data) |
3373 | struct eventfd_ctx *eventfd, const char *args) | 3531 | { |
3532 | struct mem_cgroup_eventfd_list *ev; | ||
3533 | |||
3534 | list_for_each_entry(ev, &mem->oom_notify, list) | ||
3535 | eventfd_signal(ev->eventfd, 1); | ||
3536 | return 0; | ||
3537 | } | ||
3538 | |||
3539 | static void mem_cgroup_oom_notify(struct mem_cgroup *mem) | ||
3540 | { | ||
3541 | mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_notify_cb); | ||
3542 | } | ||
3543 | |||
3544 | static int mem_cgroup_usage_register_event(struct cgroup *cgrp, | ||
3545 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3374 | { | 3546 | { |
3375 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3547 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3376 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3548 | struct mem_cgroup_thresholds *thresholds; |
3549 | struct mem_cgroup_threshold_ary *new; | ||
3377 | int type = MEMFILE_TYPE(cft->private); | 3550 | int type = MEMFILE_TYPE(cft->private); |
3378 | u64 threshold, usage; | 3551 | u64 threshold, usage; |
3379 | int size; | 3552 | int i, size, ret; |
3380 | int i, ret; | ||
3381 | 3553 | ||
3382 | ret = res_counter_memparse_write_strategy(args, &threshold); | 3554 | ret = res_counter_memparse_write_strategy(args, &threshold); |
3383 | if (ret) | 3555 | if (ret) |
3384 | return ret; | 3556 | return ret; |
3385 | 3557 | ||
3386 | mutex_lock(&memcg->thresholds_lock); | 3558 | mutex_lock(&memcg->thresholds_lock); |
3559 | |||
3387 | if (type == _MEM) | 3560 | if (type == _MEM) |
3388 | thresholds = memcg->thresholds; | 3561 | thresholds = &memcg->thresholds; |
3389 | else if (type == _MEMSWAP) | 3562 | else if (type == _MEMSWAP) |
3390 | thresholds = memcg->memsw_thresholds; | 3563 | thresholds = &memcg->memsw_thresholds; |
3391 | else | 3564 | else |
3392 | BUG(); | 3565 | BUG(); |
3393 | 3566 | ||
3394 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); | 3567 | usage = mem_cgroup_usage(memcg, type == _MEMSWAP); |
3395 | 3568 | ||
3396 | /* Check if a threshold crossed before adding a new one */ | 3569 | /* Check if a threshold crossed before adding a new one */ |
3397 | if (thresholds) | 3570 | if (thresholds->primary) |
3398 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3571 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3399 | 3572 | ||
3400 | if (thresholds) | 3573 | size = thresholds->primary ? thresholds->primary->size + 1 : 1; |
3401 | size = thresholds->size + 1; | ||
3402 | else | ||
3403 | size = 1; | ||
3404 | 3574 | ||
3405 | /* Allocate memory for new array of thresholds */ | 3575 | /* Allocate memory for new array of thresholds */ |
3406 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | 3576 | new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), |
3407 | size * sizeof(struct mem_cgroup_threshold), | ||
3408 | GFP_KERNEL); | 3577 | GFP_KERNEL); |
3409 | if (!thresholds_new) { | 3578 | if (!new) { |
3410 | ret = -ENOMEM; | 3579 | ret = -ENOMEM; |
3411 | goto unlock; | 3580 | goto unlock; |
3412 | } | 3581 | } |
3413 | thresholds_new->size = size; | 3582 | new->size = size; |
3414 | 3583 | ||
3415 | /* Copy thresholds (if any) to new array */ | 3584 | /* Copy thresholds (if any) to new array */ |
3416 | if (thresholds) | 3585 | if (thresholds->primary) { |
3417 | memcpy(thresholds_new->entries, thresholds->entries, | 3586 | memcpy(new->entries, thresholds->primary->entries, (size - 1) * |
3418 | thresholds->size * | ||
3419 | sizeof(struct mem_cgroup_threshold)); | 3587 | sizeof(struct mem_cgroup_threshold)); |
3588 | } | ||
3589 | |||
3420 | /* Add new threshold */ | 3590 | /* Add new threshold */ |
3421 | thresholds_new->entries[size - 1].eventfd = eventfd; | 3591 | new->entries[size - 1].eventfd = eventfd; |
3422 | thresholds_new->entries[size - 1].threshold = threshold; | 3592 | new->entries[size - 1].threshold = threshold; |
3423 | 3593 | ||
3424 | /* Sort thresholds. Registering of new threshold isn't time-critical */ | 3594 | /* Sort thresholds. Registering of new threshold isn't time-critical */ |
3425 | sort(thresholds_new->entries, size, | 3595 | sort(new->entries, size, sizeof(struct mem_cgroup_threshold), |
3426 | sizeof(struct mem_cgroup_threshold), | ||
3427 | compare_thresholds, NULL); | 3596 | compare_thresholds, NULL); |
3428 | 3597 | ||
3429 | /* Find current threshold */ | 3598 | /* Find current threshold */ |
3430 | atomic_set(&thresholds_new->current_threshold, -1); | 3599 | new->current_threshold = -1; |
3431 | for (i = 0; i < size; i++) { | 3600 | for (i = 0; i < size; i++) { |
3432 | if (thresholds_new->entries[i].threshold < usage) { | 3601 | if (new->entries[i].threshold < usage) { |
3433 | /* | 3602 | /* |
3434 | * thresholds_new->current_threshold will not be used | 3603 | * new->current_threshold will not be used until |
3435 | * until rcu_assign_pointer(), so it's safe to increment | 3604 | * rcu_assign_pointer(), so it's safe to increment |
3436 | * it here. | 3605 | * it here. |
3437 | */ | 3606 | */ |
3438 | atomic_inc(&thresholds_new->current_threshold); | 3607 | ++new->current_threshold; |
3439 | } | 3608 | } |
3440 | } | 3609 | } |
3441 | 3610 | ||
3442 | if (type == _MEM) | 3611 | /* Free old spare buffer and save old primary buffer as spare */ |
3443 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3612 | kfree(thresholds->spare); |
3444 | else | 3613 | thresholds->spare = thresholds->primary; |
3445 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | 3614 | |
3615 | rcu_assign_pointer(thresholds->primary, new); | ||
3446 | 3616 | ||
3447 | /* To be sure that nobody uses thresholds before freeing it */ | 3617 | /* To be sure that nobody uses thresholds */ |
3448 | synchronize_rcu(); | 3618 | synchronize_rcu(); |
3449 | 3619 | ||
3450 | kfree(thresholds); | ||
3451 | unlock: | 3620 | unlock: |
3452 | mutex_unlock(&memcg->thresholds_lock); | 3621 | mutex_unlock(&memcg->thresholds_lock); |
3453 | 3622 | ||
3454 | return ret; | 3623 | return ret; |
3455 | } | 3624 | } |
3456 | 3625 | ||
3457 | static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | 3626 | static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, |
3458 | struct eventfd_ctx *eventfd) | 3627 | struct cftype *cft, struct eventfd_ctx *eventfd) |
3459 | { | 3628 | { |
3460 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | 3629 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); |
3461 | struct mem_cgroup_threshold_ary *thresholds, *thresholds_new; | 3630 | struct mem_cgroup_thresholds *thresholds; |
3631 | struct mem_cgroup_threshold_ary *new; | ||
3462 | int type = MEMFILE_TYPE(cft->private); | 3632 | int type = MEMFILE_TYPE(cft->private); |
3463 | u64 usage; | 3633 | u64 usage; |
3464 | int size = 0; | 3634 | int i, j, size; |
3465 | int i, j, ret; | ||
3466 | 3635 | ||
3467 | mutex_lock(&memcg->thresholds_lock); | 3636 | mutex_lock(&memcg->thresholds_lock); |
3468 | if (type == _MEM) | 3637 | if (type == _MEM) |
3469 | thresholds = memcg->thresholds; | 3638 | thresholds = &memcg->thresholds; |
3470 | else if (type == _MEMSWAP) | 3639 | else if (type == _MEMSWAP) |
3471 | thresholds = memcg->memsw_thresholds; | 3640 | thresholds = &memcg->memsw_thresholds; |
3472 | else | 3641 | else |
3473 | BUG(); | 3642 | BUG(); |
3474 | 3643 | ||
@@ -3484,59 +3653,136 @@ static int mem_cgroup_unregister_event(struct cgroup *cgrp, struct cftype *cft, | |||
3484 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); | 3653 | __mem_cgroup_threshold(memcg, type == _MEMSWAP); |
3485 | 3654 | ||
3486 | /* Calculate new number of threshold */ | 3655 | /* Calculate new number of threshold */ |
3487 | for (i = 0; i < thresholds->size; i++) { | 3656 | size = 0; |
3488 | if (thresholds->entries[i].eventfd != eventfd) | 3657 | for (i = 0; i < thresholds->primary->size; i++) { |
3658 | if (thresholds->primary->entries[i].eventfd != eventfd) | ||
3489 | size++; | 3659 | size++; |
3490 | } | 3660 | } |
3491 | 3661 | ||
3662 | new = thresholds->spare; | ||
3663 | |||
3492 | /* Set thresholds array to NULL if we don't have thresholds */ | 3664 | /* Set thresholds array to NULL if we don't have thresholds */ |
3493 | if (!size) { | 3665 | if (!size) { |
3494 | thresholds_new = NULL; | 3666 | kfree(new); |
3495 | goto assign; | 3667 | new = NULL; |
3668 | goto swap_buffers; | ||
3496 | } | 3669 | } |
3497 | 3670 | ||
3498 | /* Allocate memory for new array of thresholds */ | 3671 | new->size = size; |
3499 | thresholds_new = kmalloc(sizeof(*thresholds_new) + | ||
3500 | size * sizeof(struct mem_cgroup_threshold), | ||
3501 | GFP_KERNEL); | ||
3502 | if (!thresholds_new) { | ||
3503 | ret = -ENOMEM; | ||
3504 | goto unlock; | ||
3505 | } | ||
3506 | thresholds_new->size = size; | ||
3507 | 3672 | ||
3508 | /* Copy thresholds and find current threshold */ | 3673 | /* Copy thresholds and find current threshold */ |
3509 | atomic_set(&thresholds_new->current_threshold, -1); | 3674 | new->current_threshold = -1; |
3510 | for (i = 0, j = 0; i < thresholds->size; i++) { | 3675 | for (i = 0, j = 0; i < thresholds->primary->size; i++) { |
3511 | if (thresholds->entries[i].eventfd == eventfd) | 3676 | if (thresholds->primary->entries[i].eventfd == eventfd) |
3512 | continue; | 3677 | continue; |
3513 | 3678 | ||
3514 | thresholds_new->entries[j] = thresholds->entries[i]; | 3679 | new->entries[j] = thresholds->primary->entries[i]; |
3515 | if (thresholds_new->entries[j].threshold < usage) { | 3680 | if (new->entries[j].threshold < usage) { |
3516 | /* | 3681 | /* |
3517 | * thresholds_new->current_threshold will not be used | 3682 | * new->current_threshold will not be used |
3518 | * until rcu_assign_pointer(), so it's safe to increment | 3683 | * until rcu_assign_pointer(), so it's safe to increment |
3519 | * it here. | 3684 | * it here. |
3520 | */ | 3685 | */ |
3521 | atomic_inc(&thresholds_new->current_threshold); | 3686 | ++new->current_threshold; |
3522 | } | 3687 | } |
3523 | j++; | 3688 | j++; |
3524 | } | 3689 | } |
3525 | 3690 | ||
3526 | assign: | 3691 | swap_buffers: |
3527 | if (type == _MEM) | 3692 | /* Swap primary and spare array */ |
3528 | rcu_assign_pointer(memcg->thresholds, thresholds_new); | 3693 | thresholds->spare = thresholds->primary; |
3529 | else | 3694 | rcu_assign_pointer(thresholds->primary, new); |
3530 | rcu_assign_pointer(memcg->memsw_thresholds, thresholds_new); | ||
3531 | 3695 | ||
3532 | /* To be sure that nobody uses thresholds before freeing it */ | 3696 | /* To be sure that nobody uses thresholds */ |
3533 | synchronize_rcu(); | 3697 | synchronize_rcu(); |
3534 | 3698 | ||
3535 | kfree(thresholds); | ||
3536 | unlock: | ||
3537 | mutex_unlock(&memcg->thresholds_lock); | 3699 | mutex_unlock(&memcg->thresholds_lock); |
3700 | } | ||
3538 | 3701 | ||
3539 | return ret; | 3702 | static int mem_cgroup_oom_register_event(struct cgroup *cgrp, |
3703 | struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) | ||
3704 | { | ||
3705 | struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); | ||
3706 | struct mem_cgroup_eventfd_list *event; | ||
3707 | int type = MEMFILE_TYPE(cft->private); | ||
3708 | |||
3709 | BUG_ON(type != _OOM_TYPE); | ||
3710 | event = kmalloc(sizeof(*event), GFP_KERNEL); | ||
3711 | if (!event) | ||
3712 | return -ENOMEM; | ||
3713 | |||
3714 | mutex_lock(&memcg_oom_mutex); | ||
3715 | |||
3716 | event->eventfd = eventfd; | ||
3717 | list_add(&event->list, &memcg->oom_notify); | ||
3718 | |||
3719 | /* already in OOM ? */ | ||
3720 | if (atomic_read(&memcg->oom_lock)) | ||
3721 | eventfd_signal(eventfd, 1); | ||
3722 | mutex_unlock(&memcg_oom_mutex); | ||
3723 | |||
3724 | return 0; | ||
3725 | } | ||
3726 | |||
3727 | static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, | ||
3728 | struct cftype *cft, struct eventfd_ctx *eventfd) | ||
3729 | { | ||
3730 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3731 | struct mem_cgroup_eventfd_list *ev, *tmp; | ||
3732 | int type = MEMFILE_TYPE(cft->private); | ||
3733 | |||
3734 | BUG_ON(type != _OOM_TYPE); | ||
3735 | |||
3736 | mutex_lock(&memcg_oom_mutex); | ||
3737 | |||
3738 | list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { | ||
3739 | if (ev->eventfd == eventfd) { | ||
3740 | list_del(&ev->list); | ||
3741 | kfree(ev); | ||
3742 | } | ||
3743 | } | ||
3744 | |||
3745 | mutex_unlock(&memcg_oom_mutex); | ||
3746 | } | ||
3747 | |||
3748 | static int mem_cgroup_oom_control_read(struct cgroup *cgrp, | ||
3749 | struct cftype *cft, struct cgroup_map_cb *cb) | ||
3750 | { | ||
3751 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3752 | |||
3753 | cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); | ||
3754 | |||
3755 | if (atomic_read(&mem->oom_lock)) | ||
3756 | cb->fill(cb, "under_oom", 1); | ||
3757 | else | ||
3758 | cb->fill(cb, "under_oom", 0); | ||
3759 | return 0; | ||
3760 | } | ||
3761 | |||
3762 | /* | ||
3763 | */ | ||
3764 | static int mem_cgroup_oom_control_write(struct cgroup *cgrp, | ||
3765 | struct cftype *cft, u64 val) | ||
3766 | { | ||
3767 | struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); | ||
3768 | struct mem_cgroup *parent; | ||
3769 | |||
3770 | /* cannot set to root cgroup and only 0 and 1 are allowed */ | ||
3771 | if (!cgrp->parent || !((val == 0) || (val == 1))) | ||
3772 | return -EINVAL; | ||
3773 | |||
3774 | parent = mem_cgroup_from_cont(cgrp->parent); | ||
3775 | |||
3776 | cgroup_lock(); | ||
3777 | /* oom-kill-disable is a flag for subhierarchy. */ | ||
3778 | if ((parent->use_hierarchy) || | ||
3779 | (mem->use_hierarchy && !list_empty(&cgrp->children))) { | ||
3780 | cgroup_unlock(); | ||
3781 | return -EINVAL; | ||
3782 | } | ||
3783 | mem->oom_kill_disable = val; | ||
3784 | cgroup_unlock(); | ||
3785 | return 0; | ||
3540 | } | 3786 | } |
3541 | 3787 | ||
3542 | static struct cftype mem_cgroup_files[] = { | 3788 | static struct cftype mem_cgroup_files[] = { |
@@ -3544,8 +3790,8 @@ static struct cftype mem_cgroup_files[] = { | |||
3544 | .name = "usage_in_bytes", | 3790 | .name = "usage_in_bytes", |
3545 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), | 3791 | .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), |
3546 | .read_u64 = mem_cgroup_read, | 3792 | .read_u64 = mem_cgroup_read, |
3547 | .register_event = mem_cgroup_register_event, | 3793 | .register_event = mem_cgroup_usage_register_event, |
3548 | .unregister_event = mem_cgroup_unregister_event, | 3794 | .unregister_event = mem_cgroup_usage_unregister_event, |
3549 | }, | 3795 | }, |
3550 | { | 3796 | { |
3551 | .name = "max_usage_in_bytes", | 3797 | .name = "max_usage_in_bytes", |
@@ -3594,6 +3840,14 @@ static struct cftype mem_cgroup_files[] = { | |||
3594 | .read_u64 = mem_cgroup_move_charge_read, | 3840 | .read_u64 = mem_cgroup_move_charge_read, |
3595 | .write_u64 = mem_cgroup_move_charge_write, | 3841 | .write_u64 = mem_cgroup_move_charge_write, |
3596 | }, | 3842 | }, |
3843 | { | ||
3844 | .name = "oom_control", | ||
3845 | .read_map = mem_cgroup_oom_control_read, | ||
3846 | .write_u64 = mem_cgroup_oom_control_write, | ||
3847 | .register_event = mem_cgroup_oom_register_event, | ||
3848 | .unregister_event = mem_cgroup_oom_unregister_event, | ||
3849 | .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), | ||
3850 | }, | ||
3597 | }; | 3851 | }; |
3598 | 3852 | ||
3599 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP | 3853 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP |
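
Note: the oom_control file added above pairs with the eventfd-based cgroup notification interface: userspace registers an eventfd through cgroup.event_control and is signalled when the group enters OOM, and writing 1 to memory.oom_control disables the in-kernel killer so tasks sit on the waitqueue introduced earlier until the limit is raised or memory is freed (memcg_oom_recover()). A minimal listener sketch for the cgroup-v1 interface; the mount point /cgroup/memory and the group name "victim" are assumptions for illustration:

	#include <stdio.h>
	#include <string.h>
	#include <stdint.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/eventfd.h>

	int main(void)
	{
		char buf[64];
		uint64_t cnt;
		int efd = eventfd(0, 0);
		int ofd = open("/cgroup/memory/victim/memory.oom_control", O_RDONLY);
		int cfd = open("/cgroup/memory/victim/cgroup.event_control", O_WRONLY);

		if (efd < 0 || ofd < 0 || cfd < 0) {
			perror("open/eventfd");
			return 1;
		}
		/* "<eventfd> <fd of memory.oom_control>" registers an OOM notifier */
		snprintf(buf, sizeof(buf), "%d %d", efd, ofd);
		if (write(cfd, buf, strlen(buf)) < 0) {
			perror("register");
			return 1;
		}
		if (read(efd, &cnt, sizeof(cnt)) == sizeof(cnt))	/* blocks until OOM */
			printf("memory cgroup 'victim' is under OOM\n");
		return 0;
	}
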
@@ -3602,8 +3856,8 @@ static struct cftype memsw_cgroup_files[] = { | |||
3602 | .name = "memsw.usage_in_bytes", | 3856 | .name = "memsw.usage_in_bytes", |
3603 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), | 3857 | .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), |
3604 | .read_u64 = mem_cgroup_read, | 3858 | .read_u64 = mem_cgroup_read, |
3605 | .register_event = mem_cgroup_register_event, | 3859 | .register_event = mem_cgroup_usage_register_event, |
3606 | .unregister_event = mem_cgroup_unregister_event, | 3860 | .unregister_event = mem_cgroup_usage_unregister_event, |
3607 | }, | 3861 | }, |
3608 | { | 3862 | { |
3609 | .name = "memsw.max_usage_in_bytes", | 3863 | .name = "memsw.max_usage_in_bytes", |
@@ -3831,6 +4085,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3831 | } else { | 4085 | } else { |
3832 | parent = mem_cgroup_from_cont(cont->parent); | 4086 | parent = mem_cgroup_from_cont(cont->parent); |
3833 | mem->use_hierarchy = parent->use_hierarchy; | 4087 | mem->use_hierarchy = parent->use_hierarchy; |
4088 | mem->oom_kill_disable = parent->oom_kill_disable; | ||
3834 | } | 4089 | } |
3835 | 4090 | ||
3836 | if (parent && parent->use_hierarchy) { | 4091 | if (parent && parent->use_hierarchy) { |
@@ -3849,6 +4104,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) | |||
3849 | } | 4104 | } |
3850 | mem->last_scanned_child = 0; | 4105 | mem->last_scanned_child = 0; |
3851 | spin_lock_init(&mem->reclaim_param_lock); | 4106 | spin_lock_init(&mem->reclaim_param_lock); |
4107 | INIT_LIST_HEAD(&mem->oom_notify); | ||
3852 | 4108 | ||
3853 | if (parent) | 4109 | if (parent) |
3854 | mem->swappiness = get_swappiness(parent); | 4110 | mem->swappiness = get_swappiness(parent); |
@@ -3976,6 +4232,80 @@ enum mc_target_type { | |||
3976 | MC_TARGET_SWAP, | 4232 | MC_TARGET_SWAP, |
3977 | }; | 4233 | }; |
3978 | 4234 | ||
4235 | static struct page *mc_handle_present_pte(struct vm_area_struct *vma, | ||
4236 | unsigned long addr, pte_t ptent) | ||
4237 | { | ||
4238 | struct page *page = vm_normal_page(vma, addr, ptent); | ||
4239 | |||
4240 | if (!page || !page_mapped(page)) | ||
4241 | return NULL; | ||
4242 | if (PageAnon(page)) { | ||
4243 | /* we don't move shared anon */ | ||
4244 | if (!move_anon() || page_mapcount(page) > 2) | ||
4245 | return NULL; | ||
4246 | } else if (!move_file()) | ||
4247 | /* we ignore mapcount for file pages */ | ||
4248 | return NULL; | ||
4249 | if (!get_page_unless_zero(page)) | ||
4250 | return NULL; | ||
4251 | |||
4252 | return page; | ||
4253 | } | ||
4254 | |||
4255 | static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, | ||
4256 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4257 | { | ||
4258 | int usage_count; | ||
4259 | struct page *page = NULL; | ||
4260 | swp_entry_t ent = pte_to_swp_entry(ptent); | ||
4261 | |||
4262 | if (!move_anon() || non_swap_entry(ent)) | ||
4263 | return NULL; | ||
4264 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
4265 | if (usage_count > 1) { /* we don't move shared anon */ | ||
4266 | if (page) | ||
4267 | put_page(page); | ||
4268 | return NULL; | ||
4269 | } | ||
4270 | if (do_swap_account) | ||
4271 | entry->val = ent.val; | ||
4272 | |||
4273 | return page; | ||
4274 | } | ||
4275 | |||
4276 | static struct page *mc_handle_file_pte(struct vm_area_struct *vma, | ||
4277 | unsigned long addr, pte_t ptent, swp_entry_t *entry) | ||
4278 | { | ||
4279 | struct page *page = NULL; | ||
4280 | struct inode *inode; | ||
4281 | struct address_space *mapping; | ||
4282 | pgoff_t pgoff; | ||
4283 | |||
4284 | if (!vma->vm_file) /* anonymous vma */ | ||
4285 | return NULL; | ||
4286 | if (!move_file()) | ||
4287 | return NULL; | ||
4288 | |||
4289 | inode = vma->vm_file->f_path.dentry->d_inode; | ||
4290 | mapping = vma->vm_file->f_mapping; | ||
4291 | if (pte_none(ptent)) | ||
4292 | pgoff = linear_page_index(vma, addr); | ||
4293 | else /* pte_file(ptent) is true */ | ||
4294 | pgoff = pte_to_pgoff(ptent); | ||
4295 | |||
4296 | /* page is moved even if it's not RSS of this task(page-faulted). */ | ||
4297 | if (!mapping_cap_swap_backed(mapping)) { /* normal file */ | ||
4298 | page = find_get_page(mapping, pgoff); | ||
4299 | } else { /* shmem/tmpfs file. we should take account of swap too. */ | ||
4300 | swp_entry_t ent; | ||
4301 | mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); | ||
4302 | if (do_swap_account) | ||
4303 | entry->val = ent.val; | ||
4304 | } | ||
4305 | |||
4306 | return page; | ||
4307 | } | ||
4308 | |||
3979 | static int is_target_pte_for_mc(struct vm_area_struct *vma, | 4309 | static int is_target_pte_for_mc(struct vm_area_struct *vma, |
3980 | unsigned long addr, pte_t ptent, union mc_target *target) | 4310 | unsigned long addr, pte_t ptent, union mc_target *target) |
3981 | { | 4311 | { |
@@ -3983,43 +4313,16 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
3983 | struct page_cgroup *pc; | 4313 | struct page_cgroup *pc; |
3984 | int ret = 0; | 4314 | int ret = 0; |
3985 | swp_entry_t ent = { .val = 0 }; | 4315 | swp_entry_t ent = { .val = 0 }; |
3986 | int usage_count = 0; | ||
3987 | bool move_anon = test_bit(MOVE_CHARGE_TYPE_ANON, | ||
3988 | &mc.to->move_charge_at_immigrate); | ||
3989 | 4316 | ||
3990 | if (!pte_present(ptent)) { | 4317 | if (pte_present(ptent)) |
3991 | /* TODO: handle swap of shmes/tmpfs */ | 4318 | page = mc_handle_present_pte(vma, addr, ptent); |
3992 | if (pte_none(ptent) || pte_file(ptent)) | 4319 | else if (is_swap_pte(ptent)) |
3993 | return 0; | 4320 | page = mc_handle_swap_pte(vma, addr, ptent, &ent); |
3994 | else if (is_swap_pte(ptent)) { | 4321 | else if (pte_none(ptent) || pte_file(ptent)) |
3995 | ent = pte_to_swp_entry(ptent); | 4322 | page = mc_handle_file_pte(vma, addr, ptent, &ent); |
3996 | if (!move_anon || non_swap_entry(ent)) | 4323 | |
3997 | return 0; | 4324 | if (!page && !ent.val) |
3998 | usage_count = mem_cgroup_count_swap_user(ent, &page); | ||
3999 | } | ||
4000 | } else { | ||
4001 | page = vm_normal_page(vma, addr, ptent); | ||
4002 | if (!page || !page_mapped(page)) | ||
4003 | return 0; | ||
4004 | /* | ||
4005 | * TODO: We don't move charges of file(including shmem/tmpfs) | ||
4006 | * pages for now. | ||
4007 | */ | ||
4008 | if (!move_anon || !PageAnon(page)) | ||
4009 | return 0; | ||
4010 | if (!get_page_unless_zero(page)) | ||
4011 | return 0; | ||
4012 | usage_count = page_mapcount(page); | ||
4013 | } | ||
4014 | if (usage_count > 1) { | ||
4015 | /* | ||
4016 | * TODO: We don't move charges of shared(used by multiple | ||
4017 | * processes) pages for now. | ||
4018 | */ | ||
4019 | if (page) | ||
4020 | put_page(page); | ||
4021 | return 0; | 4325 | return 0; |
4022 | } | ||
4023 | if (page) { | 4326 | if (page) { |
4024 | pc = lookup_page_cgroup(page); | 4327 | pc = lookup_page_cgroup(page); |
4025 | /* | 4328 | /* |
@@ -4035,8 +4338,8 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, | |||
4035 | if (!ret || !target) | 4338 | if (!ret || !target) |
4036 | put_page(page); | 4339 | put_page(page); |
4037 | } | 4340 | } |
4038 | /* throught */ | 4341 | /* There is a swap entry and a page doesn't exist or isn't charged */ |
4039 | if (ent.val && do_swap_account && !ret && | 4342 | if (ent.val && !ret && |
4040 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { | 4343 | css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { |
4041 | ret = MC_TARGET_SWAP; | 4344 | ret = MC_TARGET_SWAP; |
4042 | if (target) | 4345 | if (target) |
@@ -4077,9 +4380,6 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) | |||
4077 | }; | 4380 | }; |
4078 | if (is_vm_hugetlb_page(vma)) | 4381 | if (is_vm_hugetlb_page(vma)) |
4079 | continue; | 4382 | continue; |
4080 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4081 | if (vma->vm_flags & VM_SHARED) | ||
4082 | continue; | ||
4083 | walk_page_range(vma->vm_start, vma->vm_end, | 4383 | walk_page_range(vma->vm_start, vma->vm_end, |
4084 | &mem_cgroup_count_precharge_walk); | 4384 | &mem_cgroup_count_precharge_walk); |
4085 | } | 4385 | } |
@@ -4102,6 +4402,7 @@ static void mem_cgroup_clear_mc(void) | |||
4102 | if (mc.precharge) { | 4402 | if (mc.precharge) { |
4103 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); | 4403 | __mem_cgroup_cancel_charge(mc.to, mc.precharge); |
4104 | mc.precharge = 0; | 4404 | mc.precharge = 0; |
4405 | memcg_oom_recover(mc.to); | ||
4105 | } | 4406 | } |
4106 | /* | 4407 | /* |
4107 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so | 4408 | * we didn't uncharge from mc.from at mem_cgroup_move_account(), so |
@@ -4110,6 +4411,7 @@ static void mem_cgroup_clear_mc(void) | |||
4110 | if (mc.moved_charge) { | 4411 | if (mc.moved_charge) { |
4111 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); | 4412 | __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); |
4112 | mc.moved_charge = 0; | 4413 | mc.moved_charge = 0; |
4414 | memcg_oom_recover(mc.from); | ||
4113 | } | 4415 | } |
4114 | /* we must fixup refcnts and charges */ | 4416 | /* we must fixup refcnts and charges */ |
4115 | if (mc.moved_swap) { | 4417 | if (mc.moved_swap) { |
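A note on the two memcg_oom_recover() calls added above: both run right after usage in the affected group has just been reduced by cancelling charges. The snippet below is a hedged sketch of that pairing only (names as used in this hunk, written as if inside mm/memcontrol.c); it is not the implementation of memcg_oom_recover() itself, which this hunk does not show.

static void sketch_cancel_and_recover(struct mem_cgroup *memcg,
                                      unsigned long nr_pages)
{
        /* giving charges back lowers usage in this group ... */
        __mem_cgroup_cancel_charge(memcg, nr_pages);
        /* ... so tasks parked in its OOM path may now make progress */
        memcg_oom_recover(memcg);
}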
@@ -4274,9 +4576,6 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) | |||
4274 | }; | 4576 | }; |
4275 | if (is_vm_hugetlb_page(vma)) | 4577 | if (is_vm_hugetlb_page(vma)) |
4276 | continue; | 4578 | continue; |
4277 | /* TODO: We don't move charges of shmem/tmpfs pages for now. */ | ||
4278 | if (vma->vm_flags & VM_SHARED) | ||
4279 | continue; | ||
4280 | ret = walk_page_range(vma->vm_start, vma->vm_end, | 4579 | ret = walk_page_range(vma->vm_start, vma->vm_end, |
4281 | &mem_cgroup_move_charge_walk); | 4580 | &mem_cgroup_move_charge_walk); |
4282 | if (ret) | 4581 | if (ret) |
diff --git a/mm/migrate.c b/mm/migrate.c index 09e2471afa0f..4205b1d6049e 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
@@ -590,7 +590,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, | |||
590 | } | 590 | } |
591 | 591 | ||
592 | /* charge against new page */ | 592 | /* charge against new page */ |
593 | charge = mem_cgroup_prepare_migration(page, &mem); | 593 | charge = mem_cgroup_prepare_migration(page, newpage, &mem); |
594 | if (charge == -ENOMEM) { | 594 | if (charge == -ENOMEM) { |
595 | rc = -ENOMEM; | 595 | rc = -ENOMEM; |
596 | goto unlock; | 596 | goto unlock; |
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b68e802a7a7d..709aedfaa014 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
@@ -479,12 +479,9 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) | |||
479 | read_lock(&tasklist_lock); | 479 | read_lock(&tasklist_lock); |
480 | retry: | 480 | retry: |
481 | p = select_bad_process(&points, mem); | 481 | p = select_bad_process(&points, mem); |
482 | if (PTR_ERR(p) == -1UL) | 482 | if (!p || PTR_ERR(p) == -1UL) |
483 | goto out; | 483 | goto out; |
484 | 484 | ||
485 | if (!p) | ||
486 | p = current; | ||
487 | |||
488 | if (oom_kill_process(p, gfp_mask, 0, points, mem, | 485 | if (oom_kill_process(p, gfp_mask, 0, points, mem, |
489 | "Memory cgroup out of memory")) | 486 | "Memory cgroup out of memory")) |
490 | goto retry; | 487 | goto retry; |
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08b349931ebc..431214b941ac 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c | |||
@@ -57,6 +57,22 @@ | |||
57 | #include <asm/div64.h> | 57 | #include <asm/div64.h> |
58 | #include "internal.h" | 58 | #include "internal.h" |
59 | 59 | ||
60 | #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID | ||
61 | DEFINE_PER_CPU(int, numa_node); | ||
62 | EXPORT_PER_CPU_SYMBOL(numa_node); | ||
63 | #endif | ||
64 | |||
65 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
66 | /* | ||
67 | * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. | ||
68 | * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. | ||
69 | * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() | ||
70 | * defined in <linux/topology.h>. | ||
71 | */ | ||
72 | DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ | ||
73 | EXPORT_PER_CPU_SYMBOL(_numa_mem_); | ||
74 | #endif | ||
75 | |||
60 | /* | 76 | /* |
61 | * Array of node states. | 77 | * Array of node states. |
62 | */ | 78 | */ |
@@ -2856,6 +2872,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) | |||
2856 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); | 2872 | zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); |
2857 | } | 2873 | } |
2858 | 2874 | ||
2875 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
2876 | /* | ||
2877 | * Return node id of node used for "local" allocations. | ||
2878 | * I.e., first node id of first zone in arg node's generic zonelist. | ||
2879 | * Used for initializing percpu 'numa_mem', which is used primarily | ||
2880 | * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. | ||
2881 | */ | ||
2882 | int local_memory_node(int node) | ||
2883 | { | ||
2884 | struct zone *zone; | ||
2885 | |||
2886 | (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), | ||
2887 | gfp_zone(GFP_KERNEL), | ||
2888 | NULL, | ||
2889 | &zone); | ||
2890 | return zone->node; | ||
2891 | } | ||
2892 | #endif | ||
2859 | 2893 | ||
2860 | #else /* CONFIG_NUMA */ | 2894 | #else /* CONFIG_NUMA */ |
2861 | 2895 | ||
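The local_memory_node() helper added above computes, for each node, the nearest node that actually has memory; it feeds the percpu numa_mem variable, and the comment points callers at the numa_mem_id()/cpu_to_mem() accessors in <linux/topology.h>. A small hedged usage sketch (not part of the patch; the function name here is made up) of how allocation code is expected to use those accessors instead of numa_node_id()/cpu_to_node():

#include <linux/topology.h>
#include <linux/slab.h>

/* Allocate near the calling CPU even when its home node is memoryless:
 * numa_mem_id() resolves to the fallback node picked by local_memory_node(). */
static void *alloc_near_this_cpu(size_t size, gfp_t gfp)
{
        return kmalloc_node(size, gfp, numa_mem_id());
}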
@@ -2970,9 +3004,23 @@ static __init_refok int __build_all_zonelists(void *data) | |||
2970 | * needs the percpu allocator in order to allocate its pagesets | 3004 | * needs the percpu allocator in order to allocate its pagesets |
2971 | * (a chicken-egg dilemma). | 3005 | * (a chicken-egg dilemma). |
2972 | */ | 3006 | */ |
2973 | for_each_possible_cpu(cpu) | 3007 | for_each_possible_cpu(cpu) { |
2974 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); | 3008 | setup_pageset(&per_cpu(boot_pageset, cpu), 0); |
2975 | 3009 | ||
3010 | #ifdef CONFIG_HAVE_MEMORYLESS_NODES | ||
3011 | /* | ||
3012 | * We now know the "local memory node" for each node-- | ||
3013 | * i.e., the node of the first zone in the generic zonelist. | ||
3014 | * Set up numa_mem percpu variable for on-line cpus. During | ||
3015 | * boot, only the boot cpu should be on-line; we'll init the | ||
3016 | * secondary cpus' numa_mem as they come on-line. During | ||
3017 | * node/memory hotplug, we'll fixup all on-line cpus. | ||
3018 | */ | ||
3019 | if (cpu_online(cpu)) | ||
3020 | set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); | ||
3021 | #endif | ||
3022 | } | ||
3023 | |||
2976 | return 0; | 3024 | return 0; |
2977 | } | 3025 | } |
2978 | 3026 | ||
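The boot-time loop above only initializes numa_mem for CPUs that are already online; per its comment, secondary CPUs and hotplug are handled as they come up. The notifier below is a hedged sketch of what such a hook could look like; it is hypothetical, not taken from the patch:

#include <linux/cpu.h>
#include <linux/notifier.h>

static int __cpuinit numa_mem_cpu_callback(struct notifier_block *nb,
                                           unsigned long action, void *hcpu)
{
        int cpu = (long)hcpu;

        if (action == CPU_UP_PREPARE || action == CPU_UP_PREPARE_FROZEN)
                /* same initialization the boot loop does, per onlined CPU */
                set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu)));
        return NOTIFY_OK;
}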
diff --git a/mm/shmem.c b/mm/shmem.c index 4ef9797bd430..7e5030ae18ff 100644 --- a/mm/shmem.c +++ b/mm/shmem.c | |||
@@ -727,10 +727,11 @@ done2: | |||
727 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { | 727 | if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { |
728 | /* | 728 | /* |
729 | * Call truncate_inode_pages again: racing shmem_unuse_inode | 729 | * Call truncate_inode_pages again: racing shmem_unuse_inode |
730 | * may have swizzled a page in from swap since vmtruncate or | 730 | * may have swizzled a page in from swap since |
731 | * generic_delete_inode did it, before we lowered next_index. | 731 | * truncate_pagecache or generic_delete_inode did it, before we |
732 | * Also, though shmem_getpage checks i_size before adding to | 732 | * lowered next_index. Also, though shmem_getpage checks |
733 | * cache, no recheck after: so fix the narrow window there too. | 733 | * i_size before adding to cache, no recheck after: so fix the |
734 | * narrow window there too. | ||
734 | * | 735 | * |
735 | * Recalling truncate_inode_pages_range and unmap_mapping_range | 736 | * Recalling truncate_inode_pages_range and unmap_mapping_range |
736 | * every time for punch_hole (which never got a chance to clear | 737 | * every time for punch_hole (which never got a chance to clear |
@@ -760,19 +761,16 @@ done2: | |||
760 | } | 761 | } |
761 | } | 762 | } |
762 | 763 | ||
763 | static void shmem_truncate(struct inode *inode) | ||
764 | { | ||
765 | shmem_truncate_range(inode, inode->i_size, (loff_t)-1); | ||
766 | } | ||
767 | |||
768 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | 764 | static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) |
769 | { | 765 | { |
770 | struct inode *inode = dentry->d_inode; | 766 | struct inode *inode = dentry->d_inode; |
771 | struct page *page = NULL; | ||
772 | int error; | 767 | int error; |
773 | 768 | ||
774 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { | 769 | if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { |
775 | if (attr->ia_size < inode->i_size) { | 770 | loff_t newsize = attr->ia_size; |
771 | struct page *page = NULL; | ||
772 | |||
773 | if (newsize < inode->i_size) { | ||
776 | /* | 774 | /* |
777 | * If truncating down to a partial page, then | 775 | * If truncating down to a partial page, then |
778 | * if that page is already allocated, hold it | 776 | * if that page is already allocated, hold it |
@@ -780,9 +778,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
780 | * truncate_partial_page cannot miss it were | 778 |
781 | * it assigned to swap. | 779 | * it assigned to swap. |
782 | */ | 780 | */ |
783 | if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { | 781 | if (newsize & (PAGE_CACHE_SIZE-1)) { |
784 | (void) shmem_getpage(inode, | 782 | (void) shmem_getpage(inode, |
785 | attr->ia_size>>PAGE_CACHE_SHIFT, | 783 | newsize >> PAGE_CACHE_SHIFT, |
786 | &page, SGP_READ, NULL); | 784 | &page, SGP_READ, NULL); |
787 | if (page) | 785 | if (page) |
788 | unlock_page(page); | 786 | unlock_page(page); |
@@ -794,24 +792,29 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) | |||
794 | * if it's being fully truncated to zero-length: the | 792 | * if it's being fully truncated to zero-length: the |
795 | * nrpages check is efficient enough in that case. | 793 | * nrpages check is efficient enough in that case. |
796 | */ | 794 | */ |
797 | if (attr->ia_size) { | 795 | if (newsize) { |
798 | struct shmem_inode_info *info = SHMEM_I(inode); | 796 | struct shmem_inode_info *info = SHMEM_I(inode); |
799 | spin_lock(&info->lock); | 797 | spin_lock(&info->lock); |
800 | info->flags &= ~SHMEM_PAGEIN; | 798 | info->flags &= ~SHMEM_PAGEIN; |
801 | spin_unlock(&info->lock); | 799 | spin_unlock(&info->lock); |
802 | } | 800 | } |
803 | } | 801 | } |
802 | |||
803 | error = simple_setsize(inode, newsize); | ||
804 | if (page) | ||
805 | page_cache_release(page); | ||
806 | if (error) | ||
807 | return error; | ||
808 | shmem_truncate_range(inode, newsize, (loff_t)-1); | ||
804 | } | 809 | } |
805 | 810 | ||
806 | error = inode_change_ok(inode, attr); | 811 | error = inode_change_ok(inode, attr); |
807 | if (!error) | 812 | if (!error) |
808 | error = inode_setattr(inode, attr); | 813 | generic_setattr(inode, attr); |
809 | #ifdef CONFIG_TMPFS_POSIX_ACL | 814 | #ifdef CONFIG_TMPFS_POSIX_ACL |
810 | if (!error && (attr->ia_valid & ATTR_MODE)) | 815 | if (!error && (attr->ia_valid & ATTR_MODE)) |
811 | error = generic_acl_chmod(inode); | 816 | error = generic_acl_chmod(inode); |
812 | #endif | 817 | #endif |
813 | if (page) | ||
814 | page_cache_release(page); | ||
815 | return error; | 818 | return error; |
816 | } | 819 | } |
817 | 820 | ||
@@ -819,11 +822,11 @@ static void shmem_delete_inode(struct inode *inode) | |||
819 | { | 822 | { |
820 | struct shmem_inode_info *info = SHMEM_I(inode); | 823 | struct shmem_inode_info *info = SHMEM_I(inode); |
821 | 824 | ||
822 | if (inode->i_op->truncate == shmem_truncate) { | 825 | if (inode->i_mapping->a_ops == &shmem_aops) { |
823 | truncate_inode_pages(inode->i_mapping, 0); | 826 | truncate_inode_pages(inode->i_mapping, 0); |
824 | shmem_unacct_size(info->flags, inode->i_size); | 827 | shmem_unacct_size(info->flags, inode->i_size); |
825 | inode->i_size = 0; | 828 | inode->i_size = 0; |
826 | shmem_truncate(inode); | 829 | shmem_truncate_range(inode, 0, (loff_t)-1); |
827 | if (!list_empty(&info->swaplist)) { | 830 | if (!list_empty(&info->swaplist)) { |
828 | mutex_lock(&shmem_swaplist_mutex); | 831 | mutex_lock(&shmem_swaplist_mutex); |
829 | list_del_init(&info->swaplist); | 832 | list_del_init(&info->swaplist); |
@@ -2022,7 +2025,6 @@ static const struct inode_operations shmem_symlink_inline_operations = { | |||
2022 | }; | 2025 | }; |
2023 | 2026 | ||
2024 | static const struct inode_operations shmem_symlink_inode_operations = { | 2027 | static const struct inode_operations shmem_symlink_inode_operations = { |
2025 | .truncate = shmem_truncate, | ||
2026 | .readlink = generic_readlink, | 2028 | .readlink = generic_readlink, |
2027 | .follow_link = shmem_follow_link, | 2029 | .follow_link = shmem_follow_link, |
2028 | .put_link = shmem_put_link, | 2030 | .put_link = shmem_put_link, |
@@ -2433,14 +2435,13 @@ static const struct file_operations shmem_file_operations = { | |||
2433 | .write = do_sync_write, | 2435 | .write = do_sync_write, |
2434 | .aio_read = shmem_file_aio_read, | 2436 | .aio_read = shmem_file_aio_read, |
2435 | .aio_write = generic_file_aio_write, | 2437 | .aio_write = generic_file_aio_write, |
2436 | .fsync = simple_sync_file, | 2438 | .fsync = noop_fsync, |
2437 | .splice_read = generic_file_splice_read, | 2439 | .splice_read = generic_file_splice_read, |
2438 | .splice_write = generic_file_splice_write, | 2440 | .splice_write = generic_file_splice_write, |
2439 | #endif | 2441 | #endif |
2440 | }; | 2442 | }; |
2441 | 2443 | ||
2442 | static const struct inode_operations shmem_inode_operations = { | 2444 | static const struct inode_operations shmem_inode_operations = { |
2443 | .truncate = shmem_truncate, | ||
2444 | .setattr = shmem_notify_change, | 2445 | .setattr = shmem_notify_change, |
2445 | .truncate_range = shmem_truncate_range, | 2446 | .truncate_range = shmem_truncate_range, |
2446 | #ifdef CONFIG_TMPFS_POSIX_ACL | 2447 | #ifdef CONFIG_TMPFS_POSIX_ACL |
@@ -2559,6 +2560,45 @@ out4: | |||
2559 | return error; | 2560 | return error; |
2560 | } | 2561 | } |
2561 | 2562 | ||
2563 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2564 | /** | ||
2565 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2566 | * @inode: the inode to be searched | ||
2567 | * @pgoff: the offset to be searched | ||
2568 | * @pagep: the pointer for the found page to be stored | ||
2569 | * @ent: the pointer for the found swap entry to be stored | ||
2570 | * | ||
2571 | * If a page is found, its refcount is incremented. Callers should handle | ||
2572 | * the refcount. | ||
2573 | */ | ||
2574 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2575 | struct page **pagep, swp_entry_t *ent) | ||
2576 | { | ||
2577 | swp_entry_t entry = { .val = 0 }, *ptr; | ||
2578 | struct page *page = NULL; | ||
2579 | struct shmem_inode_info *info = SHMEM_I(inode); | ||
2580 | |||
2581 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2582 | goto out; | ||
2583 | |||
2584 | spin_lock(&info->lock); | ||
2585 | ptr = shmem_swp_entry(info, pgoff, NULL); | ||
2586 | #ifdef CONFIG_SWAP | ||
2587 | if (ptr && ptr->val) { | ||
2588 | entry.val = ptr->val; | ||
2589 | page = find_get_page(&swapper_space, entry.val); | ||
2590 | } else | ||
2591 | #endif | ||
2592 | page = find_get_page(inode->i_mapping, pgoff); | ||
2593 | if (ptr) | ||
2594 | shmem_swp_unmap(ptr); | ||
2595 | spin_unlock(&info->lock); | ||
2596 | out: | ||
2597 | *pagep = page; | ||
2598 | *ent = entry; | ||
2599 | } | ||
2600 | #endif | ||
2601 | |||
2562 | #else /* !CONFIG_SHMEM */ | 2602 | #else /* !CONFIG_SHMEM */ |
2563 | 2603 | ||
2564 | /* | 2604 | /* |
@@ -2598,6 +2638,31 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) | |||
2598 | return 0; | 2638 | return 0; |
2599 | } | 2639 | } |
2600 | 2640 | ||
2641 | #ifdef CONFIG_CGROUP_MEM_RES_CTLR | ||
2642 | /** | ||
2643 | * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file | ||
2644 | * @inode: the inode to be searched | ||
2645 | * @pgoff: the offset to be searched | ||
2646 | * @pagep: the pointer for the found page to be stored | ||
2647 | * @ent: the pointer for the found swap entry to be stored | ||
2648 | * | ||
2649 | * If a page is found, its refcount is incremented. Callers should handle | ||
2650 | * the refcount. | ||
2651 | */ | ||
2652 | void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, | ||
2653 | struct page **pagep, swp_entry_t *ent) | ||
2654 | { | ||
2655 | struct page *page = NULL; | ||
2656 | |||
2657 | if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) | ||
2658 | goto out; | ||
2659 | page = find_get_page(inode->i_mapping, pgoff); | ||
2660 | out: | ||
2661 | *pagep = page; | ||
2662 | *ent = (swp_entry_t){ .val = 0 }; | ||
2663 | } | ||
2664 | #endif | ||
2665 | |||
2601 | #define shmem_vm_ops generic_file_vm_ops | 2666 | #define shmem_vm_ops generic_file_vm_ops |
2602 | #define shmem_file_operations ramfs_file_operations | 2667 | #define shmem_file_operations ramfs_file_operations |
2603 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) | 2668 | #define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) |
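Both versions of mem_cgroup_get_shmem_target() above (the CONFIG_SHMEM one and the stub) follow the same contract: on return, *pagep may hold a referenced page and *ent may carry a swap entry; past EOF both stay empty. A short hedged sketch of the caller side; the function name is invented for illustration, the real consumer being the memcg file-pte handler:

static void sketch_probe_shmem_offset(struct inode *inode, pgoff_t pgoff)
{
        struct page *page;
        swp_entry_t ent;

        mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent);
        if (page) {
                /* a reference was taken for us; drop it when done */
                page_cache_release(page);
        } else if (ent.val) {
                /* the data at pgoff is currently out on swap */
        }
}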
diff --git a/mm/slab.c b/mm/slab.c --- a/mm/slab.c +++ b/mm/slab.c | |||
@@ -821,7 +821,7 @@ static void init_reap_node(int cpu) | |||
821 | { | 821 | { |
822 | int node; | 822 | int node; |
823 | 823 | ||
824 | node = next_node(cpu_to_node(cpu), node_online_map); | 824 | node = next_node(cpu_to_mem(cpu), node_online_map); |
825 | if (node == MAX_NUMNODES) | 825 | if (node == MAX_NUMNODES) |
826 | node = first_node(node_online_map); | 826 | node = first_node(node_online_map); |
827 | 827 | ||
@@ -1050,7 +1050,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) | |||
1050 | struct array_cache *alien = NULL; | 1050 | struct array_cache *alien = NULL; |
1051 | int node; | 1051 | int node; |
1052 | 1052 | ||
1053 | node = numa_node_id(); | 1053 | node = numa_mem_id(); |
1054 | 1054 | ||
1055 | /* | 1055 | /* |
1056 | * Make sure we are not freeing a object from another node to the array | 1056 | * Make sure we are not freeing a object from another node to the array |
@@ -1129,7 +1129,7 @@ static void __cpuinit cpuup_canceled(long cpu) | |||
1129 | { | 1129 | { |
1130 | struct kmem_cache *cachep; | 1130 | struct kmem_cache *cachep; |
1131 | struct kmem_list3 *l3 = NULL; | 1131 | struct kmem_list3 *l3 = NULL; |
1132 | int node = cpu_to_node(cpu); | 1132 | int node = cpu_to_mem(cpu); |
1133 | const struct cpumask *mask = cpumask_of_node(node); | 1133 | const struct cpumask *mask = cpumask_of_node(node); |
1134 | 1134 | ||
1135 | list_for_each_entry(cachep, &cache_chain, next) { | 1135 | list_for_each_entry(cachep, &cache_chain, next) { |
@@ -1194,7 +1194,7 @@ static int __cpuinit cpuup_prepare(long cpu) | |||
1194 | { | 1194 | { |
1195 | struct kmem_cache *cachep; | 1195 | struct kmem_cache *cachep; |
1196 | struct kmem_list3 *l3 = NULL; | 1196 | struct kmem_list3 *l3 = NULL; |
1197 | int node = cpu_to_node(cpu); | 1197 | int node = cpu_to_mem(cpu); |
1198 | int err; | 1198 | int err; |
1199 | 1199 | ||
1200 | /* | 1200 | /* |
@@ -1321,7 +1321,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, | |||
1321 | mutex_unlock(&cache_chain_mutex); | 1321 | mutex_unlock(&cache_chain_mutex); |
1322 | break; | 1322 | break; |
1323 | } | 1323 | } |
1324 | return err ? NOTIFY_BAD : NOTIFY_OK; | 1324 | return notifier_from_errno(err); |
1325 | } | 1325 | } |
1326 | 1326 | ||
1327 | static struct notifier_block __cpuinitdata cpucache_notifier = { | 1327 | static struct notifier_block __cpuinitdata cpucache_notifier = { |
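cpuup_callback() now returns notifier_from_errno(err) instead of collapsing every failure into NOTIFY_BAD; the helper packs the errno into the notifier return value so the caller can recover it with notifier_to_errno(). A standalone hedged illustration, not part of the patch:

#include <linux/notifier.h>

/* Round-trip an errno through the notifier return-value encoding. */
static int sketch_notifier_roundtrip(int err)
{
        int ret = notifier_from_errno(err);     /* 0 maps to NOTIFY_OK */

        return notifier_to_errno(ret);          /* recovers err, or 0 on success */
}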
@@ -1479,7 +1479,7 @@ void __init kmem_cache_init(void) | |||
1479 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. | 1479 | * 6) Resize the head arrays of the kmalloc caches to their final sizes. |
1480 | */ | 1480 | */ |
1481 | 1481 | ||
1482 | node = numa_node_id(); | 1482 | node = numa_mem_id(); |
1483 | 1483 | ||
1484 | /* 1) create the cache_cache */ | 1484 | /* 1) create the cache_cache */ |
1485 | INIT_LIST_HEAD(&cache_chain); | 1485 | INIT_LIST_HEAD(&cache_chain); |
@@ -2121,7 +2121,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) | |||
2121 | } | 2121 | } |
2122 | } | 2122 | } |
2123 | } | 2123 | } |
2124 | cachep->nodelists[numa_node_id()]->next_reap = | 2124 | cachep->nodelists[numa_mem_id()]->next_reap = |
2125 | jiffies + REAPTIMEOUT_LIST3 + | 2125 | jiffies + REAPTIMEOUT_LIST3 + |
2126 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; | 2126 | ((unsigned long)cachep) % REAPTIMEOUT_LIST3; |
2127 | 2127 | ||
@@ -2452,7 +2452,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) | |||
2452 | { | 2452 | { |
2453 | #ifdef CONFIG_SMP | 2453 | #ifdef CONFIG_SMP |
2454 | check_irq_off(); | 2454 | check_irq_off(); |
2455 | assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); | 2455 | assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); |
2456 | #endif | 2456 | #endif |
2457 | } | 2457 | } |
2458 | 2458 | ||
@@ -2479,7 +2479,7 @@ static void do_drain(void *arg) | |||
2479 | { | 2479 | { |
2480 | struct kmem_cache *cachep = arg; | 2480 | struct kmem_cache *cachep = arg; |
2481 | struct array_cache *ac; | 2481 | struct array_cache *ac; |
2482 | int node = numa_node_id(); | 2482 | int node = numa_mem_id(); |
2483 | 2483 | ||
2484 | check_irq_off(); | 2484 | check_irq_off(); |
2485 | ac = cpu_cache_get(cachep); | 2485 | ac = cpu_cache_get(cachep); |
@@ -3012,7 +3012,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) | |||
3012 | 3012 | ||
3013 | retry: | 3013 | retry: |
3014 | check_irq_off(); | 3014 | check_irq_off(); |
3015 | node = numa_node_id(); | 3015 | node = numa_mem_id(); |
3016 | ac = cpu_cache_get(cachep); | 3016 | ac = cpu_cache_get(cachep); |
3017 | batchcount = ac->batchcount; | 3017 | batchcount = ac->batchcount; |
3018 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { | 3018 | if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { |
@@ -3216,10 +3216,10 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) | |||
3216 | 3216 | ||
3217 | if (in_interrupt() || (flags & __GFP_THISNODE)) | 3217 | if (in_interrupt() || (flags & __GFP_THISNODE)) |
3218 | return NULL; | 3218 | return NULL; |
3219 | nid_alloc = nid_here = numa_node_id(); | 3219 | nid_alloc = nid_here = numa_mem_id(); |
3220 | get_mems_allowed(); | 3220 | get_mems_allowed(); |
3221 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) | 3221 | if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) |
3222 | nid_alloc = cpuset_mem_spread_node(); | 3222 | nid_alloc = cpuset_slab_spread_node(); |
3223 | else if (current->mempolicy) | 3223 | else if (current->mempolicy) |
3224 | nid_alloc = slab_node(current->mempolicy); | 3224 | nid_alloc = slab_node(current->mempolicy); |
3225 | put_mems_allowed(); | 3225 | put_mems_allowed(); |
@@ -3281,7 +3281,7 @@ retry: | |||
3281 | if (local_flags & __GFP_WAIT) | 3281 | if (local_flags & __GFP_WAIT) |
3282 | local_irq_enable(); | 3282 | local_irq_enable(); |
3283 | kmem_flagcheck(cache, flags); | 3283 | kmem_flagcheck(cache, flags); |
3284 | obj = kmem_getpages(cache, local_flags, numa_node_id()); | 3284 | obj = kmem_getpages(cache, local_flags, numa_mem_id()); |
3285 | if (local_flags & __GFP_WAIT) | 3285 | if (local_flags & __GFP_WAIT) |
3286 | local_irq_disable(); | 3286 | local_irq_disable(); |
3287 | if (obj) { | 3287 | if (obj) { |
@@ -3389,6 +3389,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3389 | { | 3389 | { |
3390 | unsigned long save_flags; | 3390 | unsigned long save_flags; |
3391 | void *ptr; | 3391 | void *ptr; |
3392 | int slab_node = numa_mem_id(); | ||
3392 | 3393 | ||
3393 | flags &= gfp_allowed_mask; | 3394 | flags &= gfp_allowed_mask; |
3394 | 3395 | ||
@@ -3401,7 +3402,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3401 | local_irq_save(save_flags); | 3402 | local_irq_save(save_flags); |
3402 | 3403 | ||
3403 | if (nodeid == -1) | 3404 | if (nodeid == -1) |
3404 | nodeid = numa_node_id(); | 3405 | nodeid = slab_node; |
3405 | 3406 | ||
3406 | if (unlikely(!cachep->nodelists[nodeid])) { | 3407 | if (unlikely(!cachep->nodelists[nodeid])) { |
3407 | /* Node not bootstrapped yet */ | 3408 | /* Node not bootstrapped yet */ |
@@ -3409,7 +3410,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, | |||
3409 | goto out; | 3410 | goto out; |
3410 | } | 3411 | } |
3411 | 3412 | ||
3412 | if (nodeid == numa_node_id()) { | 3413 | if (nodeid == slab_node) { |
3413 | /* | 3414 | /* |
3414 | * Use the locally cached objects if possible. | 3415 | * Use the locally cached objects if possible. |
3415 | * However ____cache_alloc does not allow fallback | 3416 | * However ____cache_alloc does not allow fallback |
@@ -3453,8 +3454,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) | |||
3453 | * We may just have run out of memory on the local node. | 3454 | * We may just have run out of memory on the local node. |
3454 | * ____cache_alloc_node() knows how to locate memory on other nodes | 3455 | * ____cache_alloc_node() knows how to locate memory on other nodes |
3455 | */ | 3456 | */ |
3456 | if (!objp) | 3457 | if (!objp) |
3457 | objp = ____cache_alloc_node(cache, flags, numa_node_id()); | 3458 | objp = ____cache_alloc_node(cache, flags, numa_mem_id()); |
3458 | 3459 | ||
3459 | out: | 3460 | out: |
3460 | return objp; | 3461 | return objp; |
@@ -3551,7 +3552,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) | |||
3551 | { | 3552 | { |
3552 | int batchcount; | 3553 | int batchcount; |
3553 | struct kmem_list3 *l3; | 3554 | struct kmem_list3 *l3; |
3554 | int node = numa_node_id(); | 3555 | int node = numa_mem_id(); |
3555 | 3556 | ||
3556 | batchcount = ac->batchcount; | 3557 | batchcount = ac->batchcount; |
3557 | #if DEBUG | 3558 | #if DEBUG |
@@ -3985,7 +3986,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
3985 | return -ENOMEM; | 3986 | return -ENOMEM; |
3986 | 3987 | ||
3987 | for_each_online_cpu(i) { | 3988 | for_each_online_cpu(i) { |
3988 | new->new[i] = alloc_arraycache(cpu_to_node(i), limit, | 3989 | new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, |
3989 | batchcount, gfp); | 3990 | batchcount, gfp); |
3990 | if (!new->new[i]) { | 3991 | if (!new->new[i]) { |
3991 | for (i--; i >= 0; i--) | 3992 | for (i--; i >= 0; i--) |
@@ -4007,9 +4008,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, | |||
4007 | struct array_cache *ccold = new->new[i]; | 4008 | struct array_cache *ccold = new->new[i]; |
4008 | if (!ccold) | 4009 | if (!ccold) |
4009 | continue; | 4010 | continue; |
4010 | spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 4011 | spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); |
4011 | free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); | 4012 | free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); |
4012 | spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); | 4013 | spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); |
4013 | kfree(ccold); | 4014 | kfree(ccold); |
4014 | } | 4015 | } |
4015 | kfree(new); | 4016 | kfree(new); |
@@ -4115,7 +4116,7 @@ static void cache_reap(struct work_struct *w) | |||
4115 | { | 4116 | { |
4116 | struct kmem_cache *searchp; | 4117 | struct kmem_cache *searchp; |
4117 | struct kmem_list3 *l3; | 4118 | struct kmem_list3 *l3; |
4118 | int node = numa_node_id(); | 4119 | int node = numa_mem_id(); |
4119 | struct delayed_work *work = to_delayed_work(w); | 4120 | struct delayed_work *work = to_delayed_work(w); |
4120 | 4121 | ||
4121 | if (!mutex_trylock(&cache_chain_mutex)) | 4122 | if (!mutex_trylock(&cache_chain_mutex)) |
diff --git a/mm/slub.c b/mm/slub.c --- a/mm/slub.c +++ b/mm/slub.c | |||
@@ -2137,7 +2137,7 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
2137 | 2137 | ||
2138 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2138 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2139 | struct kmem_cache_node *n = s->node[node]; | 2139 | struct kmem_cache_node *n = s->node[node]; |
2140 | if (n && n != &s->local_node) | 2140 | if (n) |
2141 | kmem_cache_free(kmalloc_caches, n); | 2141 | kmem_cache_free(kmalloc_caches, n); |
2142 | s->node[node] = NULL; | 2142 | s->node[node] = NULL; |
2143 | } | 2143 | } |
@@ -2146,33 +2146,22 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) | |||
2146 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) | 2146 | static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) |
2147 | { | 2147 | { |
2148 | int node; | 2148 | int node; |
2149 | int local_node; | ||
2150 | |||
2151 | if (slab_state >= UP && (s < kmalloc_caches || | ||
2152 | s >= kmalloc_caches + KMALLOC_CACHES)) | ||
2153 | local_node = page_to_nid(virt_to_page(s)); | ||
2154 | else | ||
2155 | local_node = 0; | ||
2156 | 2149 | ||
2157 | for_each_node_state(node, N_NORMAL_MEMORY) { | 2150 | for_each_node_state(node, N_NORMAL_MEMORY) { |
2158 | struct kmem_cache_node *n; | 2151 | struct kmem_cache_node *n; |
2159 | 2152 | ||
2160 | if (local_node == node) | 2153 | if (slab_state == DOWN) { |
2161 | n = &s->local_node; | 2154 | early_kmem_cache_node_alloc(gfpflags, node); |
2162 | else { | 2155 | continue; |
2163 | if (slab_state == DOWN) { | 2156 | } |
2164 | early_kmem_cache_node_alloc(gfpflags, node); | 2157 | n = kmem_cache_alloc_node(kmalloc_caches, |
2165 | continue; | 2158 | gfpflags, node); |
2166 | } | ||
2167 | n = kmem_cache_alloc_node(kmalloc_caches, | ||
2168 | gfpflags, node); | ||
2169 | |||
2170 | if (!n) { | ||
2171 | free_kmem_cache_nodes(s); | ||
2172 | return 0; | ||
2173 | } | ||
2174 | 2159 | ||
2160 | if (!n) { | ||
2161 | free_kmem_cache_nodes(s); | ||
2162 | return 0; | ||
2175 | } | 2163 | } |
2164 | |||
2176 | s->node[node] = n; | 2165 | s->node[node] = n; |
2177 | init_kmem_cache_node(n, s); | 2166 | init_kmem_cache_node(n, s); |
2178 | } | 2167 | } |
diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c | |||
@@ -224,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) | |||
224 | ____pagevec_lru_add(pvec, lru); | 224 | ____pagevec_lru_add(pvec, lru); |
225 | put_cpu_var(lru_add_pvecs); | 225 | put_cpu_var(lru_add_pvecs); |
226 | } | 226 | } |
227 | EXPORT_SYMBOL(__lru_cache_add); | ||
227 | 228 | ||
228 | /** | 229 | /** |
229 | * lru_cache_add_lru - add a page to a page list | 230 | * lru_cache_add_lru - add a page to a page list |
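Exporting __lru_cache_add lets modular code reach the per-CPU LRU pagevecs through the inline wrappers in <linux/swap.h>. A hedged usage sketch, assuming the lru_cache_add_file()/lru_cache_add_anon() wrappers that expand to __lru_cache_add():

#include <linux/swap.h>
#include <linux/mm.h>

/* Publish a page a module has just added to the page cache onto the
 * inactive file LRU. */
static void sketch_publish_to_lru(struct page *page)
{
        lru_cache_add_file(page);       /* inline wrapper around __lru_cache_add() */
}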
diff --git a/mm/truncate.c b/mm/truncate.c index f42675a3615d..937571b8b233 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -548,18 +548,18 @@ EXPORT_SYMBOL(truncate_pagecache); | |||
548 | * NOTE! We have to be ready to update the memory sharing | 548 | * NOTE! We have to be ready to update the memory sharing |
549 | * between the file and the memory map for a potential last | 549 | * between the file and the memory map for a potential last |
550 | * incomplete page. Ugly, but necessary. | 550 | * incomplete page. Ugly, but necessary. |
551 | * | ||
552 | * This function is deprecated and simple_setsize or truncate_pagecache | ||
553 | * should be used instead. | ||
551 | */ | 554 | */ |
552 | int vmtruncate(struct inode *inode, loff_t offset) | 555 | int vmtruncate(struct inode *inode, loff_t offset) |
553 | { | 556 | { |
554 | loff_t oldsize; | ||
555 | int error; | 557 | int error; |
556 | 558 | ||
557 | error = inode_newsize_ok(inode, offset); | 559 | error = simple_setsize(inode, offset); |
558 | if (error) | 560 | if (error) |
559 | return error; | 561 | return error; |
560 | oldsize = inode->i_size; | 562 | |
561 | i_size_write(inode, offset); | ||
562 | truncate_pagecache(inode, oldsize, offset); | ||
563 | if (inode->i_op->truncate) | 563 | if (inode->i_op->truncate) |
564 | inode->i_op->truncate(inode); | 564 | inode->i_op->truncate(inode); |
565 | 565 | ||
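The lines removed from vmtruncate() above (inode_newsize_ok(), i_size_write(), truncate_pagecache()) are exactly the work the new simple_setsize() call is expected to take over, which is also why shmem_notify_change() earlier in this diff can call simple_setsize() and then shmem_truncate_range(). A hedged reconstruction of that helper, inferred from the removed lines rather than quoted from fs/libfs.c:

#include <linux/fs.h>
#include <linux/mm.h>

/* Sketch of the expected simple_setsize() behaviour (an assumption, not a
 * quote): validate the new size, update i_size, drop the stale pagecache. */
static int sketch_simple_setsize(struct inode *inode, loff_t newsize)
{
        loff_t oldsize;
        int error;

        error = inode_newsize_ok(inode, newsize);
        if (error)
                return error;

        oldsize = inode->i_size;
        i_size_write(inode, newsize);
        truncate_pagecache(inode, oldsize, newsize);
        return 0;
}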