Diffstat (limited to 'mm')
-rw-r--r--  mm/Makefile     |   1
-rw-r--r--  mm/filemap.c    |  15
-rw-r--r--  mm/memcontrol.c | 419
-rw-r--r--  mm/memory.c     |   2
-rw-r--r--  mm/oom_kill.c   |  75
-rw-r--r--  mm/page_alloc.c |  22
-rw-r--r--  mm/rmap.c       |   4
-rw-r--r--  mm/shmem.c      |  71
-rw-r--r--  mm/shmem_acl.c  | 171
-rw-r--r--  mm/truncate.c   |   6
10 files changed, 433 insertions, 353 deletions
diff --git a/mm/Makefile b/mm/Makefile
index 82131d0f8d85..7a68d2ab5560 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -22,7 +22,6 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o
 obj-$(CONFIG_NUMA) += mempolicy.o
 obj-$(CONFIG_SPARSEMEM) += sparse.o
 obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o
-obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o
 obj-$(CONFIG_SLOB) += slob.o
 obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o
 obj-$(CONFIG_KSM) += ksm.o
diff --git a/mm/filemap.c b/mm/filemap.c
index 8b4d88f9249e..96ac6b0eb6cb 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2240,7 +2240,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		size_t count, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
 	ssize_t status;
 	struct iov_iter i;
 
@@ -2252,15 +2251,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 		*ppos = pos + status;
 	}
 
-	/*
-	 * If we get here for O_DIRECT writes then we must have fallen through
-	 * to buffered writes (block instantiation inside i_size). So we sync
-	 * the file data here, to try to honour O_DIRECT expectations.
-	 */
-	if (unlikely(file->f_flags & O_DIRECT) && written)
-		status = filemap_write_and_wait_range(mapping,
-					pos, pos + written - 1);
-
 	return written ? written : status;
 }
 EXPORT_SYMBOL(generic_file_buffered_write);
@@ -2359,10 +2349,7 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		 * semantics.
 		 */
 		endbyte = pos + written_buffered - written - 1;
-		err = do_sync_mapping_range(file->f_mapping, pos, endbyte,
-					    SYNC_FILE_RANGE_WAIT_BEFORE|
-					    SYNC_FILE_RANGE_WRITE|
-					    SYNC_FILE_RANGE_WAIT_AFTER);
+		err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
 		if (err == 0) {
 			written = written_buffered;
 			invalidate_mapping_pages(mapping,
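
The second hunk above swaps the open-coded do_sync_mapping_range() flag sequence for filemap_write_and_wait_range(), which starts writeback on a byte range and waits for it in one call. In user space, the closest analogue of the removed sequence is the Linux-specific sync_file_range(2) syscall, whose flags the old code passed directly. A hedged sketch of that analogue, not the kernel code itself (error handling elided):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	/*
	 * Flush and wait on [pos, pos+len): roughly what the removed
	 * do_sync_mapping_range() call did inside the kernel, and what
	 * filemap_write_and_wait_range() now does for the O_DIRECT
	 * fallback path.
	 */
	static int flush_range(int fd, off_t pos, off_t len)
	{
		return sync_file_range(fd, pos, len,
				       SYNC_FILE_RANGE_WAIT_BEFORE |
				       SYNC_FILE_RANGE_WRITE |
				       SYNC_FILE_RANGE_WAIT_AFTER);
	}
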
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9eee80d6d490..488b644e0e8e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -38,6 +38,7 @@
 #include <linux/vmalloc.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
+#include <linux/cpu.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -54,7 +55,6 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
 #define do_swap_account		(0)
 #endif
 
-static DEFINE_MUTEX(memcg_tasklist);	/* can be hold under cgroup_mutex */
 #define SOFTLIMIT_EVENTS_THRESH (1000)
 
 /*
@@ -66,7 +66,7 @@ enum mem_cgroup_stat_index {
 	 */
 	MEM_CGROUP_STAT_CACHE,	   /* # of pages charged as cache */
 	MEM_CGROUP_STAT_RSS,	   /* # of pages charged as anon rss */
-	MEM_CGROUP_STAT_MAPPED_FILE,  /* # of pages charged as file rss */
+	MEM_CGROUP_STAT_FILE_MAPPED,  /* # of pages charged as file rss */
 	MEM_CGROUP_STAT_PGPGIN_COUNT,	/* # of pages paged in */
 	MEM_CGROUP_STAT_PGPGOUT_COUNT,	/* # of pages paged out */
 	MEM_CGROUP_STAT_EVENTS,	/* sum of pagein + pageout for internal use */
@@ -275,6 +275,7 @@ enum charge_type {
 static void mem_cgroup_get(struct mem_cgroup *mem);
 static void mem_cgroup_put(struct mem_cgroup *mem);
 static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem);
+static void drain_all_stock_async(void);
 
 static struct mem_cgroup_per_zone *
 mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid)
@@ -763,7 +764,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 	task_unlock(task);
 	if (!curr)
 		return 0;
-	if (curr->use_hierarchy)
+	/*
+	 * We should check use_hierarchy of "mem", not "curr". Checking
+	 * use_hierarchy of "curr" here would make this function return true
+	 * whenever hierarchy is enabled in "curr" and "curr" is a child of
+	 * "mem" in the *cgroup* hierarchy, even if use_hierarchy is off in "mem".
+	 */
+	if (mem->use_hierarchy)
 		ret = css_is_ancestor(&curr->css, &mem->css);
 	else
 		ret = (curr == mem);
@@ -1012,7 +1019,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	static char memcg_name[PATH_MAX];
 	int ret;
 
-	if (!memcg)
+	if (!memcg || !p)
 		return;
 
 
@@ -1142,6 +1149,8 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
 		victim = mem_cgroup_select_victim(root_mem);
 		if (victim == root_mem) {
 			loop++;
+			if (loop >= 1)
+				drain_all_stock_async();
 			if (loop >= 2) {
 				/*
 				 * If we have not been able to reclaim
@@ -1228,7 +1237,7 @@ static void record_last_oom(struct mem_cgroup *mem)
  * Currently used to update mapped file statistics, but the routine can be
  * generalized to update other statistics as well.
  */
-void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
+void mem_cgroup_update_file_mapped(struct page *page, int val)
 {
 	struct mem_cgroup *mem;
 	struct mem_cgroup_stat *stat;
@@ -1236,9 +1245,6 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	int cpu;
 	struct page_cgroup *pc;
 
-	if (!page_is_file_cache(page))
-		return;
-
 	pc = lookup_page_cgroup(page);
 	if (unlikely(!pc))
 		return;
@@ -1258,12 +1264,139 @@ void mem_cgroup_update_mapped_file_stat(struct page *page, int val)
 	stat = &mem->stat;
 	cpustat = &stat->cpustat[cpu];
 
-	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val);
+	__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
 done:
 	unlock_page_cgroup(pc);
 }
 
 /*
+ * size of first charge trial. "32" comes from vmscan.c's magic value.
+ * TODO: maybe necessary to use big numbers in big irons.
+ */
+#define CHARGE_SIZE	(32 * PAGE_SIZE)
+struct memcg_stock_pcp {
+	struct mem_cgroup *cached; /* this never be root cgroup */
+	int charge;
+	struct work_struct work;
+};
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static atomic_t memcg_drain_count;
+
+/*
+ * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed
+ * from local stock and true is returned. If the stock is 0 or charges from a
+ * cgroup which is not current target, returns false. This stock will be
+ * refilled.
+ */
+static bool consume_stock(struct mem_cgroup *mem)
+{
+	struct memcg_stock_pcp *stock;
+	bool ret = true;
+
+	stock = &get_cpu_var(memcg_stock);
+	if (mem == stock->cached && stock->charge)
+		stock->charge -= PAGE_SIZE;
+	else /* need to call res_counter_charge */
+		ret = false;
+	put_cpu_var(memcg_stock);
+	return ret;
+}
+
+/*
+ * Returns stocks cached in percpu to res_counter and reset cached information.
+ */
+static void drain_stock(struct memcg_stock_pcp *stock)
+{
+	struct mem_cgroup *old = stock->cached;
+
+	if (stock->charge) {
+		res_counter_uncharge(&old->res, stock->charge);
+		if (do_swap_account)
+			res_counter_uncharge(&old->memsw, stock->charge);
+	}
+	stock->cached = NULL;
+	stock->charge = 0;
+}
+
+/*
+ * This must be called under preempt disabled or must be called by
+ * a thread which is pinned to local cpu.
+ */
+static void drain_local_stock(struct work_struct *dummy)
+{
+	struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock);
+	drain_stock(stock);
+}
+
+/*
+ * Cache charges(val) which is from res_counter, to local per_cpu area.
+ * This will be consumed by consume_stock() function, later.
+ */
+static void refill_stock(struct mem_cgroup *mem, int val)
+{
+	struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock);
+
+	if (stock->cached != mem) { /* reset if necessary */
+		drain_stock(stock);
+		stock->cached = mem;
+	}
+	stock->charge += val;
+	put_cpu_var(memcg_stock);
+}
+
+/*
+ * Tries to drain stocked charges in other cpus. This function is asynchronous
+ * and just puts a work per cpu for draining locally on each cpu. The caller
+ * can expect that some charges will be returned to the res_counter later,
+ * but cannot wait for it.
+ */
+static void drain_all_stock_async(void)
+{
+	int cpu;
+	/* This function is for scheduling "drain" in asynchronous way.
+	 * The result of "drain" is not directly handled by callers. Then,
+	 * if someone is calling drain, we don't have to call drain more.
+	 * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if
+	 * there is a race. We just do loose check here.
+	 */
+	if (atomic_read(&memcg_drain_count))
+		return;
+	/* Notify other cpus that system-wide "drain" is running */
+	atomic_inc(&memcg_drain_count);
+	get_online_cpus();
+	for_each_online_cpu(cpu) {
+		struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
+		schedule_work_on(cpu, &stock->work);
+	}
+	put_online_cpus();
+	atomic_dec(&memcg_drain_count);
+	/* We don't wait for flush_work */
+}
+
+/* This is a synchronous drain interface. */
+static void drain_all_stock_sync(void)
+{
+	/* called when force_empty is called */
+	atomic_inc(&memcg_drain_count);
+	schedule_on_each_cpu(drain_local_stock);
+	atomic_dec(&memcg_drain_count);
+}
+
+static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
+					unsigned long action,
+					void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct memcg_stock_pcp *stock;
+
+	if (action != CPU_DEAD)
+		return NOTIFY_OK;
+	stock = &per_cpu(memcg_stock, cpu);
+	drain_stock(stock);
+	return NOTIFY_OK;
+}
+
+/*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
  */
@@ -1274,6 +1407,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct res_counter *fail_res;
+	int csize = CHARGE_SIZE;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1298,23 +1432,25 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		return 0;
 
 	VM_BUG_ON(css_is_removed(&mem->css));
+	if (mem_cgroup_is_root(mem))
+		goto done;
 
 	while (1) {
 		int ret = 0;
 		unsigned long flags = 0;
 
-		if (mem_cgroup_is_root(mem))
-			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
+		if (consume_stock(mem))
+			goto charged;
+
+		ret = res_counter_charge(&mem->res, csize, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
-			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res);
+			ret = res_counter_charge(&mem->memsw, csize, &fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
+			res_counter_uncharge(&mem->res, csize);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1323,6 +1459,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									res);
 
+		/* reduce request size and retry */
+		if (csize > PAGE_SIZE) {
+			csize = PAGE_SIZE;
+			continue;
+		}
 		if (!(gfp_mask & __GFP_WAIT))
 			goto nomem;
 
@@ -1344,14 +1485,15 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (!nr_retries--) {
 			if (oom) {
-				mutex_lock(&memcg_tasklist);
 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-				mutex_unlock(&memcg_tasklist);
 				record_last_oom(mem_over_limit);
 			}
 			goto nomem;
 		}
 	}
+	if (csize > PAGE_SIZE)
+		refill_stock(mem, csize - PAGE_SIZE);
+charged:
 	/*
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
@@ -1366,6 +1508,21 @@ nomem:
 }
 
 /*
+ * Sometimes we have to undo a charge we got by try_charge().
+ * This function is for that: it does the uncharge and puts the css
+ * refcnt taken by try_charge().
+ */
+static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
+{
+	if (!mem_cgroup_is_root(mem)) {
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
+		if (do_swap_account)
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	}
+	css_put(&mem->css);
+}
+
+/*
  * A helper function to get mem_cgroup from ID. must be called under
  * rcu_read_lock(). The caller must check css_is_removed() or some if
  * it's concern. (dropping refcnt from swap can be called against removed
@@ -1428,12 +1585,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	lock_page_cgroup(pc);
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
-		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE);
-			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-		}
-		css_put(&mem->css);
+		mem_cgroup_cancel_charge(mem);
 		return;
 	}
 
@@ -1466,27 +1618,22 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 }
 
 /**
- * mem_cgroup_move_account - move account of the page
+ * __mem_cgroup_move_account - move account of the page
  * @pc:	page_cgroup of the page.
  * @from: mem_cgroup which the page is moved from.
  * @to:	mem_cgroup which the page is moved to. @from != @to.
  *
  * The caller must confirm following.
  * - page is not on LRU (isolate_page() is useful.)
- *
- * returns 0 at success,
- * returns -EBUSY when lock is busy or "pc" is unstable.
+ * - the pc is locked, used, and ->mem_cgroup points to @from.
  *
  * This function does "uncharge" from old cgroup but doesn't do "charge" to
  * new cgroup. It should be done by a caller.
  */
 
-static int mem_cgroup_move_account(struct page_cgroup *pc,
+static void __mem_cgroup_move_account(struct page_cgroup *pc,
 	struct mem_cgroup *from, struct mem_cgroup *to)
 {
-	struct mem_cgroup_per_zone *from_mz, *to_mz;
-	int nid, zid;
-	int ret = -EBUSY;
 	struct page *page;
 	int cpu;
 	struct mem_cgroup_stat *stat;
@@ -1494,38 +1641,27 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON(PageLRU(pc->page));
-
-	nid = page_cgroup_nid(pc);
-	zid = page_cgroup_zid(pc);
-	from_mz = mem_cgroup_zoneinfo(from, nid, zid);
-	to_mz = mem_cgroup_zoneinfo(to, nid, zid);
-
-	if (!trylock_page_cgroup(pc))
-		return ret;
-
-	if (!PageCgroupUsed(pc))
-		goto out;
-
-	if (pc->mem_cgroup != from)
-		goto out;
+	VM_BUG_ON(!PageCgroupLocked(pc));
+	VM_BUG_ON(!PageCgroupUsed(pc));
+	VM_BUG_ON(pc->mem_cgroup != from);
 
 	if (!mem_cgroup_is_root(from))
 		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
-	if (page_is_file_cache(page) && page_mapped(page)) {
+	if (page_mapped(page) && !PageAnon(page)) {
 		cpu = smp_processor_id();
 		/* Update mapped_file data for mem_cgroup "from" */
 		stat = &from->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						-1);
 
 		/* Update mapped_file data for mem_cgroup "to" */
 		stat = &to->stat;
 		cpustat = &stat->cpustat[cpu];
-		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE,
+		__mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
 						1);
 	}
 
@@ -1536,15 +1672,28 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	css_get(&to->css);
 	pc->mem_cgroup = to;
 	mem_cgroup_charge_statistics(to, pc, true);
-	ret = 0;
-out:
-	unlock_page_cgroup(pc);
 	/*
 	 * We charges against "to" which may not have any tasks. Then, "to"
 	 * can be under rmdir(). But in current implementation, caller of
 	 * this function is just force_empty() and it's garanteed that
 	 * "to" is never removed. So, we don't check rmdir status here.
 	 */
+}
+
+/*
+ * check whether the @pc is valid for moving account and call
+ * __mem_cgroup_move_account()
+ */
+static int mem_cgroup_move_account(struct page_cgroup *pc,
+		struct mem_cgroup *from, struct mem_cgroup *to)
+{
+	int ret = -EINVAL;
+	lock_page_cgroup(pc);
+	if (PageCgroupUsed(pc) && pc->mem_cgroup == from) {
+		__mem_cgroup_move_account(pc, from, to);
+		ret = 0;
+	}
+	unlock_page_cgroup(pc);
 	return ret;
 }
 
@@ -1566,45 +1715,27 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
 	if (!pcg)
 		return -EINVAL;
 
+	ret = -EBUSY;
+	if (!get_page_unless_zero(page))
+		goto out;
+	if (isolate_lru_page(page))
+		goto put;
 
 	parent = mem_cgroup_from_cont(pcg);
-
-
 	ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
 	if (ret || !parent)
-		return ret;
-
-	if (!get_page_unless_zero(page)) {
-		ret = -EBUSY;
-		goto uncharge;
-	}
-
-	ret = isolate_lru_page(page);
-
-	if (ret)
-		goto cancel;
+		goto put_back;
 
 	ret = mem_cgroup_move_account(pc, child, parent);
-
+	if (!ret)
+		css_put(&parent->css);	/* drop extra refcnt by try_charge() */
+	else
+		mem_cgroup_cancel_charge(parent);	/* does css_put */
+put_back:
 	putback_lru_page(page);
-	if (!ret) {
-		put_page(page);
-		/* drop extra refcnt by try_charge() */
-		css_put(&parent->css);
-		return 0;
-	}
-
-cancel:
+put:
 	put_page(page);
-uncharge:
-	/* drop extra refcnt by try_charge() */
-	css_put(&parent->css);
-	/* uncharge if move fails */
-	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
-	}
+out:
 	return ret;
 }
 
@@ -1821,14 +1952,53 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 		return;
 	if (!mem)
 		return;
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
-	css_put(&mem->css);
+	mem_cgroup_cancel_charge(mem);
 }
 
+static void
+__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype)
+{
+	struct memcg_batch_info *batch = NULL;
+	bool uncharge_memsw = true;
+	/* If swapout, usage of swap doesn't decrease */
+	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
+		uncharge_memsw = false;
+	/*
+	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
+	 * In those cases, all pages freed continuously can be expected to be
+	 * in the same cgroup, and we have a chance to coalesce uncharges.
+	 * But we do uncharge one by one if this is killed by OOM (TIF_MEMDIE)
+	 * because we want to do uncharge as soon as possible.
+	 */
+	if (!current->memcg_batch.do_batch || test_thread_flag(TIF_MEMDIE))
+		goto direct_uncharge;
+
+	batch = &current->memcg_batch;
+	/*
+	 * Usually, we do css_get() when we remember the memcg pointer.
+	 * But in this case, we keep res->usage until the end of a series of
+	 * uncharges. Then, it's ok to ignore memcg's refcnt.
+	 */
+	if (!batch->memcg)
+		batch->memcg = mem;
+	/*
+	 * In the typical case, batch->memcg == mem. This means we can
+	 * merge a series of uncharges into one uncharge of the res_counter.
+	 * If not, we uncharge the res_counter one by one.
+	 */
+	if (batch->memcg != mem)
+		goto direct_uncharge;
+	/* remember freed charge and uncharge it later */
+	batch->bytes += PAGE_SIZE;
+	if (uncharge_memsw)
+		batch->memsw_bytes += PAGE_SIZE;
+	return;
+direct_uncharge:
+	res_counter_uncharge(&mem->res, PAGE_SIZE);
+	if (uncharge_memsw)
+		res_counter_uncharge(&mem->memsw, PAGE_SIZE);
+	return;
+}
 
 /*
  * uncharge if !page_mapped(page)
@@ -1877,12 +2047,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 		break;
 	}
 
-	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE);
-		if (do_swap_account &&
-				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
-	}
+	if (!mem_cgroup_is_root(mem))
+		__do_uncharge(mem, ctype);
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
 	mem_cgroup_charge_statistics(mem, pc, false);
@@ -1928,6 +2094,50 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
 	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+/*
+ * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
+ * In those cases, pages are freed continuously and we can expect pages
+ * are in the same memcg. All these calls themselves limit the number of
+ * pages freed at once, then uncharge_start/end() is called properly.
+ * This may be called several times in one context.
+ */
+
+void mem_cgroup_uncharge_start(void)
+{
+	current->memcg_batch.do_batch++;
+	/* We can do nest. */
+	if (current->memcg_batch.do_batch == 1) {
+		current->memcg_batch.memcg = NULL;
+		current->memcg_batch.bytes = 0;
+		current->memcg_batch.memsw_bytes = 0;
+	}
+}
+
+void mem_cgroup_uncharge_end(void)
+{
+	struct memcg_batch_info *batch = &current->memcg_batch;
+
+	if (!batch->do_batch)
+		return;
+
+	batch->do_batch--;
+	if (batch->do_batch) /* If stacked, do nothing. */
+		return;
+
+	if (!batch->memcg)
+		return;
+	/*
+	 * This "batch->memcg" is valid without any css_get/put etc...
+	 * because we hide charges behind us.
+	 */
+	if (batch->bytes)
+		res_counter_uncharge(&batch->memcg->res, batch->bytes);
+	if (batch->memsw_bytes)
+		res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes);
+	/* forget this pointer (for sanity check) */
+	batch->memcg = NULL;
+}
+
 #ifdef CONFIG_SWAP
 /*
  * called after __delete_from_swap_cache() and drop "page" account.
@@ -2103,7 +2313,6 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 				unsigned long long val)
 {
 	int retry_count;
-	int progress;
 	u64 memswlimit;
 	int ret = 0;
 	int children = mem_cgroup_count_children(memcg);
@@ -2147,8 +2356,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
 		if (!ret)
 			break;
 
-		progress = mem_cgroup_hierarchical_reclaim(memcg, NULL,
-						GFP_KERNEL,
-						MEM_CGROUP_RECLAIM_SHRINK);
+		mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL,
+						MEM_CGROUP_RECLAIM_SHRINK);
 		curusage = res_counter_read_u64(&memcg->res, RES_USAGE);
 		/* Usage is reduced ? */
@@ -2387,6 +2595,7 @@ move_account:
 		goto out;
 	/* This is for making all *used* pages to be on LRU. */
 	lru_add_drain_all();
+	drain_all_stock_sync();
 	ret = 0;
 	for_each_node_state(node, N_HIGH_MEMORY) {
 		for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) {
@@ -2544,6 +2753,7 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		val += idx_val;
 		mem_cgroup_get_recursive_idx_stat(mem,
 			MEM_CGROUP_STAT_SWAPOUT, &idx_val);
+		val += idx_val;
 		val <<= PAGE_SHIFT;
 	} else
 		val = res_counter_read_u64(&mem->memsw, name);
@@ -2663,7 +2873,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 enum {
 	MCS_CACHE,
 	MCS_RSS,
-	MCS_MAPPED_FILE,
+	MCS_FILE_MAPPED,
 	MCS_PGPGIN,
 	MCS_PGPGOUT,
 	MCS_SWAP,
@@ -2707,8 +2917,8 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
 	s->stat[MCS_CACHE] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
 	s->stat[MCS_RSS] += val * PAGE_SIZE;
-	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE);
-	s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE;
+	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+	s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
 	s->stat[MCS_PGPGIN] += val;
 	val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
@@ -3100,11 +3310,18 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 
 	/* root ? */
 	if (cont->parent == NULL) {
+		int cpu;
 		enable_swap_cgroup();
 		parent = NULL;
 		root_mem_cgroup = mem;
 		if (mem_cgroup_soft_limit_tree_init())
 			goto free_out;
+		for_each_possible_cpu(cpu) {
+			struct memcg_stock_pcp *stock =
+						&per_cpu(memcg_stock, cpu);
+			INIT_WORK(&stock->work, drain_local_stock);
+		}
+		hotcpu_notifier(memcg_stock_cpu_callback, 0);
 
 	} else {
 		parent = mem_cgroup_from_cont(cont->parent);
@@ -3173,12 +3390,10 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 				struct task_struct *p,
 				bool threadgroup)
 {
-	mutex_lock(&memcg_tasklist);
 	/*
 	 * FIXME: It's better to move charges of this process from old
 	 * memcg to new memcg. But it's just on TODO-List now.
 	 */
-	mutex_unlock(&memcg_tasklist);
 }
 
 struct cgroup_subsys mem_cgroup_subsys = {
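
Taken together, the memcg_stock machinery added above is a per-CPU cache in front of a contended shared counter: try_charge() pulls CHARGE_SIZE from the res_counter at once, hands out PAGE_SIZE locally via consume_stock(), parks the remainder with refill_stock(), and drains the caches when reclaim or force_empty needs the counter to be accurate. A self-contained user-space sketch of the same pattern, with a plain atomic standing in for the res_counter and thread-local storage standing in for the per-CPU variables (all names hypothetical, not the kernel code):

	#include <stdatomic.h>
	#include <stdbool.h>

	#define PAGE_SZ  4096L
	#define BATCH_SZ (32 * PAGE_SZ)          /* analogous to CHARGE_SIZE */

	static atomic_long shared_usage;         /* stands in for res_counter */
	static _Thread_local long stock;         /* stands in for memcg_stock */

	/* Charge one page, preferring the local stock (the fast path). */
	static bool charge_one_page(long limit)
	{
		if (stock >= PAGE_SZ) {          /* consume_stock() analogue */
			stock -= PAGE_SZ;
			return true;
		}
		/* Miss: charge a whole batch against the shared counter. */
		long old = atomic_fetch_add(&shared_usage, BATCH_SZ);
		if (old + BATCH_SZ <= limit) {
			stock = BATCH_SZ - PAGE_SZ; /* refill_stock() keeps the rest */
			return true;
		}
		/* Over limit: shrink the request to one page, as csize does. */
		atomic_fetch_sub(&shared_usage, BATCH_SZ - PAGE_SZ);
		if (old + PAGE_SZ <= limit)
			return true;
		atomic_fetch_sub(&shared_usage, PAGE_SZ);  /* give it all back */
		return false;
	}

	/* drain_stock() analogue: return the cached charge to the counter. */
	static void drain_local(void)
	{
		atomic_fetch_sub(&shared_usage, stock);
		stock = 0;
	}

The trade-off is the same as in the patch: usage can over-report by up to one batch per CPU, which is why drain paths exist for the cases (reclaim, force_empty, CPU offline) where accuracy matters.
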
diff --git a/mm/memory.c b/mm/memory.c
index db09106ed44b..09e4b1be7b67 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -956,6 +956,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 		details = NULL;
 
 	BUG_ON(addr >= end);
+	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -968,6 +969,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
 					zap_work, details);
 	} while (pgd++, addr = next, (addr != end && *zap_work > 0));
 	tlb_end_vma(tlb, vma);
+	mem_cgroup_uncharge_end();
 
 	return addr;
 }
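
unmap_page_range() now brackets the zap loop with mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end(), so the many per-page uncharges issued while tearing down a range collapse into one res_counter operation. A hedged sketch of that nestable batching idea, reusing the hypothetical counter from the previous sketch (not the kernel implementation):

	#include <stdatomic.h>

	#define PAGE_SZ 4096L

	/* Per-thread batch state, analogous to current->memcg_batch. */
	static _Thread_local struct { int depth; long bytes; } batch;

	static void uncharge_start(void)
	{
		if (batch.depth++ == 0)          /* outermost section resets */
			batch.bytes = 0;
	}

	static void uncharge_one(atomic_long *usage)
	{
		if (batch.depth)
			batch.bytes += PAGE_SZ;  /* coalesce inside a section */
		else
			atomic_fetch_sub(usage, PAGE_SZ); /* direct uncharge */
	}

	static void uncharge_end(atomic_long *usage)
	{
		if (--batch.depth == 0 && batch.bytes)
			atomic_fetch_sub(usage, batch.bytes); /* one flush */
	}

The depth counter matters because unmap, invalidate, and truncate paths can stack; only the outermost end() flushes the accumulated total.
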
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 492c98624fc1..f52481b1c1e5 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -196,27 +196,46 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
 /*
  * Determine the type of allocation constraint.
  */
-static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
-						    gfp_t gfp_mask)
-{
 #ifdef CONFIG_NUMA
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				    gfp_t gfp_mask, nodemask_t *nodemask)
+{
 	struct zone *zone;
 	struct zoneref *z;
 	enum zone_type high_zoneidx = gfp_zone(gfp_mask);
-	nodemask_t nodes = node_states[N_HIGH_MEMORY];
 
-	for_each_zone_zonelist(zone, z, zonelist, high_zoneidx)
-		if (cpuset_zone_allowed_softwall(zone, gfp_mask))
-			node_clear(zone_to_nid(zone), nodes);
-		else
-			return CONSTRAINT_CPUSET;
+	/*
+	 * Reach here only when __GFP_NOFAIL is used. So, we should avoid
+	 * killing current; we have to fall back to a random task kill here.
+	 * Hopefully, CONSTRAINT_THISNODE... but there is no way to handle it, now.
+	 */
+	if (gfp_mask & __GFP_THISNODE)
+		return CONSTRAINT_NONE;
 
-	if (!nodes_empty(nodes))
+	/*
+	 * The nodemask here is a nodemask passed to alloc_pages(). Now,
+	 * cpuset doesn't use this nodemask for its hardwall/softwall/hierarchy
+	 * feature. mempolicy is the only user of the nodemask here.
+	 * Check whether mempolicy's nodemask contains all N_HIGH_MEMORY nodes.
+	 */
+	if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask))
 		return CONSTRAINT_MEMORY_POLICY;
-#endif
+
+	/* Check this allocation failure is caused by cpuset's wall function */
+	for_each_zone_zonelist_nodemask(zone, z, zonelist,
+			high_zoneidx, nodemask)
+		if (!cpuset_zone_allowed_softwall(zone, gfp_mask))
+			return CONSTRAINT_CPUSET;
 
 	return CONSTRAINT_NONE;
 }
+#else
+static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
+				gfp_t gfp_mask, nodemask_t *nodemask)
+{
+	return CONSTRAINT_NONE;
+}
+#endif
 
 /*
  * Simple selection loop. We chose the process with the highest
@@ -337,7 +356,8 @@ static void dump_tasks(const struct mem_cgroup *mem)
 	} while_each_thread(g, p);
 }
 
-static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
+static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
+							struct mem_cgroup *mem)
 {
 	pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
 		"oom_adj=%d\n",
@@ -346,12 +366,14 @@ static void dump_header(gfp_t gfp_mask, int order, struct mem_cgroup *mem)
 	cpuset_print_task_mems_allowed(current);
 	task_unlock(current);
 	dump_stack();
-	mem_cgroup_print_oom_info(mem, current);
+	mem_cgroup_print_oom_info(mem, p);
 	show_mem();
 	if (sysctl_oom_dump_tasks)
 		dump_tasks(mem);
 }
 
+#define K(x) ((x) << (PAGE_SHIFT-10))
+
 /*
  * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO
  * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO
@@ -365,15 +387,23 @@ static void __oom_kill_task(struct task_struct *p, int verbose)
 		return;
 	}
 
+	task_lock(p);
 	if (!p->mm) {
 		WARN_ON(1);
-		printk(KERN_WARNING "tried to kill an mm-less task!\n");
+		printk(KERN_WARNING "tried to kill an mm-less task %d (%s)!\n",
+			task_pid_nr(p), p->comm);
+		task_unlock(p);
 		return;
 	}
 
 	if (verbose)
-		printk(KERN_ERR "Killed process %d (%s)\n",
-				task_pid_nr(p), p->comm);
+		printk(KERN_ERR "Killed process %d (%s) "
+			"vsz:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+			task_pid_nr(p), p->comm,
+			K(p->mm->total_vm),
+			K(get_mm_counter(p->mm, anon_rss)),
+			K(get_mm_counter(p->mm, file_rss)));
+	task_unlock(p);
 
 	/*
 	 * We give our sacrificial lamb high priority and access to
@@ -411,7 +441,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	struct task_struct *c;
 
 	if (printk_ratelimit())
-		dump_header(gfp_mask, order, mem);
+		dump_header(p, gfp_mask, order, mem);
 
 	/*
 	 * If the task is already exiting, don't alarm the sysadmin or kill
@@ -547,7 +577,7 @@ retry:
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
-		dump_header(gfp_mask, order, NULL);
+		dump_header(NULL, gfp_mask, order, NULL);
 		panic("Out of memory and no killable processes...\n");
 	}
 
@@ -603,7 +633,8 @@ rest_and_return:
  * OR try to be smart about which process to kill. Note that we
  * don't have to be perfect here, we just have to be good.
  */
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
+void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+		int order, nodemask_t *nodemask)
 {
 	unsigned long freed = 0;
 	enum oom_constraint constraint;
@@ -614,7 +645,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 		return;
 
 	if (sysctl_panic_on_oom == 2) {
-		dump_header(gfp_mask, order, NULL);
+		dump_header(NULL, gfp_mask, order, NULL);
 		panic("out of memory. Compulsory panic_on_oom is selected.\n");
 	}
 
@@ -622,7 +653,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 	 * Check if there were limitations on the allocation (only relevant for
 	 * NUMA) that may require different handling.
 	 */
-	constraint = constrained_alloc(zonelist, gfp_mask);
+	constraint = constrained_alloc(zonelist, gfp_mask, nodemask);
 	read_lock(&tasklist_lock);
 
 	switch (constraint) {
@@ -633,7 +664,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order)
 
 	case CONSTRAINT_NONE:
 		if (sysctl_panic_on_oom) {
-			dump_header(gfp_mask, order, NULL);
+			dump_header(NULL, gfp_mask, order, NULL);
 			panic("out of memory. panic_on_oom is selected\n");
 		}
 		/* Fall-through */
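
The new mempolicy check in constrained_alloc() hinges on a subset test: if the allocation's nodemask still covers every node that has memory, the policy cannot be what constrained the allocation, so the failure is not CONSTRAINT_MEMORY_POLICY. A hedged sketch of the underlying bitmask logic, using fixed 64-bit masks with hypothetical names (the kernel's nodes_subset() performs the same test on nodemask_t):

	#include <stdbool.h>
	#include <stdint.h>

	typedef uint64_t mask_t;             /* one bit per hypothetical node */

	/* Is every bit of 'a' also set in 'b'? What nodes_subset() asks. */
	static bool mask_subset(mask_t a, mask_t b)
	{
		return (a & ~b) == 0;
	}

	/*
	 * constrained_alloc() flags CONSTRAINT_MEMORY_POLICY only when some
	 * memory node falls outside the allocation's mask, i.e. when
	 * !mask_subset(nodes_with_memory, allocation_nodemask).
	 */
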
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6867b4d391fd..74af449b1f1d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1654,12 +1654,22 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	if (page)
 		goto out;
 
-	/* The OOM killer will not help higher order allocs */
-	if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
-		goto out;
-
+	if (!(gfp_mask & __GFP_NOFAIL)) {
+		/* The OOM killer will not help higher order allocs */
+		if (order > PAGE_ALLOC_COSTLY_ORDER)
+			goto out;
+		/*
+		 * GFP_THISNODE contains __GFP_NORETRY and we never hit this.
+		 * Sanity check for bare calls of __GFP_THISNODE, not real OOM.
+		 * The caller should handle page allocation failure by itself if
+		 * it specifies __GFP_THISNODE.
+		 * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER.
+		 */
+		if (gfp_mask & __GFP_THISNODE)
+			goto out;
+	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
 	clear_zonelist_oom(zonelist, gfp_mask);
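
The restructured block above gates both early exits on the absence of __GFP_NOFAIL: costly orders and bare __GFP_THISNODE requests skip the OOM killer, while a __GFP_NOFAIL caller always reaches it. The decision reads clearly as a standalone predicate; a hedged sketch with hypothetical flag values (not the kernel's definitions):

	#include <stdbool.h>

	#define GFP_NOFAIL   0x1u            /* hypothetical stand-in flags */
	#define GFP_THISNODE 0x2u
	#define COSTLY_ORDER 3u

	/* Should the allocator skip the OOM killer for this failed request? */
	static bool skip_oom_kill(unsigned int gfp_mask, unsigned int order)
	{
		if (gfp_mask & GFP_NOFAIL)
			return false;  /* NOFAIL may always invoke the killer */
		if (order > COSTLY_ORDER)
			return true;   /* killing won't free large contiguous runs */
		if (gfp_mask & GFP_THISNODE)
			return true;   /* single-node callers handle failure */
		return false;
	}
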
@@ -3123,7 +3133,7 @@ static int __cpuinit process_zones(int cpu)
 
 		if (percpu_pagelist_fraction)
 			setup_pagelist_highmark(zone_pcp(zone, cpu),
-			 	(zone->present_pages / percpu_pagelist_fraction));
+				(zone->present_pages / percpu_pagelist_fraction));
 	}
 
 	return 0;
diff --git a/mm/rmap.c b/mm/rmap.c
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -721,7 +721,7 @@ void page_add_file_rmap(struct page *page)
 {
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_update_mapped_file_stat(page, 1);
+		mem_cgroup_update_file_mapped(page, 1);
 	}
 }
 
@@ -753,8 +753,8 @@ void page_remove_rmap(struct page *page)
 		__dec_zone_page_state(page, NR_ANON_PAGES);
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
+		mem_cgroup_update_file_mapped(page, -1);
 	}
-	mem_cgroup_update_mapped_file_stat(page, -1);
 	/*
 	 * It would be tidy to reset the PageAnon mapping here,
 	 * but that might overwrite a racing page_add_anon_rmap
diff --git a/mm/shmem.c b/mm/shmem.c
index 4fb41c83daca..f8485062f3ba 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -29,7 +29,6 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/swap.h>
-#include <linux/ima.h>
 
 static struct vfsmount *shm_mnt;
 
@@ -42,6 +41,7 @@ static struct vfsmount *shm_mnt;
 
 #include <linux/xattr.h>
 #include <linux/exportfs.h>
+#include <linux/posix_acl.h>
 #include <linux/generic_acl.h>
 #include <linux/mman.h>
 #include <linux/string.h>
@@ -810,7 +810,7 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
 	error = inode_setattr(inode, attr);
 #ifdef CONFIG_TMPFS_POSIX_ACL
 	if (!error && (attr->ia_valid & ATTR_MODE))
-		error = generic_acl_chmod(inode, &shmem_acl_ops);
+		error = generic_acl_chmod(inode);
 #endif
 	if (page)
 		page_cache_release(page);
@@ -1824,11 +1824,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
 				return error;
 			}
 		}
-		error = shmem_acl_init(inode, dir);
+#ifdef CONFIG_TMPFS_POSIX_ACL
+		error = generic_acl_init(inode, dir);
 		if (error) {
 			iput(inode);
 			return error;
 		}
+#endif
 		if (dir->i_mode & S_ISGID) {
 			inode->i_gid = dir->i_gid;
 			if (S_ISDIR(mode))
@@ -2043,27 +2045,28 @@ static const struct inode_operations shmem_symlink_inode_operations = { | |||
2043 | * filesystem level, though. | 2045 | * filesystem level, though. |
2044 | */ | 2046 | */ |
2045 | 2047 | ||
2046 | static size_t shmem_xattr_security_list(struct inode *inode, char *list, | 2048 | static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, |
2047 | size_t list_len, const char *name, | 2049 | size_t list_len, const char *name, |
2048 | size_t name_len) | 2050 | size_t name_len, int handler_flags) |
2049 | { | 2051 | { |
2050 | return security_inode_listsecurity(inode, list, list_len); | 2052 | return security_inode_listsecurity(dentry->d_inode, list, list_len); |
2051 | } | 2053 | } |
2052 | 2054 | ||
2053 | static int shmem_xattr_security_get(struct inode *inode, const char *name, | 2055 | static int shmem_xattr_security_get(struct dentry *dentry, const char *name, |
2054 | void *buffer, size_t size) | 2056 | void *buffer, size_t size, int handler_flags) |
2055 | { | 2057 | { |
2056 | if (strcmp(name, "") == 0) | 2058 | if (strcmp(name, "") == 0) |
2057 | return -EINVAL; | 2059 | return -EINVAL; |
2058 | return xattr_getsecurity(inode, name, buffer, size); | 2060 | return xattr_getsecurity(dentry->d_inode, name, buffer, size); |
2059 | } | 2061 | } |
2060 | 2062 | ||
2061 | static int shmem_xattr_security_set(struct inode *inode, const char *name, | 2063 | static int shmem_xattr_security_set(struct dentry *dentry, const char *name, |
2062 | const void *value, size_t size, int flags) | 2064 | const void *value, size_t size, int flags, int handler_flags) |
2063 | { | 2065 | { |
2064 | if (strcmp(name, "") == 0) | 2066 | if (strcmp(name, "") == 0) |
2065 | return -EINVAL; | 2067 | return -EINVAL; |
2066 | return security_inode_setsecurity(inode, name, value, size, flags); | 2068 | return security_inode_setsecurity(dentry->d_inode, name, value, |
2069 | size, flags); | ||
2067 | } | 2070 | } |
2068 | 2071 | ||
2069 | static struct xattr_handler shmem_xattr_security_handler = { | 2072 | static struct xattr_handler shmem_xattr_security_handler = { |
@@ -2074,8 +2077,8 @@ static struct xattr_handler shmem_xattr_security_handler = { | |||
2074 | }; | 2077 | }; |
2075 | 2078 | ||
2076 | static struct xattr_handler *shmem_xattr_handlers[] = { | 2079 | static struct xattr_handler *shmem_xattr_handlers[] = { |
2077 | &shmem_xattr_acl_access_handler, | 2080 | &generic_acl_access_handler, |
2078 | &shmem_xattr_acl_default_handler, | 2081 | &generic_acl_default_handler, |
2079 | &shmem_xattr_security_handler, | 2082 | &shmem_xattr_security_handler, |
2080 | NULL | 2083 | NULL |
2081 | }; | 2084 | }; |
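The handler table can now point straight at the shared generic_acl_access_handler/generic_acl_default_handler pair because the same kernel reworked the xattr handler prototypes: every ->list/->get/->set callback receives the dentry plus a handler_flags value (the handler's own .flags field passed back) instead of a bare inode. That flags round-trip is what lets one generic handler body serve both ACL_TYPE_ACCESS and ACL_TYPE_DEFAULT. The shape of the structure, as assumed here from the 2.6.33-era include/linux/xattr.h:

	struct xattr_handler {
		char *prefix;		/* e.g. "system.posix_acl_access" */
		int flags;		/* fs-private; handed back as handler_flags */
		size_t (*list)(struct dentry *dentry, char *list, size_t list_size,
			       const char *name, size_t name_len, int handler_flags);
		int (*get)(struct dentry *dentry, const char *name, void *buffer,
			   size_t size, int handler_flags);
		int (*set)(struct dentry *dentry, const char *name, const void *value,
			   size_t size, int flags, int handler_flags);
	};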
@@ -2454,7 +2457,7 @@ static const struct inode_operations shmem_inode_operations = { | |||
2454 | .getxattr = generic_getxattr, | 2457 | .getxattr = generic_getxattr, |
2455 | .listxattr = generic_listxattr, | 2458 | .listxattr = generic_listxattr, |
2456 | .removexattr = generic_removexattr, | 2459 | .removexattr = generic_removexattr, |
2457 | .check_acl = shmem_check_acl, | 2460 | .check_acl = generic_check_acl, |
2458 | #endif | 2461 | #endif |
2459 | 2462 | ||
2460 | }; | 2463 | }; |
@@ -2477,7 +2480,7 @@ static const struct inode_operations shmem_dir_inode_operations = { | |||
2477 | .getxattr = generic_getxattr, | 2480 | .getxattr = generic_getxattr, |
2478 | .listxattr = generic_listxattr, | 2481 | .listxattr = generic_listxattr, |
2479 | .removexattr = generic_removexattr, | 2482 | .removexattr = generic_removexattr, |
2480 | .check_acl = shmem_check_acl, | 2483 | .check_acl = generic_check_acl, |
2481 | #endif | 2484 | #endif |
2482 | }; | 2485 | }; |
2483 | 2486 | ||
@@ -2488,7 +2491,7 @@ static const struct inode_operations shmem_special_inode_operations = { | |||
2488 | .getxattr = generic_getxattr, | 2491 | .getxattr = generic_getxattr, |
2489 | .listxattr = generic_listxattr, | 2492 | .listxattr = generic_listxattr, |
2490 | .removexattr = generic_removexattr, | 2493 | .removexattr = generic_removexattr, |
2491 | .check_acl = shmem_check_acl, | 2494 | .check_acl = generic_check_acl, |
2492 | #endif | 2495 | #endif |
2493 | }; | 2496 | }; |
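All three inode_operations tables switch their .check_acl hook to generic_check_acl(), which is the deleted shmem_check_acl() below in all but name, with the ACL fetched through the common cache; a hedged sketch:

	static int sketch_check_acl(struct inode *inode, int mask)
	{
		struct posix_acl *acl = get_cached_acl(inode, ACL_TYPE_ACCESS);

		if (acl) {
			int error = posix_acl_permission(inode, acl, mask);
			posix_acl_release(acl);
			return error;
		}
		return -EAGAIN;		/* no ACL: fall back to mode-bit checks */
	}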
2494 | 2497 | ||
@@ -2626,7 +2629,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2626 | int error; | 2629 | int error; |
2627 | struct file *file; | 2630 | struct file *file; |
2628 | struct inode *inode; | 2631 | struct inode *inode; |
2629 | struct dentry *dentry, *root; | 2632 | struct path path; |
2633 | struct dentry *root; | ||
2630 | struct qstr this; | 2634 | struct qstr this; |
2631 | 2635 | ||
2632 | if (IS_ERR(shm_mnt)) | 2636 | if (IS_ERR(shm_mnt)) |
@@ -2643,38 +2647,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags | |||
2643 | this.len = strlen(name); | 2647 | this.len = strlen(name); |
2644 | this.hash = 0; /* will go */ | 2648 | this.hash = 0; /* will go */ |
2645 | root = shm_mnt->mnt_root; | 2649 | root = shm_mnt->mnt_root; |
2646 | dentry = d_alloc(root, &this); | 2650 | path.dentry = d_alloc(root, &this); |
2647 | if (!dentry) | 2651 | if (!path.dentry) |
2648 | goto put_memory; | 2652 | goto put_memory; |
2649 | 2653 | path.mnt = mntget(shm_mnt); | |
2650 | error = -ENFILE; | ||
2651 | file = get_empty_filp(); | ||
2652 | if (!file) | ||
2653 | goto put_dentry; | ||
2654 | 2654 | ||
2655 | error = -ENOSPC; | 2655 | error = -ENOSPC; |
2656 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); | 2656 | inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); |
2657 | if (!inode) | 2657 | if (!inode) |
2658 | goto close_file; | 2658 | goto put_dentry; |
2659 | 2659 | ||
2660 | d_instantiate(dentry, inode); | 2660 | d_instantiate(path.dentry, inode); |
2661 | inode->i_size = size; | 2661 | inode->i_size = size; |
2662 | inode->i_nlink = 0; /* It is unlinked */ | 2662 | inode->i_nlink = 0; /* It is unlinked */ |
2663 | init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, | ||
2664 | &shmem_file_operations); | ||
2665 | |||
2666 | #ifndef CONFIG_MMU | 2663 | #ifndef CONFIG_MMU |
2667 | error = ramfs_nommu_expand_for_mapping(inode, size); | 2664 | error = ramfs_nommu_expand_for_mapping(inode, size); |
2668 | if (error) | 2665 | if (error) |
2669 | goto close_file; | 2666 | goto put_dentry; |
2670 | #endif | 2667 | #endif |
2671 | ima_counts_get(file); | 2668 | |
2669 | error = -ENFILE; | ||
2670 | file = alloc_file(&path, FMODE_WRITE | FMODE_READ, | ||
2671 | &shmem_file_operations); | ||
2672 | if (!file) | ||
2673 | goto put_dentry; | ||
2674 | |||
2672 | return file; | 2675 | return file; |
2673 | 2676 | ||
2674 | close_file: | ||
2675 | put_filp(file); | ||
2676 | put_dentry: | 2677 | put_dentry: |
2677 | dput(dentry); | 2678 | path_put(&path); |
2678 | put_memory: | 2679 | put_memory: |
2679 | shmem_unacct_size(flags, size); | 2680 | shmem_unacct_size(flags, size); |
2680 | return ERR_PTR(error); | 2681 | return ERR_PTR(error); |
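The shmem_file_setup() rework is about ordering, not behaviour: the dentry and vfsmount are bundled into a struct path up front, the inode is created and instantiated next, and the struct file is allocated last via alloc_file(), so every failure branch unwinds with a plain path_put() and a half-initialised file never exists. A hedged usage sketch of that 2.6.33-era helper (qname, my_inode and my_fops are stand-ins, not names from this patch):

	struct path path;
	struct file *file;

	path.dentry = d_alloc(mnt->mnt_root, &qname);	/* qname: a struct qstr */
	if (!path.dentry)
		return ERR_PTR(-ENOMEM);
	path.mnt = mntget(mnt);				/* path now owns a mnt ref */

	d_instantiate(path.dentry, my_inode);		/* inode prepared earlier */

	file = alloc_file(&path, FMODE_WRITE | FMODE_READ, &my_fops);
	if (!file) {
		path_put(&path);		/* drops both dentry and mnt refs */
		return ERR_PTR(-ENFILE);
	}
	return file;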
diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index df2c87fdae50..000000000000 --- a/mm/shmem_acl.c +++ /dev/null | |||
@@ -1,171 +0,0 @@ | |||
1 | /* | ||
2 | * mm/shmem_acl.c | ||
3 | * | ||
4 | * (C) 2005 Andreas Gruenbacher <agruen@suse.de> | ||
5 | * | ||
6 | * This file is released under the GPL. | ||
7 | */ | ||
8 | |||
9 | #include <linux/fs.h> | ||
10 | #include <linux/shmem_fs.h> | ||
11 | #include <linux/xattr.h> | ||
12 | #include <linux/generic_acl.h> | ||
13 | |||
14 | /** | ||
15 | * shmem_get_acl - generic_acl_operations->getacl() operation | ||
16 | */ | ||
17 | static struct posix_acl * | ||
18 | shmem_get_acl(struct inode *inode, int type) | ||
19 | { | ||
20 | struct posix_acl *acl = NULL; | ||
21 | |||
22 | spin_lock(&inode->i_lock); | ||
23 | switch(type) { | ||
24 | case ACL_TYPE_ACCESS: | ||
25 | acl = posix_acl_dup(inode->i_acl); | ||
26 | break; | ||
27 | |||
28 | case ACL_TYPE_DEFAULT: | ||
29 | acl = posix_acl_dup(inode->i_default_acl); | ||
30 | break; | ||
31 | } | ||
32 | spin_unlock(&inode->i_lock); | ||
33 | |||
34 | return acl; | ||
35 | } | ||
36 | |||
37 | /** | ||
38 | * shmem_set_acl - generic_acl_operations->setacl() operation | ||
39 | */ | ||
40 | static void | ||
41 | shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) | ||
42 | { | ||
43 | struct posix_acl *free = NULL; | ||
44 | |||
45 | spin_lock(&inode->i_lock); | ||
46 | switch(type) { | ||
47 | case ACL_TYPE_ACCESS: | ||
48 | free = inode->i_acl; | ||
49 | inode->i_acl = posix_acl_dup(acl); | ||
50 | break; | ||
51 | |||
52 | case ACL_TYPE_DEFAULT: | ||
53 | free = inode->i_default_acl; | ||
54 | inode->i_default_acl = posix_acl_dup(acl); | ||
55 | break; | ||
56 | } | ||
57 | spin_unlock(&inode->i_lock); | ||
58 | posix_acl_release(free); | ||
59 | } | ||
60 | |||
61 | struct generic_acl_operations shmem_acl_ops = { | ||
62 | .getacl = shmem_get_acl, | ||
63 | .setacl = shmem_set_acl, | ||
64 | }; | ||
65 | |||
66 | /** | ||
67 | * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, | ||
68 | * shmem_xattr_acl_access_handler - plumbing code to implement the | ||
69 | * system.posix_acl_access xattr using the generic acl functions. | ||
70 | */ | ||
71 | |||
72 | static size_t | ||
73 | shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, | ||
74 | const char *name, size_t name_len) | ||
75 | { | ||
76 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, | ||
77 | list, list_size); | ||
78 | } | ||
79 | |||
80 | static int | ||
81 | shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, | ||
82 | size_t size) | ||
83 | { | ||
84 | if (strcmp(name, "") != 0) | ||
85 | return -EINVAL; | ||
86 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, | ||
87 | size); | ||
88 | } | ||
89 | |||
90 | static int | ||
91 | shmem_set_acl_access(struct inode *inode, const char *name, const void *value, | ||
92 | size_t size, int flags) | ||
93 | { | ||
94 | if (strcmp(name, "") != 0) | ||
95 | return -EINVAL; | ||
96 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, | ||
97 | size); | ||
98 | } | ||
99 | |||
100 | struct xattr_handler shmem_xattr_acl_access_handler = { | ||
101 | .prefix = POSIX_ACL_XATTR_ACCESS, | ||
102 | .list = shmem_list_acl_access, | ||
103 | .get = shmem_get_acl_access, | ||
104 | .set = shmem_set_acl_access, | ||
105 | }; | ||
106 | |||
107 | /** | ||
108 | * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, | ||
109 | * shmem_xattr_acl_default_handler - plumbing code to implement the | ||
110 | * system.posix_acl_default xattr using the generic acl functions. | ||
111 | */ | ||
112 | |||
113 | static size_t | ||
114 | shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, | ||
115 | const char *name, size_t name_len) | ||
116 | { | ||
117 | return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, | ||
118 | list, list_size); | ||
119 | } | ||
120 | |||
121 | static int | ||
122 | shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, | ||
123 | size_t size) | ||
124 | { | ||
125 | if (strcmp(name, "") != 0) | ||
126 | return -EINVAL; | ||
127 | return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, | ||
128 | size); | ||
129 | } | ||
130 | |||
131 | static int | ||
132 | shmem_set_acl_default(struct inode *inode, const char *name, const void *value, | ||
133 | size_t size, int flags) | ||
134 | { | ||
135 | if (strcmp(name, "") != 0) | ||
136 | return -EINVAL; | ||
137 | return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, | ||
138 | size); | ||
139 | } | ||
140 | |||
141 | struct xattr_handler shmem_xattr_acl_default_handler = { | ||
142 | .prefix = POSIX_ACL_XATTR_DEFAULT, | ||
143 | .list = shmem_list_acl_default, | ||
144 | .get = shmem_get_acl_default, | ||
145 | .set = shmem_set_acl_default, | ||
146 | }; | ||
147 | |||
148 | /** | ||
149 | * shmem_acl_init - Initialize the acl(s) of a new inode | ||
150 | */ | ||
151 | int | ||
152 | shmem_acl_init(struct inode *inode, struct inode *dir) | ||
153 | { | ||
154 | return generic_acl_init(inode, dir, &shmem_acl_ops); | ||
155 | } | ||
156 | |||
157 | /** | ||
158 | * shmem_check_acl - check_acl() callback for generic_permission() | ||
159 | */ | ||
160 | int | ||
161 | shmem_check_acl(struct inode *inode, int mask) | ||
162 | { | ||
163 | struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); | ||
164 | |||
165 | if (acl) { | ||
166 | int error = posix_acl_permission(inode, acl, mask); | ||
167 | posix_acl_release(acl); | ||
168 | return error; | ||
169 | } | ||
170 | return -EAGAIN; | ||
171 | } | ||
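Nothing from the deleted file survives as shmem-specific code: the i_lock-protected caching that shmem_get_acl()/shmem_set_acl() open-coded above is exactly what the common get_cached_acl()/set_cached_acl() helpers do for any filesystem keeping its ACLs in inode->i_acl and inode->i_default_acl, which is why fs/generic_acl.c can be wired up directly and the whole generic_acl_operations indirection dropped.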
diff --git a/mm/truncate.c b/mm/truncate.c index 2c147a7e5f2c..342deee22684 100644 --- a/mm/truncate.c +++ b/mm/truncate.c | |||
@@ -272,6 +272,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
272 | pagevec_release(&pvec); | 272 | pagevec_release(&pvec); |
273 | break; | 273 | break; |
274 | } | 274 | } |
275 | mem_cgroup_uncharge_start(); | ||
275 | for (i = 0; i < pagevec_count(&pvec); i++) { | 276 | for (i = 0; i < pagevec_count(&pvec); i++) { |
276 | struct page *page = pvec.pages[i]; | 277 | struct page *page = pvec.pages[i]; |
277 | 278 | ||
@@ -286,6 +287,7 @@ void truncate_inode_pages_range(struct address_space *mapping, | |||
286 | unlock_page(page); | 287 | unlock_page(page); |
287 | } | 288 | } |
288 | pagevec_release(&pvec); | 289 | pagevec_release(&pvec); |
290 | mem_cgroup_uncharge_end(); | ||
289 | } | 291 | } |
290 | } | 292 | } |
291 | EXPORT_SYMBOL(truncate_inode_pages_range); | 293 | EXPORT_SYMBOL(truncate_inode_pages_range); |
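The truncate.c hunks all follow one pattern: each batch of pagevec releases is bracketed by mem_cgroup_uncharge_start()/mem_cgroup_uncharge_end(), so memcg charges freed inside the section are accumulated per task and pushed to the res_counter once, rather than once per page (this pairs with the coalescing code added to mm/memcontrol.c in the same series). A self-contained sketch of the idea, with illustrative names rather than the kernel's:

	struct batch_info {
		int do_batch;		/* nesting depth of start/end sections */
		unsigned long bytes;	/* uncharges accumulated while batching */
	};

	static void batch_start(struct batch_info *b)
	{
		if (b->do_batch++ == 0)	/* outermost section: reset accumulator */
			b->bytes = 0;
	}

	static void batch_uncharge(struct batch_info *b, unsigned long *counter,
				   unsigned long nr)
	{
		if (b->do_batch)
			b->bytes += nr;	/* defer: no shared counter touched */
		else
			*counter -= nr;	/* unbatched path: immediate update */
	}

	static void batch_end(struct batch_info *b, unsigned long *counter)
	{
		if (--b->do_batch)
			return;		/* still inside a nested section */
		*counter -= b->bytes;	/* one bulk update for the whole pagevec */
	}

The same start/end bracket wraps the lookup loops in invalidate_mapping_pages() and invalidate_inode_pages2_range() below.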
@@ -327,6 +329,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
327 | pagevec_init(&pvec, 0); | 329 | pagevec_init(&pvec, 0); |
328 | while (next <= end && | 330 | while (next <= end && |
329 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { | 331 | pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { |
332 | mem_cgroup_uncharge_start(); | ||
330 | for (i = 0; i < pagevec_count(&pvec); i++) { | 333 | for (i = 0; i < pagevec_count(&pvec); i++) { |
331 | struct page *page = pvec.pages[i]; | 334 | struct page *page = pvec.pages[i]; |
332 | pgoff_t index; | 335 | pgoff_t index; |
@@ -354,6 +357,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, | |||
354 | break; | 357 | break; |
355 | } | 358 | } |
356 | pagevec_release(&pvec); | 359 | pagevec_release(&pvec); |
360 | mem_cgroup_uncharge_end(); | ||
357 | cond_resched(); | 361 | cond_resched(); |
358 | } | 362 | } |
359 | return ret; | 363 | return ret; |
@@ -428,6 +432,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
428 | while (next <= end && !wrapped && | 432 | while (next <= end && !wrapped && |
429 | pagevec_lookup(&pvec, mapping, next, | 433 | pagevec_lookup(&pvec, mapping, next, |
430 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { | 434 | min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { |
435 | mem_cgroup_uncharge_start(); | ||
431 | for (i = 0; i < pagevec_count(&pvec); i++) { | 436 | for (i = 0; i < pagevec_count(&pvec); i++) { |
432 | struct page *page = pvec.pages[i]; | 437 | struct page *page = pvec.pages[i]; |
433 | pgoff_t page_index; | 438 | pgoff_t page_index; |
@@ -477,6 +482,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, | |||
477 | unlock_page(page); | 482 | unlock_page(page); |
478 | } | 483 | } |
479 | pagevec_release(&pvec); | 484 | pagevec_release(&pvec); |
485 | mem_cgroup_uncharge_end(); | ||
480 | cond_resched(); | 486 | cond_resched(); |
481 | } | 487 | } |
482 | return ret; | 488 | return ret; |