| author | Linus Torvalds <torvalds@linux-foundation.org> | 2013-10-17 00:36:03 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2013-10-17 00:36:03 -0400 |
| commit | 056cdce0d3a214158f3a4ea40887b22639f855a8 (patch) | |
| tree | 8ced4ccf6c7bac7eef49710c3cdfb0745cc85102 | |
| parent | 0056019da4b7ee5ab51fb174fe0655278578516f (diff) | |
| parent | 57a8f0cdb87da776bf0e4ce7554a9133854fa779 (diff) | |
Merge branch 'akpm' (fixes from Andrew Morton)
Merge misc fixes from Andrew Morton.
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (21 commits)
mm: revert mremap pud_free anti-fix
mm: fix BUG in __split_huge_page_pmd
swap: fix set_blocksize race during swapon/swapoff
procfs: call default get_unmapped_area on MMU-present architectures
procfs: fix unintended truncation of returned mapped address
writeback: fix negative bdi max pause
percpu_refcount: export symbols
fs: buffer: move allocation failure loop into the allocator
mm: memcg: handle non-error OOM situations more gracefully
tools/testing/selftests: fix uninitialized variable
block/partitions/efi.c: treat size mismatch as a warning, not an error
mm: hugetlb: initialize PG_reserved for tail pages of gigantic compound pages
mm/zswap: bugfix: memory leak when re-swapon
mm: /proc/pid/pagemap: inspect _PAGE_SOFT_DIRTY only on present pages
mm: migration: do not lose soft dirty bit if page is in migration state
gcov: MAINTAINERS: Add an entry for gcov
mm/hugetlb.c: correct missing private flag clearing
mm/vmscan.c: don't forget to free shrinker->nr_deferred
ipc/sem.c: synchronize semop and semctl with IPC_RMID
ipc: update locking scheme comments
...
| -rw-r--r-- | MAINTAINERS | 6 | ||||
| -rw-r--r-- | block/partitions/efi.c | 7 | ||||
| -rw-r--r-- | fs/buffer.c | 14 | ||||
| -rw-r--r-- | fs/proc/inode.c | 10 | ||||
| -rw-r--r-- | fs/proc/task_mmu.c | 4 | ||||
| -rw-r--r-- | include/linux/memcontrol.h | 50 | ||||
| -rw-r--r-- | include/linux/sched.h | 7 | ||||
| -rw-r--r-- | ipc/sem.c | 42 | ||||
| -rw-r--r-- | ipc/util.c | 27 | ||||
| -rw-r--r-- | lib/percpu-refcount.c | 3 | ||||
| -rw-r--r-- | mm/filemap.c | 11 | ||||
| -rw-r--r-- | mm/huge_memory.c | 10 | ||||
| -rw-r--r-- | mm/hugetlb.c | 17 | ||||
| -rw-r--r-- | mm/memcontrol.c | 143 | ||||
| -rw-r--r-- | mm/memory.c | 20 | ||||
| -rw-r--r-- | mm/migrate.c | 2 | ||||
| -rw-r--r-- | mm/mprotect.c | 7 | ||||
| -rw-r--r-- | mm/mremap.c | 5 | ||||
| -rw-r--r-- | mm/oom_kill.c | 2 | ||||
| -rw-r--r-- | mm/page-writeback.c | 10 | ||||
| -rw-r--r-- | mm/swapfile.c | 4 | ||||
| -rw-r--r-- | mm/vmscan.c | 1 | ||||
| -rw-r--r-- | mm/zswap.c | 4 | ||||
| -rw-r--r-- | tools/testing/selftests/timers/posix_timers.c | 2 |
24 files changed, 219 insertions, 189 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index 72b1e5c2378a..a7c34ef3509d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS | |||
| @@ -3624,6 +3624,12 @@ L: linux-scsi@vger.kernel.org | |||
| 3624 | S: Odd Fixes (e.g., new signatures) | 3624 | S: Odd Fixes (e.g., new signatures) |
| 3625 | F: drivers/scsi/fdomain.* | 3625 | F: drivers/scsi/fdomain.* |
| 3626 | 3626 | ||
| 3627 | GCOV BASED KERNEL PROFILING | ||
| 3628 | M: Peter Oberparleiter <oberpar@linux.vnet.ibm.com> | ||
| 3629 | S: Maintained | ||
| 3630 | F: kernel/gcov/ | ||
| 3631 | F: Documentation/gcov.txt | ||
| 3632 | |||
| 3627 | GDT SCSI DISK ARRAY CONTROLLER DRIVER | 3633 | GDT SCSI DISK ARRAY CONTROLLER DRIVER |
| 3628 | M: Achim Leubner <achim_leubner@adaptec.com> | 3634 | M: Achim Leubner <achim_leubner@adaptec.com> |
| 3629 | L: linux-scsi@vger.kernel.org | 3635 | L: linux-scsi@vger.kernel.org |
diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 1eb09ee5311b..a8287b49d062 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c | |||
| @@ -222,11 +222,16 @@ check_hybrid: | |||
| 222 | * the disk size. | 222 | * the disk size. |
| 223 | * | 223 | * |
| 224 | * Hybrid MBRs do not necessarily comply with this. | 224 | * Hybrid MBRs do not necessarily comply with this. |
| 225 | * | ||
| 226 | * Consider a bad value here to be a warning to support dd'ing | ||
| 227 | * an image from a smaller disk to a larger disk. | ||
| 225 | */ | 228 | */ |
| 226 | if (ret == GPT_MBR_PROTECTIVE) { | 229 | if (ret == GPT_MBR_PROTECTIVE) { |
| 227 | sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); | 230 | sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); |
| 228 | if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) | 231 | if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) |
| 229 | ret = 0; | 232 | pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", |
| 233 | sz, min_t(uint32_t, | ||
| 234 | total_sectors - 1, 0xFFFFFFFF)); | ||
| 230 | } | 235 | } |
| 231 | done: | 236 | done: |
| 232 | return ret; | 237 | return ret; |
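The new comment and pr_debug() above implement the commit's reasoning: an image dd'd from a smaller disk to a larger one carries a protective MBR whose size_in_lba still describes the old disk, so the mismatch is now only reported, not treated as a failure. A stand-alone sketch of that check with contrived sector counts and no on-disk structures (nothing here is the kernel's code):

```c
#include <stdint.h>
#include <stdio.h>

/* Illustrative only: a protective MBR partition record normally covers the
 * whole disk (total_sectors - 1) or is capped at 0xFFFFFFFF. */
static int check_protective_size(uint32_t size_in_lba, uint64_t total_sectors)
{
	uint32_t expected = (total_sectors - 1 > 0xFFFFFFFFULL)
				? 0xFFFFFFFFU : (uint32_t)(total_sectors - 1);

	if (size_in_lba != (uint32_t)(total_sectors - 1) &&
	    size_in_lba != 0xFFFFFFFFU) {
		/* Mismatch: warn instead of rejecting the GPT, so an image
		 * copied onto a bigger disk still gets its partitions. */
		fprintf(stderr,
			"GPT: mbr size in lba (%u) different than whole disk (%u).\n",
			size_in_lba, expected);
	}
	return 1; /* still treat the MBR as protective */
}

int main(void)
{
	/* Image from a 1 GiB disk (2097152 sectors) dd'd onto a 4 GiB disk. */
	check_protective_size(2097151, 8388608);
	return 0;
}
```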
diff --git a/fs/buffer.c b/fs/buffer.c index 4d7433534f5c..6024877335ca 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -1005,9 +1005,19 @@ grow_dev_page(struct block_device *bdev, sector_t block, | |||
| 1005 | struct buffer_head *bh; | 1005 | struct buffer_head *bh; |
| 1006 | sector_t end_block; | 1006 | sector_t end_block; |
| 1007 | int ret = 0; /* Will call free_more_memory() */ | 1007 | int ret = 0; /* Will call free_more_memory() */ |
| 1008 | gfp_t gfp_mask; | ||
| 1008 | 1009 | ||
| 1009 | page = find_or_create_page(inode->i_mapping, index, | 1010 | gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; |
| 1010 | (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS)|__GFP_MOVABLE); | 1011 | gfp_mask |= __GFP_MOVABLE; |
| 1012 | /* | ||
| 1013 | * XXX: __getblk_slow() can not really deal with failure and | ||
| 1014 | * will endlessly loop on improvised global reclaim. Prefer | ||
| 1015 | * looping in the allocator rather than here, at least that | ||
| 1016 | * code knows what it's doing. | ||
| 1017 | */ | ||
| 1018 | gfp_mask |= __GFP_NOFAIL; | ||
| 1019 | |||
| 1020 | page = find_or_create_page(inode->i_mapping, index, gfp_mask); | ||
| 1011 | if (!page) | 1021 | if (!page) |
| 1012 | return ret; | 1022 | return ret; |
| 1013 | 1023 | ||
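For readers unfamiliar with __GFP_NOFAIL: the point of the hunk above is to move the retry out of fs/buffer.c and into the page allocator, which actually understands reclaim. The toy program below mimics that division of labour with an invented ALLOC_NOFAIL flag and plain malloc(); it is an analogy, not the kernel's allocator:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Toy allocator flag, analogous in spirit to __GFP_NOFAIL: the allocator
 * itself retries until it succeeds, so callers need no retry loop. */
#define ALLOC_NOFAIL 0x1u

static void *try_alloc(size_t size)
{
	return malloc(size);
}

static void *alloc(size_t size, unsigned int flags)
{
	void *p = try_alloc(size);

	while (!p && (flags & ALLOC_NOFAIL)) {
		/* The allocator knows how to wait for / trigger reclaim;
		 * callers looping here with ad-hoc "free more memory"
		 * heuristics is what the hunk above removes. */
		p = try_alloc(size);
	}
	return p;
}

int main(void)
{
	char *buf = alloc(64, ALLOC_NOFAIL);

	strcpy(buf, "allocation succeeds or the allocator keeps trying");
	puts(buf);
	free(buf);
	return 0;
}
```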
diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 9f8ef9b7674d..8eaa1ba793fc 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c | |||
| @@ -288,10 +288,14 @@ static int proc_reg_mmap(struct file *file, struct vm_area_struct *vma) | |||
| 288 | static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) | 288 | static unsigned long proc_reg_get_unmapped_area(struct file *file, unsigned long orig_addr, unsigned long len, unsigned long pgoff, unsigned long flags) |
| 289 | { | 289 | { |
| 290 | struct proc_dir_entry *pde = PDE(file_inode(file)); | 290 | struct proc_dir_entry *pde = PDE(file_inode(file)); |
| 291 | int rv = -EIO; | 291 | unsigned long rv = -EIO; |
| 292 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); | 292 | unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long) = NULL; |
| 293 | if (use_pde(pde)) { | 293 | if (use_pde(pde)) { |
| 294 | get_unmapped_area = pde->proc_fops->get_unmapped_area; | 294 | #ifdef CONFIG_MMU |
| 295 | get_unmapped_area = current->mm->get_unmapped_area; | ||
| 296 | #endif | ||
| 297 | if (pde->proc_fops->get_unmapped_area) | ||
| 298 | get_unmapped_area = pde->proc_fops->get_unmapped_area; | ||
| 295 | if (get_unmapped_area) | 299 | if (get_unmapped_area) |
| 296 | rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); | 300 | rv = get_unmapped_area(file, orig_addr, len, pgoff, flags); |
| 297 | unuse_pde(pde); | 301 | unuse_pde(pde); |
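The second procfs fix in this hunk is about the type of rv: get_unmapped_area() hands back an unsigned long address, and funnelling it through an int silently drops the upper bits on 64-bit kernels. A small stand-alone demonstration of that truncation (the address value is arbitrary, and the conversion behaviour shown is the usual wrap on common compilers):

```c
#include <stdio.h>

/* Demo of why storing a 64-bit address in an int is wrong: the value
 * handed back to the caller loses its upper 32 bits. */
int main(void)
{
	unsigned long addr = 0x7f3a12345000UL; /* typical 64-bit mmap address */
	int truncated = (int)addr;             /* what `int rv = ...` did */
	unsigned long widened = (unsigned long)truncated;

	printf("real address:   %#lx\n", addr);
	printf("through an int: %#lx\n", widened);
	return 0;
}
```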
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7366e9d63cee..390bdab01c3c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
| @@ -941,6 +941,8 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | |||
| 941 | frame = pte_pfn(pte); | 941 | frame = pte_pfn(pte); |
| 942 | flags = PM_PRESENT; | 942 | flags = PM_PRESENT; |
| 943 | page = vm_normal_page(vma, addr, pte); | 943 | page = vm_normal_page(vma, addr, pte); |
| 944 | if (pte_soft_dirty(pte)) | ||
| 945 | flags2 |= __PM_SOFT_DIRTY; | ||
| 944 | } else if (is_swap_pte(pte)) { | 946 | } else if (is_swap_pte(pte)) { |
| 945 | swp_entry_t entry; | 947 | swp_entry_t entry; |
| 946 | if (pte_swp_soft_dirty(pte)) | 948 | if (pte_swp_soft_dirty(pte)) |
| @@ -960,7 +962,7 @@ static void pte_to_pagemap_entry(pagemap_entry_t *pme, struct pagemapread *pm, | |||
| 960 | 962 | ||
| 961 | if (page && !PageAnon(page)) | 963 | if (page && !PageAnon(page)) |
| 962 | flags |= PM_FILE; | 964 | flags |= PM_FILE; |
| 963 | if ((vma->vm_flags & VM_SOFTDIRTY) || pte_soft_dirty(pte)) | 965 | if ((vma->vm_flags & VM_SOFTDIRTY)) |
| 964 | flags2 |= __PM_SOFT_DIRTY; | 966 | flags2 |= __PM_SOFT_DIRTY; |
| 965 | 967 | ||
| 966 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); | 968 | *pme = make_pme(PM_PFRAME(frame) | PM_STATUS2(pm->v2, flags2) | flags); |
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ecc82b37c4cc..b3e7a667e03c 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h | |||
| @@ -137,47 +137,24 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, | |||
| 137 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, | 137 | extern void mem_cgroup_replace_page_cache(struct page *oldpage, |
| 138 | struct page *newpage); | 138 | struct page *newpage); |
| 139 | 139 | ||
| 140 | /** | 140 | static inline void mem_cgroup_oom_enable(void) |
| 141 | * mem_cgroup_toggle_oom - toggle the memcg OOM killer for the current task | ||
| 142 | * @new: true to enable, false to disable | ||
| 143 | * | ||
| 144 | * Toggle whether a failed memcg charge should invoke the OOM killer | ||
| 145 | * or just return -ENOMEM. Returns the previous toggle state. | ||
| 146 | * | ||
| 147 | * NOTE: Any path that enables the OOM killer before charging must | ||
| 148 | * call mem_cgroup_oom_synchronize() afterward to finalize the | ||
| 149 | * OOM handling and clean up. | ||
| 150 | */ | ||
| 151 | static inline bool mem_cgroup_toggle_oom(bool new) | ||
| 152 | { | 141 | { |
| 153 | bool old; | 142 | WARN_ON(current->memcg_oom.may_oom); |
| 154 | 143 | current->memcg_oom.may_oom = 1; | |
| 155 | old = current->memcg_oom.may_oom; | ||
| 156 | current->memcg_oom.may_oom = new; | ||
| 157 | |||
| 158 | return old; | ||
| 159 | } | 144 | } |
| 160 | 145 | ||
| 161 | static inline void mem_cgroup_enable_oom(void) | 146 | static inline void mem_cgroup_oom_disable(void) |
| 162 | { | 147 | { |
| 163 | bool old = mem_cgroup_toggle_oom(true); | 148 | WARN_ON(!current->memcg_oom.may_oom); |
| 164 | 149 | current->memcg_oom.may_oom = 0; | |
| 165 | WARN_ON(old == true); | ||
| 166 | } | ||
| 167 | |||
| 168 | static inline void mem_cgroup_disable_oom(void) | ||
| 169 | { | ||
| 170 | bool old = mem_cgroup_toggle_oom(false); | ||
| 171 | |||
| 172 | WARN_ON(old == false); | ||
| 173 | } | 150 | } |
| 174 | 151 | ||
| 175 | static inline bool task_in_memcg_oom(struct task_struct *p) | 152 | static inline bool task_in_memcg_oom(struct task_struct *p) |
| 176 | { | 153 | { |
| 177 | return p->memcg_oom.in_memcg_oom; | 154 | return p->memcg_oom.memcg; |
| 178 | } | 155 | } |
| 179 | 156 | ||
| 180 | bool mem_cgroup_oom_synchronize(void); | 157 | bool mem_cgroup_oom_synchronize(bool wait); |
| 181 | 158 | ||
| 182 | #ifdef CONFIG_MEMCG_SWAP | 159 | #ifdef CONFIG_MEMCG_SWAP |
| 183 | extern int do_swap_account; | 160 | extern int do_swap_account; |
| @@ -402,16 +379,11 @@ static inline void mem_cgroup_end_update_page_stat(struct page *page, | |||
| 402 | { | 379 | { |
| 403 | } | 380 | } |
| 404 | 381 | ||
| 405 | static inline bool mem_cgroup_toggle_oom(bool new) | 382 | static inline void mem_cgroup_oom_enable(void) |
| 406 | { | ||
| 407 | return false; | ||
| 408 | } | ||
| 409 | |||
| 410 | static inline void mem_cgroup_enable_oom(void) | ||
| 411 | { | 383 | { |
| 412 | } | 384 | } |
| 413 | 385 | ||
| 414 | static inline void mem_cgroup_disable_oom(void) | 386 | static inline void mem_cgroup_oom_disable(void) |
| 415 | { | 387 | { |
| 416 | } | 388 | } |
| 417 | 389 | ||
| @@ -420,7 +392,7 @@ static inline bool task_in_memcg_oom(struct task_struct *p) | |||
| 420 | return false; | 392 | return false; |
| 421 | } | 393 | } |
| 422 | 394 | ||
| 423 | static inline bool mem_cgroup_oom_synchronize(void) | 395 | static inline bool mem_cgroup_oom_synchronize(bool wait) |
| 424 | { | 396 | { |
| 425 | return false; | 397 | return false; |
| 426 | } | 398 | } |
diff --git a/include/linux/sched.h b/include/linux/sched.h index 6682da36b293..e27baeeda3f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h | |||
| @@ -1394,11 +1394,10 @@ struct task_struct { | |||
| 1394 | } memcg_batch; | 1394 | } memcg_batch; |
| 1395 | unsigned int memcg_kmem_skip_account; | 1395 | unsigned int memcg_kmem_skip_account; |
| 1396 | struct memcg_oom_info { | 1396 | struct memcg_oom_info { |
| 1397 | struct mem_cgroup *memcg; | ||
| 1398 | gfp_t gfp_mask; | ||
| 1399 | int order; | ||
| 1397 | unsigned int may_oom:1; | 1400 | unsigned int may_oom:1; |
| 1398 | unsigned int in_memcg_oom:1; | ||
| 1399 | unsigned int oom_locked:1; | ||
| 1400 | int wakeups; | ||
| 1401 | struct mem_cgroup *wait_on_memcg; | ||
| 1402 | } memcg_oom; | 1401 | } memcg_oom; |
| 1403 | #endif | 1402 | #endif |
| 1404 | #ifdef CONFIG_UPROBES | 1403 | #ifdef CONFIG_UPROBES |
diff --git a/ipc/sem.c b/ipc/sem.c --- a/ipc/sem.c +++ b/ipc/sem.c | |||
| @@ -1282,6 +1282,12 @@ static int semctl_setval(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1282 | 1282 | ||
| 1283 | sem_lock(sma, NULL, -1); | 1283 | sem_lock(sma, NULL, -1); |
| 1284 | 1284 | ||
| 1285 | if (sma->sem_perm.deleted) { | ||
| 1286 | sem_unlock(sma, -1); | ||
| 1287 | rcu_read_unlock(); | ||
| 1288 | return -EIDRM; | ||
| 1289 | } | ||
| 1290 | |||
| 1285 | curr = &sma->sem_base[semnum]; | 1291 | curr = &sma->sem_base[semnum]; |
| 1286 | 1292 | ||
| 1287 | ipc_assert_locked_object(&sma->sem_perm); | 1293 | ipc_assert_locked_object(&sma->sem_perm); |
| @@ -1336,12 +1342,14 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1336 | int i; | 1342 | int i; |
| 1337 | 1343 | ||
| 1338 | sem_lock(sma, NULL, -1); | 1344 | sem_lock(sma, NULL, -1); |
| 1345 | if (sma->sem_perm.deleted) { | ||
| 1346 | err = -EIDRM; | ||
| 1347 | goto out_unlock; | ||
| 1348 | } | ||
| 1339 | if(nsems > SEMMSL_FAST) { | 1349 | if(nsems > SEMMSL_FAST) { |
| 1340 | if (!ipc_rcu_getref(sma)) { | 1350 | if (!ipc_rcu_getref(sma)) { |
| 1341 | sem_unlock(sma, -1); | ||
| 1342 | rcu_read_unlock(); | ||
| 1343 | err = -EIDRM; | 1351 | err = -EIDRM; |
| 1344 | goto out_free; | 1352 | goto out_unlock; |
| 1345 | } | 1353 | } |
| 1346 | sem_unlock(sma, -1); | 1354 | sem_unlock(sma, -1); |
| 1347 | rcu_read_unlock(); | 1355 | rcu_read_unlock(); |
| @@ -1354,10 +1362,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1354 | rcu_read_lock(); | 1362 | rcu_read_lock(); |
| 1355 | sem_lock_and_putref(sma); | 1363 | sem_lock_and_putref(sma); |
| 1356 | if (sma->sem_perm.deleted) { | 1364 | if (sma->sem_perm.deleted) { |
| 1357 | sem_unlock(sma, -1); | ||
| 1358 | rcu_read_unlock(); | ||
| 1359 | err = -EIDRM; | 1365 | err = -EIDRM; |
| 1360 | goto out_free; | 1366 | goto out_unlock; |
| 1361 | } | 1367 | } |
| 1362 | } | 1368 | } |
| 1363 | for (i = 0; i < sma->sem_nsems; i++) | 1369 | for (i = 0; i < sma->sem_nsems; i++) |
| @@ -1375,8 +1381,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1375 | struct sem_undo *un; | 1381 | struct sem_undo *un; |
| 1376 | 1382 | ||
| 1377 | if (!ipc_rcu_getref(sma)) { | 1383 | if (!ipc_rcu_getref(sma)) { |
| 1378 | rcu_read_unlock(); | 1384 | err = -EIDRM; |
| 1379 | return -EIDRM; | 1385 | goto out_rcu_wakeup; |
| 1380 | } | 1386 | } |
| 1381 | rcu_read_unlock(); | 1387 | rcu_read_unlock(); |
| 1382 | 1388 | ||
| @@ -1404,10 +1410,8 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1404 | rcu_read_lock(); | 1410 | rcu_read_lock(); |
| 1405 | sem_lock_and_putref(sma); | 1411 | sem_lock_and_putref(sma); |
| 1406 | if (sma->sem_perm.deleted) { | 1412 | if (sma->sem_perm.deleted) { |
| 1407 | sem_unlock(sma, -1); | ||
| 1408 | rcu_read_unlock(); | ||
| 1409 | err = -EIDRM; | 1413 | err = -EIDRM; |
| 1410 | goto out_free; | 1414 | goto out_unlock; |
| 1411 | } | 1415 | } |
| 1412 | 1416 | ||
| 1413 | for (i = 0; i < nsems; i++) | 1417 | for (i = 0; i < nsems; i++) |
| @@ -1431,6 +1435,10 @@ static int semctl_main(struct ipc_namespace *ns, int semid, int semnum, | |||
| 1431 | goto out_rcu_wakeup; | 1435 | goto out_rcu_wakeup; |
| 1432 | 1436 | ||
| 1433 | sem_lock(sma, NULL, -1); | 1437 | sem_lock(sma, NULL, -1); |
| 1438 | if (sma->sem_perm.deleted) { | ||
| 1439 | err = -EIDRM; | ||
| 1440 | goto out_unlock; | ||
| 1441 | } | ||
| 1434 | curr = &sma->sem_base[semnum]; | 1442 | curr = &sma->sem_base[semnum]; |
| 1435 | 1443 | ||
| 1436 | switch (cmd) { | 1444 | switch (cmd) { |
| @@ -1836,6 +1844,10 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1836 | if (error) | 1844 | if (error) |
| 1837 | goto out_rcu_wakeup; | 1845 | goto out_rcu_wakeup; |
| 1838 | 1846 | ||
| 1847 | error = -EIDRM; | ||
| 1848 | locknum = sem_lock(sma, sops, nsops); | ||
| 1849 | if (sma->sem_perm.deleted) | ||
| 1850 | goto out_unlock_free; | ||
| 1839 | /* | 1851 | /* |
| 1840 | * semid identifiers are not unique - find_alloc_undo may have | 1852 | * semid identifiers are not unique - find_alloc_undo may have |
| 1841 | * allocated an undo structure, it was invalidated by an RMID | 1853 | * allocated an undo structure, it was invalidated by an RMID |
| @@ -1843,8 +1855,6 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops, | |||
| 1843 | * This case can be detected checking un->semid. The existence of | 1855 | * This case can be detected checking un->semid. The existence of |
| 1844 | * "un" itself is guaranteed by rcu. | 1856 | * "un" itself is guaranteed by rcu. |
| 1845 | */ | 1857 | */ |
| 1846 | error = -EIDRM; | ||
| 1847 | locknum = sem_lock(sma, sops, nsops); | ||
| 1848 | if (un && un->semid == -1) | 1858 | if (un && un->semid == -1) |
| 1849 | goto out_unlock_free; | 1859 | goto out_unlock_free; |
| 1850 | 1860 | ||
| @@ -2057,6 +2067,12 @@ void exit_sem(struct task_struct *tsk) | |||
| 2057 | } | 2067 | } |
| 2058 | 2068 | ||
| 2059 | sem_lock(sma, NULL, -1); | 2069 | sem_lock(sma, NULL, -1); |
| 2070 | /* exit_sem raced with IPC_RMID, nothing to do */ | ||
| 2071 | if (sma->sem_perm.deleted) { | ||
| 2072 | sem_unlock(sma, -1); | ||
| 2073 | rcu_read_unlock(); | ||
| 2074 | continue; | ||
| 2075 | } | ||
| 2060 | un = __lookup_undo(ulp, semid); | 2076 | un = __lookup_undo(ulp, semid); |
| 2061 | if (un == NULL) { | 2077 | if (un == NULL) { |
| 2062 | /* exit_sem raced with IPC_RMID+semget() that created | 2078 | /* exit_sem raced with IPC_RMID+semget() that created |
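Every ipc/sem.c hunk above adds the same re-check: after (re)taking the per-object lock under RCU, sma->sem_perm.deleted must be tested again, because IPC_RMID can run in the window where the lock was not held. A user-space sketch of that lookup/lock/re-check shape, using a pthread mutex and an invented struct object rather than the kernel's primitives:

```c
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative object: the lock lives inside the object, so a concurrent
 * "remove" can mark it deleted before we manage to take the lock. */
struct object {
	pthread_mutex_t lock;
	bool deleted;
	int value;
};

static int object_setval(struct object *obj, int value)
{
	pthread_mutex_lock(&obj->lock);

	/* The check the ipc/sem.c hunks add: the object may have been
	 * removed between lookup and lock acquisition. */
	if (obj->deleted) {
		pthread_mutex_unlock(&obj->lock);
		return -EIDRM;
	}

	obj->value = value;
	pthread_mutex_unlock(&obj->lock);
	return 0;
}

int main(void)
{
	struct object obj = { PTHREAD_MUTEX_INITIALIZER, false, 0 };

	printf("setval: %d\n", object_setval(&obj, 42));
	obj.deleted = true;	/* simulate IPC_RMID */
	printf("setval after removal: %d\n", object_setval(&obj, 43));
	return 0;
}
```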
diff --git a/ipc/util.c b/ipc/util.c index fdb8ae740775..7684f41bce76 100644 --- a/ipc/util.c +++ b/ipc/util.c | |||
| @@ -17,12 +17,27 @@ | |||
| 17 | * Pavel Emelianov <xemul@openvz.org> | 17 | * Pavel Emelianov <xemul@openvz.org> |
| 18 | * | 18 | * |
| 19 | * General sysv ipc locking scheme: | 19 | * General sysv ipc locking scheme: |
| 20 | * when doing ipc id lookups, take the ids->rwsem | 20 | * rcu_read_lock() |
| 21 | * rcu_read_lock() | 21 | * obtain the ipc object (kern_ipc_perm) by looking up the id in an idr |
| 22 | * obtain the ipc object (kern_ipc_perm) | 22 | * tree. |
| 23 | * perform security, capabilities, auditing and permission checks, etc. | 23 | * - perform initial checks (capabilities, auditing and permission, |
| 24 | * acquire the ipc lock (kern_ipc_perm.lock) throught ipc_lock_object() | 24 | * etc). |
| 25 | * perform data updates (ie: SET, RMID, LOCK/UNLOCK commands) | 25 | * - perform read-only operations, such as STAT, INFO commands. |
| 26 | * acquire the ipc lock (kern_ipc_perm.lock) through | ||
| 27 | * ipc_lock_object() | ||
| 28 | * - perform data updates, such as SET, RMID commands and | ||
| 29 | * mechanism-specific operations (semop/semtimedop, | ||
| 30 | * msgsnd/msgrcv, shmat/shmdt). | ||
| 31 | * drop the ipc lock, through ipc_unlock_object(). | ||
| 32 | * rcu_read_unlock() | ||
| 33 | * | ||
| 34 | * The ids->rwsem must be taken when: | ||
| 35 | * - creating, removing and iterating the existing entries in ipc | ||
| 36 | * identifier sets. | ||
| 37 | * - iterating through files under /proc/sysvipc/ | ||
| 38 | * | ||
| 39 | * Note that sems have a special fast path that avoids kern_ipc_perm.lock - | ||
| 40 | * see sem_lock(). | ||
| 26 | */ | 41 | */ |
| 27 | 42 | ||
| 28 | #include <linux/mm.h> | 43 | #include <linux/mm.h> |
diff --git a/lib/percpu-refcount.c b/lib/percpu-refcount.c index 7deeb6297a48..1a53d497a8c5 100644 --- a/lib/percpu-refcount.c +++ b/lib/percpu-refcount.c | |||
| @@ -53,6 +53,7 @@ int percpu_ref_init(struct percpu_ref *ref, percpu_ref_func_t *release) | |||
| 53 | ref->release = release; | 53 | ref->release = release; |
| 54 | return 0; | 54 | return 0; |
| 55 | } | 55 | } |
| 56 | EXPORT_SYMBOL_GPL(percpu_ref_init); | ||
| 56 | 57 | ||
| 57 | /** | 58 | /** |
| 58 | * percpu_ref_cancel_init - cancel percpu_ref_init() | 59 | * percpu_ref_cancel_init - cancel percpu_ref_init() |
| @@ -84,6 +85,7 @@ void percpu_ref_cancel_init(struct percpu_ref *ref) | |||
| 84 | free_percpu(ref->pcpu_count); | 85 | free_percpu(ref->pcpu_count); |
| 85 | } | 86 | } |
| 86 | } | 87 | } |
| 88 | EXPORT_SYMBOL_GPL(percpu_ref_cancel_init); | ||
| 87 | 89 | ||
| 88 | static void percpu_ref_kill_rcu(struct rcu_head *rcu) | 90 | static void percpu_ref_kill_rcu(struct rcu_head *rcu) |
| 89 | { | 91 | { |
| @@ -156,3 +158,4 @@ void percpu_ref_kill_and_confirm(struct percpu_ref *ref, | |||
| 156 | 158 | ||
| 157 | call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); | 159 | call_rcu_sched(&ref->rcu, percpu_ref_kill_rcu); |
| 158 | } | 160 | } |
| 161 | EXPORT_SYMBOL_GPL(percpu_ref_kill_and_confirm); | ||
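The exports matter because percpu_ref users can be modular. A hypothetical module-side sketch against the 3.12-era API visible in this hunk (percpu_ref_init() taking only the ref and a release callback); the module and all of its names are invented for illustration, and percpu_ref_kill() is the inline wrapper around the newly exported percpu_ref_kill_and_confirm():

```c
#include <linux/module.h>
#include <linux/percpu-refcount.h>

static struct percpu_ref example_ref;

/* Called once all references are gone after percpu_ref_kill(). */
static void example_release(struct percpu_ref *ref)
{
	pr_info("example_ref released\n");
}

static int __init example_init(void)
{
	/* Needs the EXPORT_SYMBOL_GPL(percpu_ref_init) added above. */
	return percpu_ref_init(&example_ref, example_release);
}

static void __exit example_exit(void)
{
	/* Wraps percpu_ref_kill_and_confirm(), also exported above. */
	percpu_ref_kill(&example_ref);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");	/* required for the _GPL exports */
```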
diff --git a/mm/filemap.c b/mm/filemap.c index 1e6aec4a2d2e..ae4846ff4849 100644 --- a/mm/filemap.c +++ b/mm/filemap.c | |||
| @@ -1616,7 +1616,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1616 | struct inode *inode = mapping->host; | 1616 | struct inode *inode = mapping->host; |
| 1617 | pgoff_t offset = vmf->pgoff; | 1617 | pgoff_t offset = vmf->pgoff; |
| 1618 | struct page *page; | 1618 | struct page *page; |
| 1619 | bool memcg_oom; | ||
| 1620 | pgoff_t size; | 1619 | pgoff_t size; |
| 1621 | int ret = 0; | 1620 | int ret = 0; |
| 1622 | 1621 | ||
| @@ -1625,11 +1624,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1625 | return VM_FAULT_SIGBUS; | 1624 | return VM_FAULT_SIGBUS; |
| 1626 | 1625 | ||
| 1627 | /* | 1626 | /* |
| 1628 | * Do we have something in the page cache already? Either | 1627 | * Do we have something in the page cache already? |
| 1629 | * way, try readahead, but disable the memcg OOM killer for it | ||
| 1630 | * as readahead is optional and no errors are propagated up | ||
| 1631 | * the fault stack. The OOM killer is enabled while trying to | ||
| 1632 | * instantiate the faulting page individually below. | ||
| 1633 | */ | 1628 | */ |
| 1634 | page = find_get_page(mapping, offset); | 1629 | page = find_get_page(mapping, offset); |
| 1635 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { | 1630 | if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { |
| @@ -1637,14 +1632,10 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 1637 | * We found the page, so try async readahead before | 1632 | * We found the page, so try async readahead before |
| 1638 | * waiting for the lock. | 1633 | * waiting for the lock. |
| 1639 | */ | 1634 | */ |
| 1640 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
| 1641 | do_async_mmap_readahead(vma, ra, file, page, offset); | 1635 | do_async_mmap_readahead(vma, ra, file, page, offset); |
| 1642 | mem_cgroup_toggle_oom(memcg_oom); | ||
| 1643 | } else if (!page) { | 1636 | } else if (!page) { |
| 1644 | /* No page in the page cache at all */ | 1637 | /* No page in the page cache at all */ |
| 1645 | memcg_oom = mem_cgroup_toggle_oom(false); | ||
| 1646 | do_sync_mmap_readahead(vma, ra, file, offset); | 1638 | do_sync_mmap_readahead(vma, ra, file, offset); |
| 1647 | mem_cgroup_toggle_oom(memcg_oom); | ||
| 1648 | count_vm_event(PGMAJFAULT); | 1639 | count_vm_event(PGMAJFAULT); |
| 1649 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); | 1640 | mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); |
| 1650 | ret = VM_FAULT_MAJOR; | 1641 | ret = VM_FAULT_MAJOR; |
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7489884682d8..610e3df2768a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c | |||
| @@ -2697,6 +2697,7 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | |||
| 2697 | 2697 | ||
| 2698 | mmun_start = haddr; | 2698 | mmun_start = haddr; |
| 2699 | mmun_end = haddr + HPAGE_PMD_SIZE; | 2699 | mmun_end = haddr + HPAGE_PMD_SIZE; |
| 2700 | again: | ||
| 2700 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); | 2701 | mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); |
| 2701 | spin_lock(&mm->page_table_lock); | 2702 | spin_lock(&mm->page_table_lock); |
| 2702 | if (unlikely(!pmd_trans_huge(*pmd))) { | 2703 | if (unlikely(!pmd_trans_huge(*pmd))) { |
| @@ -2719,7 +2720,14 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, | |||
| 2719 | split_huge_page(page); | 2720 | split_huge_page(page); |
| 2720 | 2721 | ||
| 2721 | put_page(page); | 2722 | put_page(page); |
| 2722 | BUG_ON(pmd_trans_huge(*pmd)); | 2723 | |
| 2724 | /* | ||
| 2725 | * We don't always have down_write of mmap_sem here: a racing | ||
| 2726 | * do_huge_pmd_wp_page() might have copied-on-write to another | ||
| 2727 | * huge page before our split_huge_page() got the anon_vma lock. | ||
| 2728 | */ | ||
| 2729 | if (unlikely(pmd_trans_huge(*pmd))) | ||
| 2730 | goto again; | ||
| 2723 | } | 2731 | } |
| 2724 | 2732 | ||
| 2725 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, | 2733 | void split_huge_page_pmd_mm(struct mm_struct *mm, unsigned long address, |
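The new comment spells out why the BUG_ON had to go: the huge pmd can legitimately reappear under us, so the code re-checks and retries instead of asserting. A tiny stand-alone illustration of that check-and-retry shape (the "condition" and its helper are invented):

```c
#include <stdbool.h>
#include <stdio.h>

static int attempts_left = 2;

/* Stand-in for split_huge_page(): the condition may persist because a
 * concurrent actor recreated it before we looked again. */
static bool try_clear_condition(void)
{
	return --attempts_left <= 0;
}

int main(void)
{
	int tries = 0;

again:
	tries++;
	if (!try_clear_condition()) {
		/* Instead of asserting the condition is gone (BUG_ON),
		 * go back and redo the work, as the hunk above does. */
		goto again;
	}
	printf("condition cleared after %d tries\n", tries);
	return 0;
}
```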
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b49579c7f2a5..0b7656e804d1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c | |||
| @@ -653,6 +653,7 @@ static void free_huge_page(struct page *page) | |||
| 653 | BUG_ON(page_count(page)); | 653 | BUG_ON(page_count(page)); |
| 654 | BUG_ON(page_mapcount(page)); | 654 | BUG_ON(page_mapcount(page)); |
| 655 | restore_reserve = PagePrivate(page); | 655 | restore_reserve = PagePrivate(page); |
| 656 | ClearPagePrivate(page); | ||
| 656 | 657 | ||
| 657 | spin_lock(&hugetlb_lock); | 658 | spin_lock(&hugetlb_lock); |
| 658 | hugetlb_cgroup_uncharge_page(hstate_index(h), | 659 | hugetlb_cgroup_uncharge_page(hstate_index(h), |
| @@ -695,8 +696,22 @@ static void prep_compound_gigantic_page(struct page *page, unsigned long order) | |||
| 695 | /* we rely on prep_new_huge_page to set the destructor */ | 696 | /* we rely on prep_new_huge_page to set the destructor */ |
| 696 | set_compound_order(page, order); | 697 | set_compound_order(page, order); |
| 697 | __SetPageHead(page); | 698 | __SetPageHead(page); |
| 699 | __ClearPageReserved(page); | ||
| 698 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { | 700 | for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { |
| 699 | __SetPageTail(p); | 701 | __SetPageTail(p); |
| 702 | /* | ||
| 703 | * For gigantic hugepages allocated through bootmem at | ||
| 704 | * boot, it's safer to be consistent with the not-gigantic | ||
| 705 | * hugepages and clear the PG_reserved bit from all tail pages | ||
| 706 | * too. Otherwse drivers using get_user_pages() to access tail | ||
| 707 | * pages may get the reference counting wrong if they see | ||
| 708 | * PG_reserved set on a tail page (despite the head page not | ||
| 709 | * having PG_reserved set). Enforcing this consistency between | ||
| 710 | * head and tail pages allows drivers to optimize away a check | ||
| 711 | * on the head page when they need know if put_page() is needed | ||
| 712 | * after get_user_pages(). | ||
| 713 | */ | ||
| 714 | __ClearPageReserved(p); | ||
| 700 | set_page_count(p, 0); | 715 | set_page_count(p, 0); |
| 701 | p->first_page = page; | 716 | p->first_page = page; |
| 702 | } | 717 | } |
| @@ -1329,9 +1344,9 @@ static void __init gather_bootmem_prealloc(void) | |||
| 1329 | #else | 1344 | #else |
| 1330 | page = virt_to_page(m); | 1345 | page = virt_to_page(m); |
| 1331 | #endif | 1346 | #endif |
| 1332 | __ClearPageReserved(page); | ||
| 1333 | WARN_ON(page_count(page) != 1); | 1347 | WARN_ON(page_count(page) != 1); |
| 1334 | prep_compound_huge_page(page, h->order); | 1348 | prep_compound_huge_page(page, h->order); |
| 1349 | WARN_ON(PageReserved(page)); | ||
| 1335 | prep_new_huge_page(h, page, page_to_nid(page)); | 1350 | prep_new_huge_page(h, page, page_to_nid(page)); |
| 1336 | /* | 1351 | /* |
| 1337 | * If we had gigantic hugepages allocated at boot time, we need | 1352 | * If we had gigantic hugepages allocated at boot time, we need |
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1c52ddbc839b..34d3ca9572d6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c | |||
| @@ -866,6 +866,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
| 866 | unsigned long val = 0; | 866 | unsigned long val = 0; |
| 867 | int cpu; | 867 | int cpu; |
| 868 | 868 | ||
| 869 | get_online_cpus(); | ||
| 869 | for_each_online_cpu(cpu) | 870 | for_each_online_cpu(cpu) |
| 870 | val += per_cpu(memcg->stat->events[idx], cpu); | 871 | val += per_cpu(memcg->stat->events[idx], cpu); |
| 871 | #ifdef CONFIG_HOTPLUG_CPU | 872 | #ifdef CONFIG_HOTPLUG_CPU |
| @@ -873,6 +874,7 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, | |||
| 873 | val += memcg->nocpu_base.events[idx]; | 874 | val += memcg->nocpu_base.events[idx]; |
| 874 | spin_unlock(&memcg->pcp_counter_lock); | 875 | spin_unlock(&memcg->pcp_counter_lock); |
| 875 | #endif | 876 | #endif |
| 877 | put_online_cpus(); | ||
| 876 | return val; | 878 | return val; |
| 877 | } | 879 | } |
| 878 | 880 | ||
| @@ -2159,110 +2161,59 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) | |||
| 2159 | memcg_wakeup_oom(memcg); | 2161 | memcg_wakeup_oom(memcg); |
| 2160 | } | 2162 | } |
| 2161 | 2163 | ||
| 2162 | /* | ||
| 2163 | * try to call OOM killer | ||
| 2164 | */ | ||
| 2165 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) | 2164 | static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) |
| 2166 | { | 2165 | { |
| 2167 | bool locked; | ||
| 2168 | int wakeups; | ||
| 2169 | |||
| 2170 | if (!current->memcg_oom.may_oom) | 2166 | if (!current->memcg_oom.may_oom) |
| 2171 | return; | 2167 | return; |
| 2172 | |||
| 2173 | current->memcg_oom.in_memcg_oom = 1; | ||
| 2174 | |||
| 2175 | /* | 2168 | /* |
| 2176 | * As with any blocking lock, a contender needs to start | 2169 | * We are in the middle of the charge context here, so we |
| 2177 | * listening for wakeups before attempting the trylock, | 2170 | * don't want to block when potentially sitting on a callstack |
| 2178 | * otherwise it can miss the wakeup from the unlock and sleep | 2171 | * that holds all kinds of filesystem and mm locks. |
| 2179 | * indefinitely. This is just open-coded because our locking | 2172 | * |
| 2180 | * is so particular to memcg hierarchies. | 2173 | * Also, the caller may handle a failed allocation gracefully |
| 2174 | * (like optional page cache readahead) and so an OOM killer | ||
| 2175 | * invocation might not even be necessary. | ||
| 2176 | * | ||
| 2177 | * That's why we don't do anything here except remember the | ||
| 2178 | * OOM context and then deal with it at the end of the page | ||
| 2179 | * fault when the stack is unwound, the locks are released, | ||
| 2180 | * and when we know whether the fault was overall successful. | ||
| 2181 | */ | 2181 | */ |
| 2182 | wakeups = atomic_read(&memcg->oom_wakeups); | 2182 | css_get(&memcg->css); |
| 2183 | mem_cgroup_mark_under_oom(memcg); | 2183 | current->memcg_oom.memcg = memcg; |
| 2184 | 2184 | current->memcg_oom.gfp_mask = mask; | |
| 2185 | locked = mem_cgroup_oom_trylock(memcg); | 2185 | current->memcg_oom.order = order; |
| 2186 | |||
| 2187 | if (locked) | ||
| 2188 | mem_cgroup_oom_notify(memcg); | ||
| 2189 | |||
| 2190 | if (locked && !memcg->oom_kill_disable) { | ||
| 2191 | mem_cgroup_unmark_under_oom(memcg); | ||
| 2192 | mem_cgroup_out_of_memory(memcg, mask, order); | ||
| 2193 | mem_cgroup_oom_unlock(memcg); | ||
| 2194 | /* | ||
| 2195 | * There is no guarantee that an OOM-lock contender | ||
| 2196 | * sees the wakeups triggered by the OOM kill | ||
| 2197 | * uncharges. Wake any sleepers explicitely. | ||
| 2198 | */ | ||
| 2199 | memcg_oom_recover(memcg); | ||
| 2200 | } else { | ||
| 2201 | /* | ||
| 2202 | * A system call can just return -ENOMEM, but if this | ||
| 2203 | * is a page fault and somebody else is handling the | ||
| 2204 | * OOM already, we need to sleep on the OOM waitqueue | ||
| 2205 | * for this memcg until the situation is resolved. | ||
| 2206 | * Which can take some time because it might be | ||
| 2207 | * handled by a userspace task. | ||
| 2208 | * | ||
| 2209 | * However, this is the charge context, which means | ||
| 2210 | * that we may sit on a large call stack and hold | ||
| 2211 | * various filesystem locks, the mmap_sem etc. and we | ||
| 2212 | * don't want the OOM handler to deadlock on them | ||
| 2213 | * while we sit here and wait. Store the current OOM | ||
| 2214 | * context in the task_struct, then return -ENOMEM. | ||
| 2215 | * At the end of the page fault handler, with the | ||
| 2216 | * stack unwound, pagefault_out_of_memory() will check | ||
| 2217 | * back with us by calling | ||
| 2218 | * mem_cgroup_oom_synchronize(), possibly putting the | ||
| 2219 | * task to sleep. | ||
| 2220 | */ | ||
| 2221 | current->memcg_oom.oom_locked = locked; | ||
| 2222 | current->memcg_oom.wakeups = wakeups; | ||
| 2223 | css_get(&memcg->css); | ||
| 2224 | current->memcg_oom.wait_on_memcg = memcg; | ||
| 2225 | } | ||
| 2226 | } | 2186 | } |
| 2227 | 2187 | ||
| 2228 | /** | 2188 | /** |
| 2229 | * mem_cgroup_oom_synchronize - complete memcg OOM handling | 2189 | * mem_cgroup_oom_synchronize - complete memcg OOM handling |
| 2190 | * @handle: actually kill/wait or just clean up the OOM state | ||
| 2230 | * | 2191 | * |
| 2231 | * This has to be called at the end of a page fault if the memcg | 2192 | * This has to be called at the end of a page fault if the memcg OOM |
| 2232 | * OOM handler was enabled and the fault is returning %VM_FAULT_OOM. | 2193 | * handler was enabled. |
| 2233 | * | 2194 | * |
| 2234 | * Memcg supports userspace OOM handling, so failed allocations must | 2195 | * Memcg supports userspace OOM handling where failed allocations must |
| 2235 | * sleep on a waitqueue until the userspace task resolves the | 2196 | * sleep on a waitqueue until the userspace task resolves the |
| 2236 | * situation. Sleeping directly in the charge context with all kinds | 2197 | * situation. Sleeping directly in the charge context with all kinds |
| 2237 | * of locks held is not a good idea, instead we remember an OOM state | 2198 | * of locks held is not a good idea, instead we remember an OOM state |
| 2238 | * in the task and mem_cgroup_oom_synchronize() has to be called at | 2199 | * in the task and mem_cgroup_oom_synchronize() has to be called at |
| 2239 | * the end of the page fault to put the task to sleep and clean up the | 2200 | * the end of the page fault to complete the OOM handling. |
| 2240 | * OOM state. | ||
| 2241 | * | 2201 | * |
| 2242 | * Returns %true if an ongoing memcg OOM situation was detected and | 2202 | * Returns %true if an ongoing memcg OOM situation was detected and |
| 2243 | * finalized, %false otherwise. | 2203 | * completed, %false otherwise. |
| 2244 | */ | 2204 | */ |
| 2245 | bool mem_cgroup_oom_synchronize(void) | 2205 | bool mem_cgroup_oom_synchronize(bool handle) |
| 2246 | { | 2206 | { |
| 2207 | struct mem_cgroup *memcg = current->memcg_oom.memcg; | ||
| 2247 | struct oom_wait_info owait; | 2208 | struct oom_wait_info owait; |
| 2248 | struct mem_cgroup *memcg; | 2209 | bool locked; |
| 2249 | 2210 | ||
| 2250 | /* OOM is global, do not handle */ | 2211 | /* OOM is global, do not handle */ |
| 2251 | if (!current->memcg_oom.in_memcg_oom) | ||
| 2252 | return false; | ||
| 2253 | |||
| 2254 | /* | ||
| 2255 | * We invoked the OOM killer but there is a chance that a kill | ||
| 2256 | * did not free up any charges. Everybody else might already | ||
| 2257 | * be sleeping, so restart the fault and keep the rampage | ||
| 2258 | * going until some charges are released. | ||
| 2259 | */ | ||
| 2260 | memcg = current->memcg_oom.wait_on_memcg; | ||
| 2261 | if (!memcg) | 2212 | if (!memcg) |
| 2262 | goto out; | 2213 | return false; |
| 2263 | 2214 | ||
| 2264 | if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) | 2215 | if (!handle) |
| 2265 | goto out_memcg; | 2216 | goto cleanup; |
| 2266 | 2217 | ||
| 2267 | owait.memcg = memcg; | 2218 | owait.memcg = memcg; |
| 2268 | owait.wait.flags = 0; | 2219 | owait.wait.flags = 0; |
| @@ -2271,13 +2222,25 @@ bool mem_cgroup_oom_synchronize(void) | |||
| 2271 | INIT_LIST_HEAD(&owait.wait.task_list); | 2222 | INIT_LIST_HEAD(&owait.wait.task_list); |
| 2272 | 2223 | ||
| 2273 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); | 2224 | prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); |
| 2274 | /* Only sleep if we didn't miss any wakeups since OOM */ | 2225 | mem_cgroup_mark_under_oom(memcg); |
| 2275 | if (atomic_read(&memcg->oom_wakeups) == current->memcg_oom.wakeups) | 2226 | |
| 2227 | locked = mem_cgroup_oom_trylock(memcg); | ||
| 2228 | |||
| 2229 | if (locked) | ||
| 2230 | mem_cgroup_oom_notify(memcg); | ||
| 2231 | |||
| 2232 | if (locked && !memcg->oom_kill_disable) { | ||
| 2233 | mem_cgroup_unmark_under_oom(memcg); | ||
| 2234 | finish_wait(&memcg_oom_waitq, &owait.wait); | ||
| 2235 | mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, | ||
| 2236 | current->memcg_oom.order); | ||
| 2237 | } else { | ||
| 2276 | schedule(); | 2238 | schedule(); |
| 2277 | finish_wait(&memcg_oom_waitq, &owait.wait); | 2239 | mem_cgroup_unmark_under_oom(memcg); |
| 2278 | out_memcg: | 2240 | finish_wait(&memcg_oom_waitq, &owait.wait); |
| 2279 | mem_cgroup_unmark_under_oom(memcg); | 2241 | } |
| 2280 | if (current->memcg_oom.oom_locked) { | 2242 | |
| 2243 | if (locked) { | ||
| 2281 | mem_cgroup_oom_unlock(memcg); | 2244 | mem_cgroup_oom_unlock(memcg); |
| 2282 | /* | 2245 | /* |
| 2283 | * There is no guarantee that an OOM-lock contender | 2246 | * There is no guarantee that an OOM-lock contender |
| @@ -2286,10 +2249,9 @@ out_memcg: | |||
| 2286 | */ | 2249 | */ |
| 2287 | memcg_oom_recover(memcg); | 2250 | memcg_oom_recover(memcg); |
| 2288 | } | 2251 | } |
| 2252 | cleanup: | ||
| 2253 | current->memcg_oom.memcg = NULL; | ||
| 2289 | css_put(&memcg->css); | 2254 | css_put(&memcg->css); |
| 2290 | current->memcg_oom.wait_on_memcg = NULL; | ||
| 2291 | out: | ||
| 2292 | current->memcg_oom.in_memcg_oom = 0; | ||
| 2293 | return true; | 2255 | return true; |
| 2294 | } | 2256 | } |
| 2295 | 2257 | ||
| @@ -2703,6 +2665,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, | |||
| 2703 | || fatal_signal_pending(current))) | 2665 | || fatal_signal_pending(current))) |
| 2704 | goto bypass; | 2666 | goto bypass; |
| 2705 | 2667 | ||
| 2668 | if (unlikely(task_in_memcg_oom(current))) | ||
| 2669 | goto bypass; | ||
| 2670 | |||
| 2706 | /* | 2671 | /* |
| 2707 | * We always charge the cgroup the mm_struct belongs to. | 2672 | * We always charge the cgroup the mm_struct belongs to. |
| 2708 | * The mm_struct's mem_cgroup changes on task migration if the | 2673 | * The mm_struct's mem_cgroup changes on task migration if the |
| @@ -2801,6 +2766,8 @@ done: | |||
| 2801 | return 0; | 2766 | return 0; |
| 2802 | nomem: | 2767 | nomem: |
| 2803 | *ptr = NULL; | 2768 | *ptr = NULL; |
| 2769 | if (gfp_mask & __GFP_NOFAIL) | ||
| 2770 | return 0; | ||
| 2804 | return -ENOMEM; | 2771 | return -ENOMEM; |
| 2805 | bypass: | 2772 | bypass: |
| 2806 | *ptr = root_mem_cgroup; | 2773 | *ptr = root_mem_cgroup; |
diff --git a/mm/memory.c b/mm/memory.c index ca0003947115..1311f26497e6 100644 --- a/mm/memory.c +++ b/mm/memory.c | |||
| @@ -837,6 +837,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, | |||
| 837 | */ | 837 | */ |
| 838 | make_migration_entry_read(&entry); | 838 | make_migration_entry_read(&entry); |
| 839 | pte = swp_entry_to_pte(entry); | 839 | pte = swp_entry_to_pte(entry); |
| 840 | if (pte_swp_soft_dirty(*src_pte)) | ||
| 841 | pte = pte_swp_mksoft_dirty(pte); | ||
| 840 | set_pte_at(src_mm, addr, src_pte, pte); | 842 | set_pte_at(src_mm, addr, src_pte, pte); |
| 841 | } | 843 | } |
| 842 | } | 844 | } |
| @@ -3863,15 +3865,21 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 3863 | * space. Kernel faults are handled more gracefully. | 3865 | * space. Kernel faults are handled more gracefully. |
| 3864 | */ | 3866 | */ |
| 3865 | if (flags & FAULT_FLAG_USER) | 3867 | if (flags & FAULT_FLAG_USER) |
| 3866 | mem_cgroup_enable_oom(); | 3868 | mem_cgroup_oom_enable(); |
| 3867 | 3869 | ||
| 3868 | ret = __handle_mm_fault(mm, vma, address, flags); | 3870 | ret = __handle_mm_fault(mm, vma, address, flags); |
| 3869 | 3871 | ||
| 3870 | if (flags & FAULT_FLAG_USER) | 3872 | if (flags & FAULT_FLAG_USER) { |
| 3871 | mem_cgroup_disable_oom(); | 3873 | mem_cgroup_oom_disable(); |
| 3872 | 3874 | /* | |
| 3873 | if (WARN_ON(task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM))) | 3875 | * The task may have entered a memcg OOM situation but |
| 3874 | mem_cgroup_oom_synchronize(); | 3876 | * if the allocation error was handled gracefully (no |
| 3877 | * VM_FAULT_OOM), there is no need to kill anything. | ||
| 3878 | * Just clean up the OOM state peacefully. | ||
| 3879 | */ | ||
| 3880 | if (task_in_memcg_oom(current) && !(ret & VM_FAULT_OOM)) | ||
| 3881 | mem_cgroup_oom_synchronize(false); | ||
| 3882 | } | ||
| 3875 | 3883 | ||
| 3876 | return ret; | 3884 | return ret; |
| 3877 | } | 3885 | } |
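Together with the memcontrol.c rework earlier in this merge, the call site now follows a record-now, act-later shape: the charge path only notes the OOM context, and the outermost handler decides what, if anything, still needs doing. The sketch below mimics that control flow in user space with invented stand-ins (oom_enable, charge, oom_synchronize) for the kernel functions:

```c
#include <stdbool.h>
#include <stdio.h>

/* Per-task OOM note, loosely analogous to current->memcg_oom. */
static struct {
	bool armed;
	bool pending;
} oom_state;

static void oom_enable(void)  { oom_state.armed = true; }
static void oom_disable(void) { oom_state.armed = false; }

/* Deep in the "charge" path: never block here, just remember. */
static void charge(bool fails)
{
	if (fails && oom_state.armed)
		oom_state.pending = true;
}

/* At the end of the fault: either handle the OOM or just clear it. */
static void oom_synchronize(bool handle)
{
	if (!oom_state.pending)
		return;
	if (handle)
		printf("handling OOM with the stack unwound, no locks held\n");
	else
		printf("failure was absorbed, clearing the recorded state\n");
	oom_state.pending = false;
}

int main(void)
{
	oom_enable();
	charge(true);		/* a failing charge deep in the fault path */
	oom_disable();

	/* handle_mm_fault() analogue: the fault did not end in VM_FAULT_OOM,
	 * so only clean up; otherwise pagefault_out_of_memory() would later
	 * call the handle=true variant. */
	oom_synchronize(false);
	return 0;
}
```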
diff --git a/mm/migrate.c b/mm/migrate.c index a26bccd44ccb..7a7325ee1d08 100644 --- a/mm/migrate.c +++ b/mm/migrate.c | |||
| @@ -161,6 +161,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, | |||
| 161 | 161 | ||
| 162 | get_page(new); | 162 | get_page(new); |
| 163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); | 163 | pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); |
| 164 | if (pte_swp_soft_dirty(*ptep)) | ||
| 165 | pte = pte_mksoft_dirty(pte); | ||
| 164 | if (is_write_migration_entry(entry)) | 166 | if (is_write_migration_entry(entry)) |
| 165 | pte = pte_mkwrite(pte); | 167 | pte = pte_mkwrite(pte); |
| 166 | #ifdef CONFIG_HUGETLB_PAGE | 168 | #ifdef CONFIG_HUGETLB_PAGE |
diff --git a/mm/mprotect.c b/mm/mprotect.c index 94722a4d6b43..a3af058f68e4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c | |||
| @@ -94,13 +94,16 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, | |||
| 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); | 94 | swp_entry_t entry = pte_to_swp_entry(oldpte); |
| 95 | 95 | ||
| 96 | if (is_write_migration_entry(entry)) { | 96 | if (is_write_migration_entry(entry)) { |
| 97 | pte_t newpte; | ||
| 97 | /* | 98 | /* |
| 98 | * A protection check is difficult so | 99 | * A protection check is difficult so |
| 99 | * just be safe and disable write | 100 | * just be safe and disable write |
| 100 | */ | 101 | */ |
| 101 | make_migration_entry_read(&entry); | 102 | make_migration_entry_read(&entry); |
| 102 | set_pte_at(mm, addr, pte, | 103 | newpte = swp_entry_to_pte(entry); |
| 103 | swp_entry_to_pte(entry)); | 104 | if (pte_swp_soft_dirty(oldpte)) |
| 105 | newpte = pte_swp_mksoft_dirty(newpte); | ||
| 106 | set_pte_at(mm, addr, pte, newpte); | ||
| 104 | } | 107 | } |
| 105 | pages++; | 108 | pages++; |
| 106 | } | 109 | } |
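The mm/memory.c, mm/migrate.c and mm/mprotect.c hunks fix the same class of bug: when a pte is rewritten as, or restored from, a swap/migration entry, the soft-dirty bit must be carried over explicitly or a write is lost to the tracker. The bit-twiddling sketch below illustrates that carry-over with a made-up flag value, not the real pte encoding:

```c
#include <stdint.h>
#include <stdio.h>

/* Made-up flag layout purely for illustration; the real encodings live in
 * the architecture's pte helpers. */
#define FAKE_SOFT_DIRTY (1u << 5)

static uint32_t make_readonly_entry(uint32_t old_entry)
{
	/* Rebuilding the entry from scratch drops every old flag... */
	uint32_t new_entry = 0x1;	/* "read-only migration entry" */

	/* ...so the soft-dirty bit must be carried over explicitly, which
	 * is what the pte_swp_mksoft_dirty()/pte_mksoft_dirty() calls add. */
	if (old_entry & FAKE_SOFT_DIRTY)
		new_entry |= FAKE_SOFT_DIRTY;
	return new_entry;
}

int main(void)
{
	uint32_t old_entry = 0x3 | FAKE_SOFT_DIRTY;

	printf("old %#x -> new %#x (soft-dirty preserved)\n",
	       old_entry, make_readonly_entry(old_entry));
	return 0;
}
```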
diff --git a/mm/mremap.c b/mm/mremap.c index 91b13d6a16d4..0843feb66f3d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c | |||
| @@ -25,7 +25,6 @@ | |||
| 25 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
| 26 | #include <asm/cacheflush.h> | 26 | #include <asm/cacheflush.h> |
| 27 | #include <asm/tlbflush.h> | 27 | #include <asm/tlbflush.h> |
| 28 | #include <asm/pgalloc.h> | ||
| 29 | 28 | ||
| 30 | #include "internal.h" | 29 | #include "internal.h" |
| 31 | 30 | ||
| @@ -63,10 +62,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, | |||
| 63 | return NULL; | 62 | return NULL; |
| 64 | 63 | ||
| 65 | pmd = pmd_alloc(mm, pud, addr); | 64 | pmd = pmd_alloc(mm, pud, addr); |
| 66 | if (!pmd) { | 65 | if (!pmd) |
| 67 | pud_free(mm, pud); | ||
| 68 | return NULL; | 66 | return NULL; |
| 69 | } | ||
| 70 | 67 | ||
| 71 | VM_BUG_ON(pmd_trans_huge(*pmd)); | 68 | VM_BUG_ON(pmd_trans_huge(*pmd)); |
| 72 | 69 | ||
diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 314e9d274381..6738c47f1f72 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c | |||
| @@ -680,7 +680,7 @@ void pagefault_out_of_memory(void) | |||
| 680 | { | 680 | { |
| 681 | struct zonelist *zonelist; | 681 | struct zonelist *zonelist; |
| 682 | 682 | ||
| 683 | if (mem_cgroup_oom_synchronize()) | 683 | if (mem_cgroup_oom_synchronize(true)) |
| 684 | return; | 684 | return; |
| 685 | 685 | ||
| 686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); | 686 | zonelist = node_zonelist(first_online_node, GFP_KERNEL); |
diff --git a/mm/page-writeback.c b/mm/page-writeback.c index f5236f804aa6..63807583d8e8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c | |||
| @@ -1210,11 +1210,11 @@ static unsigned long dirty_poll_interval(unsigned long dirty, | |||
| 1210 | return 1; | 1210 | return 1; |
| 1211 | } | 1211 | } |
| 1212 | 1212 | ||
| 1213 | static long bdi_max_pause(struct backing_dev_info *bdi, | 1213 | static unsigned long bdi_max_pause(struct backing_dev_info *bdi, |
| 1214 | unsigned long bdi_dirty) | 1214 | unsigned long bdi_dirty) |
| 1215 | { | 1215 | { |
| 1216 | long bw = bdi->avg_write_bandwidth; | 1216 | unsigned long bw = bdi->avg_write_bandwidth; |
| 1217 | long t; | 1217 | unsigned long t; |
| 1218 | 1218 | ||
| 1219 | /* | 1219 | /* |
| 1220 | * Limit pause time for small memory systems. If sleeping for too long | 1220 | * Limit pause time for small memory systems. If sleeping for too long |
| @@ -1226,7 +1226,7 @@ static long bdi_max_pause(struct backing_dev_info *bdi, | |||
| 1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); | 1226 | t = bdi_dirty / (1 + bw / roundup_pow_of_two(1 + HZ / 8)); |
| 1227 | t++; | 1227 | t++; |
| 1228 | 1228 | ||
| 1229 | return min_t(long, t, MAX_PAUSE); | 1229 | return min_t(unsigned long, t, MAX_PAUSE); |
| 1230 | } | 1230 | } |
| 1231 | 1231 | ||
| 1232 | static long bdi_min_pause(struct backing_dev_info *bdi, | 1232 | static long bdi_min_pause(struct backing_dev_info *bdi, |
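The signature change above exists because a signed intermediate can end up negative, and min_t(long, t, MAX_PAUSE) then returns that negative value as the "maximum pause". The contrived demo below shows only the comparison half of the problem; the constants are invented and not the kernel's:

```c
#include <stdio.h>

#define MAX_PAUSE 25	/* illustrative cap, not the kernel's value */

/* A signed intermediate gone negative sails straight through a signed
 * min(), producing a negative maximum pause. */
static long signed_max_pause(long t)
{
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

/* With unsigned types the value cannot be negative; the clamp bounds it. */
static unsigned long unsigned_max_pause(unsigned long t)
{
	return t < MAX_PAUSE ? t : MAX_PAUSE;
}

int main(void)
{
	long bogus = -7;

	printf("signed   max pause: %ld\n", signed_max_pause(bogus));
	printf("unsigned max pause: %lu\n",
	       unsigned_max_pause((unsigned long)bogus));
	return 0;
}
```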
diff --git a/mm/swapfile.c b/mm/swapfile.c index 3963fc24fcc1..de7c904e52e5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c | |||
| @@ -1824,6 +1824,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1824 | struct filename *pathname; | 1824 | struct filename *pathname; |
| 1825 | int i, type, prev; | 1825 | int i, type, prev; |
| 1826 | int err; | 1826 | int err; |
| 1827 | unsigned int old_block_size; | ||
| 1827 | 1828 | ||
| 1828 | if (!capable(CAP_SYS_ADMIN)) | 1829 | if (!capable(CAP_SYS_ADMIN)) |
| 1829 | return -EPERM; | 1830 | return -EPERM; |
| @@ -1914,6 +1915,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1914 | } | 1915 | } |
| 1915 | 1916 | ||
| 1916 | swap_file = p->swap_file; | 1917 | swap_file = p->swap_file; |
| 1918 | old_block_size = p->old_block_size; | ||
| 1917 | p->swap_file = NULL; | 1919 | p->swap_file = NULL; |
| 1918 | p->max = 0; | 1920 | p->max = 0; |
| 1919 | swap_map = p->swap_map; | 1921 | swap_map = p->swap_map; |
| @@ -1938,7 +1940,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) | |||
| 1938 | inode = mapping->host; | 1940 | inode = mapping->host; |
| 1939 | if (S_ISBLK(inode->i_mode)) { | 1941 | if (S_ISBLK(inode->i_mode)) { |
| 1940 | struct block_device *bdev = I_BDEV(inode); | 1942 | struct block_device *bdev = I_BDEV(inode); |
| 1941 | set_blocksize(bdev, p->old_block_size); | 1943 | set_blocksize(bdev, old_block_size); |
| 1942 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); | 1944 | blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); |
| 1943 | } else { | 1945 | } else { |
| 1944 | mutex_lock(&inode->i_mutex); | 1946 | mutex_lock(&inode->i_mutex); |
diff --git a/mm/vmscan.c b/mm/vmscan.c index 53f2f82f83ae..eea668d9cff6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c | |||
| @@ -211,6 +211,7 @@ void unregister_shrinker(struct shrinker *shrinker) | |||
| 211 | down_write(&shrinker_rwsem); | 211 | down_write(&shrinker_rwsem); |
| 212 | list_del(&shrinker->list); | 212 | list_del(&shrinker->list); |
| 213 | up_write(&shrinker_rwsem); | 213 | up_write(&shrinker_rwsem); |
| 214 | kfree(shrinker->nr_deferred); | ||
| 214 | } | 215 | } |
| 215 | EXPORT_SYMBOL(unregister_shrinker); | 216 | EXPORT_SYMBOL(unregister_shrinker); |
| 216 | 217 | ||
diff --git a/mm/zswap.c b/mm/zswap.c index 841e35f1db22..d93510c6aa2d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c | |||
| @@ -804,6 +804,10 @@ static void zswap_frontswap_invalidate_area(unsigned type) | |||
| 804 | } | 804 | } |
| 805 | tree->rbroot = RB_ROOT; | 805 | tree->rbroot = RB_ROOT; |
| 806 | spin_unlock(&tree->lock); | 806 | spin_unlock(&tree->lock); |
| 807 | |||
| 808 | zbud_destroy_pool(tree->pool); | ||
| 809 | kfree(tree); | ||
| 810 | zswap_trees[type] = NULL; | ||
| 807 | } | 811 | } |
| 808 | 812 | ||
| 809 | static struct zbud_ops zswap_zbud_ops = { | 813 | static struct zbud_ops zswap_zbud_ops = { |
diff --git a/tools/testing/selftests/timers/posix_timers.c b/tools/testing/selftests/timers/posix_timers.c index 4fa655d68a81..41bd85559d4b 100644 --- a/tools/testing/selftests/timers/posix_timers.c +++ b/tools/testing/selftests/timers/posix_timers.c | |||
| @@ -151,7 +151,7 @@ static int check_timer_create(int which) | |||
| 151 | fflush(stdout); | 151 | fflush(stdout); |
| 152 | 152 | ||
| 153 | done = 0; | 153 | done = 0; |
| 154 | timer_create(which, NULL, &id); | 154 | err = timer_create(which, NULL, &id); |
| 155 | if (err < 0) { | 155 | if (err < 0) { |
| 156 | perror("Can't create timer\n"); | 156 | perror("Can't create timer\n"); |
| 157 | return -1; | 157 | return -1; |
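The selftest fix is simply about not ignoring timer_create()'s return value, since the previously uninitialized err made the error check meaningless. A minimal stand-alone version of the corrected sequence (link with -lrt on older glibc):

```c
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	timer_t id;
	/* The return value must feed the error test, as in the fix above. */
	int err = timer_create(CLOCK_MONOTONIC, NULL, &id);

	if (err < 0) {
		perror("Can't create timer");
		return -1;
	}

	printf("timer created\n");
	timer_delete(id);
	return 0;
}
```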
