diff options
author | Konstantin Khlebnikov <khlebnikov@openvz.org> | 2012-10-08 19:28:54 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-10-09 03:22:18 -0400 |
commit | e9714acf8c439688884234dcac2bfc38bb607d38 (patch) | |
tree | 2e21c88f855a9f5168a143fa9948141140ff02a2 | |
parent | 2dd8ad81e31d0d36a5d448329c646ab43eb17788 (diff) |
mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas
Currently the kernel sets mm->exe_file during sys_execve() and then tracks
the number of vmas with the VM_EXECUTABLE flag in mm->num_exe_file_vmas; as
soon as this counter drops to zero, the kernel resets mm->exe_file to NULL.
It also resets mm->exe_file at the last mmput(), when mm->mm_users drops to zero.
A vma with the VM_EXECUTABLE flag appears after mapping a file with the
MAP_EXECUTABLE flag. Such vmas can appear only at sys_execve() or after vma
splitting, because sys_mmap ignores this flag. Usually the binfmt module sets
mm->exe_file and mmaps executable vmas with this file; they hold
mm->exe_file while the task is running.
The comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"),
where all this stuff was introduced:
> The kernel implements readlink of /proc/pid/exe by getting the file from
> the first executable VMA. Then the path to the file is reconstructed and
> reported as the result.
>
> Because of the VMA walk the code is slightly different on nommu systems.
> This patch avoids separate /proc/pid/exe code on nommu systems. Instead of
> walking the VMAs to find the first executable file-backed VMA we store a
> reference to the exec'd file in the mm_struct.
>
> That reference would prevent the filesystem holding the executable file
> from being unmounted even after unmapping the VMAs. So we track the number
> of VM_EXECUTABLE VMAs and drop the new reference when the last one is
> unmapped. This avoids pinning the mounted filesystem.
exe_file's vma accounting is hooked into every file mmap/unmap and vma
split/merge just to fix a hypothetical case: an mm that has already unmapped
all its executable files but is still alive, pinning the filesystem and
preventing it from being unmounted.
It seems that currently nobody depends on this behaviour. We can try to
remove this logic and keep mm->exe_file until the final mmput().
mm->exe_file is still protected by mm->mmap_sem, because we want to
change it via the new sys_prctl(PR_SET_MM_EXE_FILE). Also, via this syscall
a task can change its mm->exe_file and unpin the mountpoint explicitly.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | include/linux/mm.h | 4 | ||||
-rw-r--r-- | include/linux/mm_types.h | 1 | ||||
-rw-r--r-- | include/linux/mman.h | 1 | ||||
-rw-r--r-- | kernel/fork.c | 21 | ||||
-rw-r--r-- | mm/mmap.c | 25 | ||||
-rw-r--r-- | mm/nommu.c | 11 |
6 files changed, 5 insertions, 58 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h index 44d3fc25f556..25ef49c1f2bd 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h | |||
@@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp); | |||
87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ | 87 | #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ |
88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ | 88 | #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ |
89 | 89 | ||
90 | #define VM_EXECUTABLE 0x00001000 | ||
91 | #define VM_LOCKED 0x00002000 | 90 | #define VM_LOCKED 0x00002000 |
92 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ | 91 | #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ |
93 | 92 | ||
@@ -1396,9 +1395,6 @@ extern void exit_mmap(struct mm_struct *); | |||
1396 | extern int mm_take_all_locks(struct mm_struct *mm); | 1395 | extern int mm_take_all_locks(struct mm_struct *mm); |
1397 | extern void mm_drop_all_locks(struct mm_struct *mm); | 1396 | extern void mm_drop_all_locks(struct mm_struct *mm); |
1398 | 1397 | ||
1399 | /* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */ | ||
1400 | extern void added_exe_file_vma(struct mm_struct *mm); | ||
1401 | extern void removed_exe_file_vma(struct mm_struct *mm); | ||
1402 | extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); | 1398 | extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); |
1403 | extern struct file *get_mm_exe_file(struct mm_struct *mm); | 1399 | extern struct file *get_mm_exe_file(struct mm_struct *mm); |
1404 | 1400 | ||
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bf7867200b95..58d3173eb365 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h | |||
@@ -394,7 +394,6 @@ struct mm_struct { | |||
394 | 394 | ||
395 | /* store ref to file /proc/<pid>/exe symlink points to */ | 395 | /* store ref to file /proc/<pid>/exe symlink points to */ |
396 | struct file *exe_file; | 396 | struct file *exe_file; |
397 | unsigned long num_exe_file_vmas; | ||
398 | #ifdef CONFIG_MMU_NOTIFIER | 397 | #ifdef CONFIG_MMU_NOTIFIER |
399 | struct mmu_notifier_mm *mmu_notifier_mm; | 398 | struct mmu_notifier_mm *mmu_notifier_mm; |
400 | #endif | 399 | #endif |
diff --git a/include/linux/mman.h b/include/linux/mman.h index 8b74e9b1d0ad..77cec2f45cb7 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h | |||
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags) | |||
86 | { | 86 | { |
87 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | | 87 | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | |
88 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | | 88 | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | |
89 | _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) | | ||
90 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); | 89 | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); |
91 | } | 90 | } |
92 | #endif /* __KERNEL__ */ | 91 | #endif /* __KERNEL__ */ |
diff --git a/kernel/fork.c b/kernel/fork.c index a57a993681ed..ec667f797af3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -622,26 +622,6 @@ void mmput(struct mm_struct *mm) | |||
622 | } | 622 | } |
623 | EXPORT_SYMBOL_GPL(mmput); | 623 | EXPORT_SYMBOL_GPL(mmput); |
624 | 624 | ||
625 | /* | ||
626 | * We added or removed a vma mapping the executable. The vmas are only mapped | ||
627 | * during exec and are not mapped with the mmap system call. | ||
628 | * Callers must hold down_write() on the mm's mmap_sem for these | ||
629 | */ | ||
630 | void added_exe_file_vma(struct mm_struct *mm) | ||
631 | { | ||
632 | mm->num_exe_file_vmas++; | ||
633 | } | ||
634 | |||
635 | void removed_exe_file_vma(struct mm_struct *mm) | ||
636 | { | ||
637 | mm->num_exe_file_vmas--; | ||
638 | if ((mm->num_exe_file_vmas == 0) && mm->exe_file) { | ||
639 | fput(mm->exe_file); | ||
640 | mm->exe_file = NULL; | ||
641 | } | ||
642 | |||
643 | } | ||
644 | |||
645 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | 625 | void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) |
646 | { | 626 | { |
647 | if (new_exe_file) | 627 | if (new_exe_file) |
@@ -649,7 +629,6 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) | |||
649 | if (mm->exe_file) | 629 | if (mm->exe_file) |
650 | fput(mm->exe_file); | 630 | fput(mm->exe_file); |
651 | mm->exe_file = new_exe_file; | 631 | mm->exe_file = new_exe_file; |
652 | mm->num_exe_file_vmas = 0; | ||
653 | } | 632 | } |
654 | 633 | ||
655 | struct file *get_mm_exe_file(struct mm_struct *mm) | 634 | struct file *get_mm_exe_file(struct mm_struct *mm) |
@@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) | |||
231 | might_sleep(); | 231 | might_sleep(); |
232 | if (vma->vm_ops && vma->vm_ops->close) | 232 | if (vma->vm_ops && vma->vm_ops->close) |
233 | vma->vm_ops->close(vma); | 233 | vma->vm_ops->close(vma); |
234 | if (vma->vm_file) { | 234 | if (vma->vm_file) |
235 | fput(vma->vm_file); | 235 | fput(vma->vm_file); |
236 | if (vma->vm_flags & VM_EXECUTABLE) | ||
237 | removed_exe_file_vma(vma->vm_mm); | ||
238 | } | ||
239 | mpol_put(vma_policy(vma)); | 236 | mpol_put(vma_policy(vma)); |
240 | kmem_cache_free(vm_area_cachep, vma); | 237 | kmem_cache_free(vm_area_cachep, vma); |
241 | return next; | 238 | return next; |
@@ -636,8 +633,6 @@ again: remove_next = 1 + (end > next->vm_end); | |||
636 | if (file) { | 633 | if (file) { |
637 | uprobe_munmap(next, next->vm_start, next->vm_end); | 634 | uprobe_munmap(next, next->vm_start, next->vm_end); |
638 | fput(file); | 635 | fput(file); |
639 | if (next->vm_flags & VM_EXECUTABLE) | ||
640 | removed_exe_file_vma(mm); | ||
641 | } | 636 | } |
642 | if (next->anon_vma) | 637 | if (next->anon_vma) |
643 | anon_vma_merge(vma, next); | 638 | anon_vma_merge(vma, next); |
@@ -1304,8 +1299,6 @@ munmap_back: | |||
1304 | error = file->f_op->mmap(file, vma); | 1299 | error = file->f_op->mmap(file, vma); |
1305 | if (error) | 1300 | if (error) |
1306 | goto unmap_and_free_vma; | 1301 | goto unmap_and_free_vma; |
1307 | if (vm_flags & VM_EXECUTABLE) | ||
1308 | added_exe_file_vma(mm); | ||
1309 | 1302 | ||
1310 | /* Can addr have changed?? | 1303 | /* Can addr have changed?? |
1311 | * | 1304 | * |
@@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
1987 | if (anon_vma_clone(new, vma)) | 1980 | if (anon_vma_clone(new, vma)) |
1988 | goto out_free_mpol; | 1981 | goto out_free_mpol; |
1989 | 1982 | ||
1990 | if (new->vm_file) { | 1983 | if (new->vm_file) |
1991 | get_file(new->vm_file); | 1984 | get_file(new->vm_file); |
1992 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1993 | added_exe_file_vma(mm); | ||
1994 | } | ||
1995 | 1985 | ||
1996 | if (new->vm_ops && new->vm_ops->open) | 1986 | if (new->vm_ops && new->vm_ops->open) |
1997 | new->vm_ops->open(new); | 1987 | new->vm_ops->open(new); |
@@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, | |||
2009 | /* Clean everything up if vma_adjust failed. */ | 1999 | /* Clean everything up if vma_adjust failed. */ |
2010 | if (new->vm_ops && new->vm_ops->close) | 2000 | if (new->vm_ops && new->vm_ops->close) |
2011 | new->vm_ops->close(new); | 2001 | new->vm_ops->close(new); |
2012 | if (new->vm_file) { | 2002 | if (new->vm_file) |
2013 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2014 | removed_exe_file_vma(mm); | ||
2015 | fput(new->vm_file); | 2003 | fput(new->vm_file); |
2016 | } | ||
2017 | unlink_anon_vmas(new); | 2004 | unlink_anon_vmas(new); |
2018 | out_free_mpol: | 2005 | out_free_mpol: |
2019 | mpol_put(pol); | 2006 | mpol_put(pol); |
@@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, | |||
2408 | new_vma->vm_start = addr; | 2395 | new_vma->vm_start = addr; |
2409 | new_vma->vm_end = addr + len; | 2396 | new_vma->vm_end = addr + len; |
2410 | new_vma->vm_pgoff = pgoff; | 2397 | new_vma->vm_pgoff = pgoff; |
2411 | if (new_vma->vm_file) { | 2398 | if (new_vma->vm_file) |
2412 | get_file(new_vma->vm_file); | 2399 | get_file(new_vma->vm_file); |
2413 | |||
2414 | if (vma->vm_flags & VM_EXECUTABLE) | ||
2415 | added_exe_file_vma(mm); | ||
2416 | } | ||
2417 | if (new_vma->vm_ops && new_vma->vm_ops->open) | 2400 | if (new_vma->vm_ops && new_vma->vm_ops->open) |
2418 | new_vma->vm_ops->open(new_vma); | 2401 | new_vma->vm_ops->open(new_vma); |
2419 | vma_link(mm, new_vma, prev, rb_link, rb_parent); | 2402 | vma_link(mm, new_vma, prev, rb_link, rb_parent); |
diff --git a/mm/nommu.c b/mm/nommu.c index 98318dcff742..9c4a7b63a4df 100644 --- a/mm/nommu.c +++ b/mm/nommu.c | |||
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) | |||
789 | kenter("%p", vma); | 789 | kenter("%p", vma); |
790 | if (vma->vm_ops && vma->vm_ops->close) | 790 | if (vma->vm_ops && vma->vm_ops->close) |
791 | vma->vm_ops->close(vma); | 791 | vma->vm_ops->close(vma); |
792 | if (vma->vm_file) { | 792 | if (vma->vm_file) |
793 | fput(vma->vm_file); | 793 | fput(vma->vm_file); |
794 | if (vma->vm_flags & VM_EXECUTABLE) | ||
795 | removed_exe_file_vma(mm); | ||
796 | } | ||
797 | put_nommu_region(vma->vm_region); | 794 | put_nommu_region(vma->vm_region); |
798 | kmem_cache_free(vm_area_cachep, vma); | 795 | kmem_cache_free(vm_area_cachep, vma); |
799 | } | 796 | } |
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file, | |||
1284 | if (file) { | 1281 | if (file) { |
1285 | region->vm_file = get_file(file); | 1282 | region->vm_file = get_file(file); |
1286 | vma->vm_file = get_file(file); | 1283 | vma->vm_file = get_file(file); |
1287 | if (vm_flags & VM_EXECUTABLE) { | ||
1288 | added_exe_file_vma(current->mm); | ||
1289 | vma->vm_mm = current->mm; | ||
1290 | } | ||
1291 | } | 1284 | } |
1292 | 1285 | ||
1293 | down_write(&nommu_region_sem); | 1286 | down_write(&nommu_region_sem); |
@@ -1440,8 +1433,6 @@ error: | |||
1440 | kmem_cache_free(vm_region_jar, region); | 1433 | kmem_cache_free(vm_region_jar, region); |
1441 | if (vma->vm_file) | 1434 | if (vma->vm_file) |
1442 | fput(vma->vm_file); | 1435 | fput(vma->vm_file); |
1443 | if (vma->vm_flags & VM_EXECUTABLE) | ||
1444 | removed_exe_file_vma(vma->vm_mm); | ||
1445 | kmem_cache_free(vm_area_cachep, vma); | 1436 | kmem_cache_free(vm_area_cachep, vma); |
1446 | kleave(" = %d", ret); | 1437 | kleave(" = %d", ret); |
1447 | return ret; | 1438 | return ret; |