aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Konstantin Khlebnikov <khlebnikov@openvz.org> 2012-10-08 19:28:54 -0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-10-09 03:22:18 -0400
commit: e9714acf8c439688884234dcac2bfc38bb607d38 (patch)
tree: 2e21c88f855a9f5168a143fa9948141140ff02a2
parent: 2dd8ad81e31d0d36a5d448329c646ab43eb17788 (diff)
mm: kill vma flag VM_EXECUTABLE and mm->num_exe_file_vmas
Currently the kernel sets mm->exe_file during sys_execve() and then tracks the number of vmas with the VM_EXECUTABLE flag in mm->num_exe_file_vmas; as soon as this counter drops to zero the kernel resets mm->exe_file to NULL. In addition, it resets mm->exe_file at the last mmput(), when mm->mm_users drops to zero.

A VMA with the VM_EXECUTABLE flag appears after mapping a file with the MAP_EXECUTABLE flag. Such vmas can appear only at sys_execve() or after vma splitting, because sys_mmap ignores this flag. Usually the binfmt module sets mm->exe_file and mmaps executable vmas with this file; they hold mm->exe_file while the task is running.

Comment from v2.6.25-6245-g925d1c4 ("procfs task exe symlink"), where all this stuff was introduced:

> The kernel implements readlink of /proc/pid/exe by getting the file from
> the first executable VMA. Then the path to the file is reconstructed and
> reported as the result.
>
> Because of the VMA walk the code is slightly different on nommu systems.
> This patch avoids separate /proc/pid/exe code on nommu systems. Instead of
> walking the VMAs to find the first executable file-backed VMA we store a
> reference to the exec'd file in the mm_struct.
>
> That reference would prevent the filesystem holding the executable file
> from being unmounted even after unmapping the VMAs. So we track the number
> of VM_EXECUTABLE VMAs and drop the new reference when the last one is
> unmapped. This avoids pinning the mounted filesystem.

exe_file's vma accounting is hooked into every file mmap/munmap and vma split/merge just to avoid a hypothetical case in which a filesystem is pinned against unmounting by an mm that has already unmapped all of its executable files but is still alive.

It seems that currently nobody depends on this behaviour. We can try to remove this logic and keep mm->exe_file until the final mmput().

mm->exe_file is still protected by mm->mmap_sem, because we want to be able to change it via the new sys_prctl(PR_SET_MM_EXE_FILE). Also, via this syscall a task can change its mm->exe_file and unpin the mountpoint explicitly.
Signed-off-by: Konstantin Khlebnikov <khlebnikov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Carsten Otte <cotte@de.ibm.com>
Cc: Chris Metcalf <cmetcalf@tilera.com>
Cc: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Eric Paris <eparis@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: James Morris <james.l.morris@oracle.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Kentaro Takeda <takedakn@nttdata.co.jp>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Robert Richter <robert.richter@amd.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
Cc: Venkatesh Pallipadi <venki@google.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- include/linux/mm.h 4
-rw-r--r-- include/linux/mm_types.h 1
-rw-r--r-- include/linux/mman.h 1
-rw-r--r-- kernel/fork.c 21
-rw-r--r-- mm/mmap.c 25
-rw-r--r-- mm/nommu.c 11
6 files changed, 5 insertions, 58 deletions
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 44d3fc25f556..25ef49c1f2bd 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -87,7 +87,6 @@ extern unsigned int kobjsize(const void *objp);
87#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ 87#define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */
88#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ 88#define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */
89 89
90#define VM_EXECUTABLE 0x00001000
91#define VM_LOCKED 0x00002000 90#define VM_LOCKED 0x00002000
92#define VM_IO 0x00004000 /* Memory mapped I/O or similar */ 91#define VM_IO 0x00004000 /* Memory mapped I/O or similar */
93 92
@@ -1396,9 +1395,6 @@ extern void exit_mmap(struct mm_struct *);
1396extern int mm_take_all_locks(struct mm_struct *mm); 1395extern int mm_take_all_locks(struct mm_struct *mm);
1397extern void mm_drop_all_locks(struct mm_struct *mm); 1396extern void mm_drop_all_locks(struct mm_struct *mm);
1398 1397
1399/* From fs/proc/base.c. callers must _not_ hold the mm's exe_file_lock */
1400extern void added_exe_file_vma(struct mm_struct *mm);
1401extern void removed_exe_file_vma(struct mm_struct *mm);
1402extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); 1398extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file);
1403extern struct file *get_mm_exe_file(struct mm_struct *mm); 1399extern struct file *get_mm_exe_file(struct mm_struct *mm);
1404 1400
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index bf7867200b95..58d3173eb365 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -394,7 +394,6 @@ struct mm_struct {
394 394
395 /* store ref to file /proc/<pid>/exe symlink points to */ 395 /* store ref to file /proc/<pid>/exe symlink points to */
396 struct file *exe_file; 396 struct file *exe_file;
397 unsigned long num_exe_file_vmas;
398#ifdef CONFIG_MMU_NOTIFIER 397#ifdef CONFIG_MMU_NOTIFIER
399 struct mmu_notifier_mm *mmu_notifier_mm; 398 struct mmu_notifier_mm *mmu_notifier_mm;
400#endif 399#endif
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 8b74e9b1d0ad..77cec2f45cb7 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -86,7 +86,6 @@ calc_vm_flag_bits(unsigned long flags)
86{ 86{
87 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | 87 return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
88 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | 88 _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
89 _calc_vm_trans(flags, MAP_EXECUTABLE, VM_EXECUTABLE) |
90 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); 89 _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED );
91} 90}
92#endif /* __KERNEL__ */ 91#endif /* __KERNEL__ */
diff --git a/kernel/fork.c b/kernel/fork.c
index a57a993681ed..ec667f797af3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -622,26 +622,6 @@ void mmput(struct mm_struct *mm)
622} 622}
623EXPORT_SYMBOL_GPL(mmput); 623EXPORT_SYMBOL_GPL(mmput);
624 624
625/*
626 * We added or removed a vma mapping the executable. The vmas are only mapped
627 * during exec and are not mapped with the mmap system call.
628 * Callers must hold down_write() on the mm's mmap_sem for these
629 */
630void added_exe_file_vma(struct mm_struct *mm)
631{
632 mm->num_exe_file_vmas++;
633}
634
635void removed_exe_file_vma(struct mm_struct *mm)
636{
637 mm->num_exe_file_vmas--;
638 if ((mm->num_exe_file_vmas == 0) && mm->exe_file) {
639 fput(mm->exe_file);
640 mm->exe_file = NULL;
641 }
642
643}
644
645void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) 625void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
646{ 626{
647 if (new_exe_file) 627 if (new_exe_file)
@@ -649,7 +629,6 @@ void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
649 if (mm->exe_file) 629 if (mm->exe_file)
650 fput(mm->exe_file); 630 fput(mm->exe_file);
651 mm->exe_file = new_exe_file; 631 mm->exe_file = new_exe_file;
652 mm->num_exe_file_vmas = 0;
653} 632}
654 633
655struct file *get_mm_exe_file(struct mm_struct *mm) 634struct file *get_mm_exe_file(struct mm_struct *mm)
diff --git a/mm/mmap.c b/mm/mmap.c
index d0686d355113..c1ad2e78ea58 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -231,11 +231,8 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
231 might_sleep(); 231 might_sleep();
232 if (vma->vm_ops && vma->vm_ops->close) 232 if (vma->vm_ops && vma->vm_ops->close)
233 vma->vm_ops->close(vma); 233 vma->vm_ops->close(vma);
234 if (vma->vm_file) { 234 if (vma->vm_file)
235 fput(vma->vm_file); 235 fput(vma->vm_file);
236 if (vma->vm_flags & VM_EXECUTABLE)
237 removed_exe_file_vma(vma->vm_mm);
238 }
239 mpol_put(vma_policy(vma)); 236 mpol_put(vma_policy(vma));
240 kmem_cache_free(vm_area_cachep, vma); 237 kmem_cache_free(vm_area_cachep, vma);
241 return next; 238 return next;
@@ -636,8 +633,6 @@ again: remove_next = 1 + (end > next->vm_end);
636 if (file) { 633 if (file) {
637 uprobe_munmap(next, next->vm_start, next->vm_end); 634 uprobe_munmap(next, next->vm_start, next->vm_end);
638 fput(file); 635 fput(file);
639 if (next->vm_flags & VM_EXECUTABLE)
640 removed_exe_file_vma(mm);
641 } 636 }
642 if (next->anon_vma) 637 if (next->anon_vma)
643 anon_vma_merge(vma, next); 638 anon_vma_merge(vma, next);
@@ -1304,8 +1299,6 @@ munmap_back:
1304 error = file->f_op->mmap(file, vma); 1299 error = file->f_op->mmap(file, vma);
1305 if (error) 1300 if (error)
1306 goto unmap_and_free_vma; 1301 goto unmap_and_free_vma;
1307 if (vm_flags & VM_EXECUTABLE)
1308 added_exe_file_vma(mm);
1309 1302
1310 /* Can addr have changed?? 1303 /* Can addr have changed??
1311 * 1304 *
@@ -1987,11 +1980,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
1987 if (anon_vma_clone(new, vma)) 1980 if (anon_vma_clone(new, vma))
1988 goto out_free_mpol; 1981 goto out_free_mpol;
1989 1982
1990 if (new->vm_file) { 1983 if (new->vm_file)
1991 get_file(new->vm_file); 1984 get_file(new->vm_file);
1992 if (vma->vm_flags & VM_EXECUTABLE)
1993 added_exe_file_vma(mm);
1994 }
1995 1985
1996 if (new->vm_ops && new->vm_ops->open) 1986 if (new->vm_ops && new->vm_ops->open)
1997 new->vm_ops->open(new); 1987 new->vm_ops->open(new);
@@ -2009,11 +1999,8 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
2009 /* Clean everything up if vma_adjust failed. */ 1999 /* Clean everything up if vma_adjust failed. */
2010 if (new->vm_ops && new->vm_ops->close) 2000 if (new->vm_ops && new->vm_ops->close)
2011 new->vm_ops->close(new); 2001 new->vm_ops->close(new);
2012 if (new->vm_file) { 2002 if (new->vm_file)
2013 if (vma->vm_flags & VM_EXECUTABLE)
2014 removed_exe_file_vma(mm);
2015 fput(new->vm_file); 2003 fput(new->vm_file);
2016 }
2017 unlink_anon_vmas(new); 2004 unlink_anon_vmas(new);
2018 out_free_mpol: 2005 out_free_mpol:
2019 mpol_put(pol); 2006 mpol_put(pol);
@@ -2408,12 +2395,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
2408 new_vma->vm_start = addr; 2395 new_vma->vm_start = addr;
2409 new_vma->vm_end = addr + len; 2396 new_vma->vm_end = addr + len;
2410 new_vma->vm_pgoff = pgoff; 2397 new_vma->vm_pgoff = pgoff;
2411 if (new_vma->vm_file) { 2398 if (new_vma->vm_file)
2412 get_file(new_vma->vm_file); 2399 get_file(new_vma->vm_file);
2413
2414 if (vma->vm_flags & VM_EXECUTABLE)
2415 added_exe_file_vma(mm);
2416 }
2417 if (new_vma->vm_ops && new_vma->vm_ops->open) 2400 if (new_vma->vm_ops && new_vma->vm_ops->open)
2418 new_vma->vm_ops->open(new_vma); 2401 new_vma->vm_ops->open(new_vma);
2419 vma_link(mm, new_vma, prev, rb_link, rb_parent); 2402 vma_link(mm, new_vma, prev, rb_link, rb_parent);
diff --git a/mm/nommu.c b/mm/nommu.c
index 98318dcff742..9c4a7b63a4df 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -789,11 +789,8 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
789 kenter("%p", vma); 789 kenter("%p", vma);
790 if (vma->vm_ops && vma->vm_ops->close) 790 if (vma->vm_ops && vma->vm_ops->close)
791 vma->vm_ops->close(vma); 791 vma->vm_ops->close(vma);
792 if (vma->vm_file) { 792 if (vma->vm_file)
793 fput(vma->vm_file); 793 fput(vma->vm_file);
794 if (vma->vm_flags & VM_EXECUTABLE)
795 removed_exe_file_vma(mm);
796 }
797 put_nommu_region(vma->vm_region); 794 put_nommu_region(vma->vm_region);
798 kmem_cache_free(vm_area_cachep, vma); 795 kmem_cache_free(vm_area_cachep, vma);
799} 796}
@@ -1284,10 +1281,6 @@ unsigned long do_mmap_pgoff(struct file *file,
1284 if (file) { 1281 if (file) {
1285 region->vm_file = get_file(file); 1282 region->vm_file = get_file(file);
1286 vma->vm_file = get_file(file); 1283 vma->vm_file = get_file(file);
1287 if (vm_flags & VM_EXECUTABLE) {
1288 added_exe_file_vma(current->mm);
1289 vma->vm_mm = current->mm;
1290 }
1291 } 1284 }
1292 1285
1293 down_write(&nommu_region_sem); 1286 down_write(&nommu_region_sem);
@@ -1440,8 +1433,6 @@ error:
1440 kmem_cache_free(vm_region_jar, region); 1433 kmem_cache_free(vm_region_jar, region);
1441 if (vma->vm_file) 1434 if (vma->vm_file)
1442 fput(vma->vm_file); 1435 fput(vma->vm_file);
1443 if (vma->vm_flags & VM_EXECUTABLE)
1444 removed_exe_file_vma(vma->vm_mm);
1445 kmem_cache_free(vm_area_cachep, vma); 1436 kmem_cache_free(vm_area_cachep, vma);
1446 kleave(" = %d", ret); 1437 kleave(" = %d", ret);
1447 return ret; 1438 return ret;