aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author David Herrmann <dh.herrmann@gmail.com> 2014-08-08 17:25:29 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2014-08-08 18:57:31 -0400
commit 9183df25fe7b194563db3fec6dc3202a5855839c (patch)
tree 8af760c24e1ee26e159598ae2a66912ef40cd3b0
parent 40e041a2c858b3caefc757e26cb85bfceae5062b (diff)
shm: add memfd_create() syscall
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems, but can be used like malloc()'ed memory, but with
a file-descriptor to it.

memfd_create() returns the raw shmem file, so calls like ftruncate() can be
used to modify the underlying inode. Also calls like fstat() will return
proper information and mark the file as a regular file. If you want sealing,
you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not supported (like
on all other regular files).

Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted to memcg
limits, though, and to the same overcommit or no-overcommit accounting as
all user memory.

Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- arch/x86/syscalls/syscall_32.tbl | 1
-rw-r--r-- arch/x86/syscalls/syscall_64.tbl | 1
-rw-r--r-- include/linux/syscalls.h | 1
-rw-r--r-- include/uapi/linux/memfd.h | 8
-rw-r--r-- kernel/sys_ni.c | 1
-rw-r--r-- mm/shmem.c | 73
6 files changed, 85 insertions, 0 deletions
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index d1b4a119d4a5..028b78168d85 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -362,3 +362,4 @@
362353 i386 renameat2 sys_renameat2 362353 i386 renameat2 sys_renameat2
363354 i386 seccomp sys_seccomp 363354 i386 seccomp sys_seccomp
364355 i386 getrandom sys_getrandom 364355 i386 getrandom sys_getrandom
365356 i386 memfd_create sys_memfd_create
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 252c804bb1aa..ca2b9aa78c81 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -325,6 +325,7 @@
325316 common renameat2 sys_renameat2 325316 common renameat2 sys_renameat2
326317 common seccomp sys_seccomp 326317 common seccomp sys_seccomp
327318 common getrandom sys_getrandom 327318 common getrandom sys_getrandom
328319 common memfd_create sys_memfd_create
328 329
329# 330#
330# x32-specific system call numbers start at 512 to avoid cache impact 331# x32-specific system call numbers start at 512 to avoid cache impact
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 701daff5d899..15a069425cbf 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -802,6 +802,7 @@ asmlinkage long sys_timerfd_settime(int ufd, int flags,
802asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); 802asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
803asmlinkage long sys_eventfd(unsigned int count); 803asmlinkage long sys_eventfd(unsigned int count);
804asmlinkage long sys_eventfd2(unsigned int count, int flags); 804asmlinkage long sys_eventfd2(unsigned int count, int flags);
805asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
805asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); 806asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
806asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int); 807asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
807asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, 808asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
diff --git a/include/uapi/linux/memfd.h b/include/uapi/linux/memfd.h
new file mode 100644
index 000000000000..534e364bda92
--- /dev/null
+++ b/include/uapi/linux/memfd.h
@@ -0,0 +1,8 @@
/* Include guard for the userspace-visible memfd flag definitions. */
1#ifndef _UAPI_LINUX_MEMFD_H
2#define _UAPI_LINUX_MEMFD_H
3
4/* flags for memfd_create(2) (unsigned int) */
/* MFD_CLOEXEC: the syscall allocates the new descriptor with O_CLOEXEC. */
5#define MFD_CLOEXEC 0x0001U
/* MFD_ALLOW_SEALING: the syscall clears F_SEAL_SEAL on the new file's inode. */
6#define MFD_ALLOW_SEALING 0x0002U
7
8#endif /* _UAPI_LINUX_MEMFD_H */
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 2904a2105914..1f79e3714533 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -197,6 +197,7 @@ cond_syscall(compat_sys_timerfd_settime);
197cond_syscall(compat_sys_timerfd_gettime); 197cond_syscall(compat_sys_timerfd_gettime);
198cond_syscall(sys_eventfd); 198cond_syscall(sys_eventfd);
199cond_syscall(sys_eventfd2); 199cond_syscall(sys_eventfd2);
200cond_syscall(sys_memfd_create);
200 201
201/* performance counters: */ 202/* performance counters: */
202cond_syscall(sys_perf_event_open); 203cond_syscall(sys_perf_event_open);
diff --git a/mm/shmem.c b/mm/shmem.c
index 8b43bb7a4efe..4a5498795a2b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -66,7 +66,9 @@ static struct vfsmount *shm_mnt;
66#include <linux/highmem.h> 66#include <linux/highmem.h>
67#include <linux/seq_file.h> 67#include <linux/seq_file.h>
68#include <linux/magic.h> 68#include <linux/magic.h>
69#include <linux/syscalls.h>
69#include <linux/fcntl.h> 70#include <linux/fcntl.h>
71#include <uapi/linux/memfd.h>
70 72
71#include <asm/uaccess.h> 73#include <asm/uaccess.h>
72#include <asm/pgtable.h> 74#include <asm/pgtable.h>
@@ -2732,6 +2734,77 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
2732 shmem_show_mpol(seq, sbinfo->mpol); 2734 shmem_show_mpol(seq, sbinfo->mpol);
2733 return 0; 2735 return 0;
2734} 2736}
2737
/* Fixed prefix placed in front of the user-supplied name of every memfd file. */
2738#define MFD_NAME_PREFIX "memfd:"
/* Prefix length in characters; the "- 1" drops the literal's terminating NUL. */
2739#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1)
/* Longest user-supplied name (without NUL) so that prefix + name fits NAME_MAX. */
2740#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN)
2741
/* Mask of every flag memfd_create() accepts; any other bit is rejected. */
2742#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING)
2743
/*
 * memfd_create() - create a shmem-backed file and return a descriptor for it.
 *
 * @uname: user-space pointer to a NUL-terminated name; it is copied and
 *         stored with the MFD_NAME_PREFIX ("memfd:") prefix prepended.
 * @flags: bitmask of MFD_CLOEXEC and/or MFD_ALLOW_SEALING.
 *
 * Returns the new file descriptor on success, or a negative errno:
 *   -EINVAL  unknown bits in @flags, or the name exceeds MFD_NAME_MAX_LEN
 *   -EFAULT  the name could not be measured or copied from user space, or
 *            its terminator was missing after the copy
 *   -ENOMEM  the temporary name buffer could not be allocated
 *   other    whatever get_unused_fd_flags() or shmem_file_setup() reported
 */
2744SYSCALL_DEFINE2(memfd_create,
2745 const char __user *, uname,
2746 unsigned int, flags)
2747{
2748 struct shmem_inode_info *info;
2749 struct file *file;
2750 int fd, error;
2751 char *name;
2752 long len;
2753
/* Reject any flag bit outside MFD_ALL_FLAGS. */
2754 if (flags & ~(unsigned int)MFD_ALL_FLAGS)
2755 return -EINVAL;
2756
2757 /* length includes terminating zero */
2758 len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1);
/* A non-positive result is reported as a fault on the user pointer. */
2759 if (len <= 0)
2760 return -EFAULT;
/* A result above the limit (name plus NUL) means the name is too long. */
2761 if (len > MFD_NAME_MAX_LEN + 1)
2762 return -EINVAL;
2763
/* Buffer holds the prefix followed by the name and its terminator. */
2764 name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_TEMPORARY);
2765 if (!name)
2766 return -ENOMEM;
2767
/* Write the prefix, then copy the user name over the prefix's NUL. */
2768 strcpy(name, MFD_NAME_PREFIX);
2769 if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) {
2770 error = -EFAULT;
2771 goto err_name;
2772 }
2773
2774 /* terminating-zero may have changed after strnlen_user() returned */
2775 if (name[len + MFD_NAME_PREFIX_LEN - 1]) {
2776 error = -EFAULT;
2777 goto err_name;
2778 }
2779
/* Reserve the descriptor number first; it is only published by fd_install(). */
2780 fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
2781 if (fd < 0) {
2782 error = fd;
2783 goto err_name;
2784 }
2785
/* Create the backing shmem file with an initial size of 0. */
2786 file = shmem_file_setup(name, 0, VM_NORESERVE);
2787 if (IS_ERR(file)) {
2788 error = PTR_ERR(file);
2789 goto err_fd;
2790 }
2791 info = SHMEM_I(file_inode(file));
/* Enable seeking and positional read/write, and mark the file read-write. */
2792 file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
2793 file->f_flags |= O_RDWR | O_LARGEFILE;
/*
 * Only with MFD_ALLOW_SEALING is F_SEAL_SEAL removed from the inode's seals.
 * NOTE(review): the initial value of info->seals is set outside this hunk;
 * presumably it starts with F_SEAL_SEAL so sealing is off by default - confirm.
 */
2794 if (flags & MFD_ALLOW_SEALING)
2795 info->seals &= ~F_SEAL_SEAL;
2796
/* Publish the file under the reserved descriptor; the name buffer is freed here too. */
2797 fd_install(fd, file);
2798 kfree(name);
2799 return fd;
2800
/* Error unwinding: release the reserved descriptor, then the name buffer. */
2801err_fd:
2802 put_unused_fd(fd);
2803err_name:
2804 kfree(name);
2805 return error;
2806}
2807
2735#endif /* CONFIG_TMPFS */ 2808#endif /* CONFIG_TMPFS */
2736 2809
2737static void shmem_put_super(struct super_block *sb) 2810static void shmem_put_super(struct super_block *sb)