summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2019-05-07 15:30:24 -0400
committer Linus Torvalds <torvalds@linux-foundation.org> 2019-05-07 15:30:24 -0400
commit eac7078a0fff1e72cf2b641721e3f55ec7e5e21e (patch)
tree daf83e62d8313025a2c5e4ceb2d7f3c1b8ff057c
parent 41bc10cabe96bbd0ff3e2813d15f9070bff57a03 (diff)
parent 43c6afee48d4d866d5eb984d3a5dbbc7d9b4e7bf (diff)
Merge tag 'pidfd-v5.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux
Pull pidfd updates from Christian Brauner:

"This patchset makes it possible to retrieve pidfds at process creation time by introducing the new flag CLONE_PIDFD to the clone() system call. Linus originally suggested implementing this as a new flag to clone() instead of making it a separate system call.

After a thorough review from Oleg, CLONE_PIDFD returns pidfds in the parent_tidptr argument. This means we can give back the associated pid and the pidfd at the same time. Access to process metadata information thus becomes rather trivial.

As has been agreed, CLONE_PIDFD creates file descriptors based on anonymous inodes similar to the new mount API. They are made unconditional by this patchset as they are now needed by core kernel code (vfs, pidfd) even more than they already were before (timerfd, signalfd, io_uring, epoll etc.). The core patchset is rather small. The bulky looking changelist is caused by David's very simple changes to Kconfig to make anon inodes unconditional.

A pidfd comes with additional information in fdinfo if the kernel supports procfs. The fdinfo file contains the pid of the process in the caller's pid namespace in the same format as the procfs status file, i.e. "Pid:\t%d".

To remove worries about missing metadata access this patchset comes with a sample/test program that illustrates how a combination of CLONE_PIDFD and pidfd_send_signal() can be used to gain race-free access to process metadata through /proc/<pid>.

Further work based on this patchset has been done by Joel. His work makes pidfds pollable. It finished too late for this merge window. I would prefer to have it sitting in linux-next for a while and send it for inclusion during the 5.3 merge window"

* tag 'pidfd-v5.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  samples: show race-free pidfd metadata access
  signal: support CLONE_PIDFD with pidfd_send_signal
  clone: add CLONE_PIDFD
  Make anon_inodes unconditional
-rw-r--r-- arch/arm/kvm/Kconfig 1
-rw-r--r-- arch/arm64/kvm/Kconfig 1
-rw-r--r-- arch/mips/kvm/Kconfig 1
-rw-r--r-- arch/powerpc/kvm/Kconfig 1
-rw-r--r-- arch/s390/kvm/Kconfig 1
-rw-r--r-- arch/x86/Kconfig 1
-rw-r--r-- arch/x86/kvm/Kconfig 1
-rw-r--r-- drivers/base/Kconfig 1
-rw-r--r-- drivers/char/tpm/Kconfig 1
-rw-r--r-- drivers/dma-buf/Kconfig 1
-rw-r--r-- drivers/gpio/Kconfig 1
-rw-r--r-- drivers/iio/Kconfig 1
-rw-r--r-- drivers/infiniband/Kconfig 1
-rw-r--r-- drivers/vfio/Kconfig 1
-rw-r--r-- fs/Makefile 2
-rw-r--r-- fs/notify/fanotify/Kconfig 1
-rw-r--r-- fs/notify/inotify/Kconfig 1
-rw-r--r-- include/linux/pid.h 2
-rw-r--r-- include/uapi/linux/sched.h 1
-rw-r--r-- init/Kconfig 10
-rw-r--r-- kernel/fork.c 107
-rw-r--r-- kernel/signal.c 12
-rw-r--r-- kernel/sys_ni.c 3
-rw-r--r-- samples/Makefile 2
-rw-r--r-- samples/pidfd/Makefile 6
-rw-r--r-- samples/pidfd/pidfd-metadata.c 112
26 files changed, 235 insertions, 38 deletions
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 3f5320f46de2..f591026347a5 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -22,7 +22,6 @@ config KVM
22 bool "Kernel-based Virtual Machine (KVM) support" 22 bool "Kernel-based Virtual Machine (KVM) support"
23 depends on MMU && OF 23 depends on MMU && OF
24 select PREEMPT_NOTIFIERS 24 select PREEMPT_NOTIFIERS
25 select ANON_INODES
26 select ARM_GIC 25 select ARM_GIC
27 select ARM_GIC_V3 26 select ARM_GIC_V3
28 select ARM_GIC_V3_ITS 27 select ARM_GIC_V3_ITS
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index a3f85624313e..a67121d419a2 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -23,7 +23,6 @@ config KVM
23 depends on OF 23 depends on OF
24 select MMU_NOTIFIER 24 select MMU_NOTIFIER
25 select PREEMPT_NOTIFIERS 25 select PREEMPT_NOTIFIERS
26 select ANON_INODES
27 select HAVE_KVM_CPU_RELAX_INTERCEPT 26 select HAVE_KVM_CPU_RELAX_INTERCEPT
28 select HAVE_KVM_ARCH_TLB_FLUSH_ALL 27 select HAVE_KVM_ARCH_TLB_FLUSH_ALL
29 select KVM_MMIO 28 select KVM_MMIO
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 4528bc9c3cb1..eac25aef21e0 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -21,7 +21,6 @@ config KVM
21 depends on MIPS_FP_SUPPORT 21 depends on MIPS_FP_SUPPORT
22 select EXPORT_UASM 22 select EXPORT_UASM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 select KVM_GENERIC_DIRTYLOG_READ_PROTECT 24 select KVM_GENERIC_DIRTYLOG_READ_PROTECT
26 select HAVE_KVM_VCPU_ASYNC_IOCTL 25 select HAVE_KVM_VCPU_ASYNC_IOCTL
27 select KVM_MMIO 26 select KVM_MMIO
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index bfdde04e4905..f53997a8ca62 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@ if VIRTUALIZATION
20config KVM 20config KVM
21 bool 21 bool
22 select PREEMPT_NOTIFIERS 22 select PREEMPT_NOTIFIERS
23 select ANON_INODES
24 select HAVE_KVM_EVENTFD 23 select HAVE_KVM_EVENTFD
25 select HAVE_KVM_VCPU_ASYNC_IOCTL 24 select HAVE_KVM_VCPU_ASYNC_IOCTL
26 select SRCU 25 select SRCU
diff --git a/arch/s390/kvm/Kconfig b/arch/s390/kvm/Kconfig
index 767453faacfc..1816ee48eadd 100644
--- a/arch/s390/kvm/Kconfig
+++ b/arch/s390/kvm/Kconfig
@@ -21,7 +21,6 @@ config KVM
21 prompt "Kernel-based Virtual Machine (KVM) support" 21 prompt "Kernel-based Virtual Machine (KVM) support"
22 depends on HAVE_KVM 22 depends on HAVE_KVM
23 select PREEMPT_NOTIFIERS 23 select PREEMPT_NOTIFIERS
24 select ANON_INODES
25 select HAVE_KVM_CPU_RELAX_INTERCEPT 24 select HAVE_KVM_CPU_RELAX_INTERCEPT
26 select HAVE_KVM_VCPU_ASYNC_IOCTL 25 select HAVE_KVM_VCPU_ASYNC_IOCTL
27 select HAVE_KVM_EVENTFD 26 select HAVE_KVM_EVENTFD
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 0a3cc347143f..e7212731cffb 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -44,7 +44,6 @@ config X86
44 # 44 #
45 select ACPI_LEGACY_TABLES_LOOKUP if ACPI 45 select ACPI_LEGACY_TABLES_LOOKUP if ACPI
46 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI 46 select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI
47 select ANON_INODES
48 select ARCH_32BIT_OFF_T if X86_32 47 select ARCH_32BIT_OFF_T if X86_32
49 select ARCH_CLOCKSOURCE_DATA 48 select ARCH_CLOCKSOURCE_DATA
50 select ARCH_CLOCKSOURCE_INIT 49 select ARCH_CLOCKSOURCE_INIT
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 72fa955f4a15..fc042419e670 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -27,7 +27,6 @@ config KVM
27 depends on X86_LOCAL_APIC 27 depends on X86_LOCAL_APIC
28 select PREEMPT_NOTIFIERS 28 select PREEMPT_NOTIFIERS
29 select MMU_NOTIFIER 29 select MMU_NOTIFIER
30 select ANON_INODES
31 select HAVE_KVM_IRQCHIP 30 select HAVE_KVM_IRQCHIP
32 select HAVE_KVM_IRQFD 31 select HAVE_KVM_IRQFD
33 select IRQ_BYPASS_MANAGER 32 select IRQ_BYPASS_MANAGER
diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
index 059700ea3521..03f067da12ee 100644
--- a/drivers/base/Kconfig
+++ b/drivers/base/Kconfig
@@ -174,7 +174,6 @@ source "drivers/base/regmap/Kconfig"
174config DMA_SHARED_BUFFER 174config DMA_SHARED_BUFFER
175 bool 175 bool
176 default n 176 default n
177 select ANON_INODES
178 select IRQ_WORK 177 select IRQ_WORK
179 help 178 help
180 This option enables the framework for buffer-sharing between 179 This option enables the framework for buffer-sharing between
diff --git a/drivers/char/tpm/Kconfig b/drivers/char/tpm/Kconfig
index 536e55d3919f..f3e4bc490cf0 100644
--- a/drivers/char/tpm/Kconfig
+++ b/drivers/char/tpm/Kconfig
@@ -157,7 +157,6 @@ config TCG_CRB
157config TCG_VTPM_PROXY 157config TCG_VTPM_PROXY
158 tristate "VTPM Proxy Interface" 158 tristate "VTPM Proxy Interface"
159 depends on TCG_TPM 159 depends on TCG_TPM
160 select ANON_INODES
161 ---help--- 160 ---help---
162 This driver proxies for an emulated TPM (vTPM) running in userspace. 161 This driver proxies for an emulated TPM (vTPM) running in userspace.
163 A device /dev/vtpmx is provided that creates a device pair 162 A device /dev/vtpmx is provided that creates a device pair
diff --git a/drivers/dma-buf/Kconfig b/drivers/dma-buf/Kconfig
index 2e5a0faa2cb1..3fc9c2efc583 100644
--- a/drivers/dma-buf/Kconfig
+++ b/drivers/dma-buf/Kconfig
@@ -3,7 +3,6 @@ menu "DMABUF options"
3config SYNC_FILE 3config SYNC_FILE
4 bool "Explicit Synchronization Framework" 4 bool "Explicit Synchronization Framework"
5 default n 5 default n
6 select ANON_INODES
7 select DMA_SHARED_BUFFER 6 select DMA_SHARED_BUFFER
8 ---help--- 7 ---help---
9 The Sync File Framework adds explicit syncronization via 8 The Sync File Framework adds explicit syncronization via
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 3f50526a771f..0f91600c27ae 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -12,7 +12,6 @@ config ARCH_HAVE_CUSTOM_GPIO_H
12 12
13menuconfig GPIOLIB 13menuconfig GPIOLIB
14 bool "GPIO Support" 14 bool "GPIO Support"
15 select ANON_INODES
16 help 15 help
17 This enables GPIO support through the generic GPIO library. 16 This enables GPIO support through the generic GPIO library.
18 You only need to enable this, if you also want to enable 17 You only need to enable this, if you also want to enable
diff --git a/drivers/iio/Kconfig b/drivers/iio/Kconfig
index d08aeb41cd07..1dec0fecb6ef 100644
--- a/drivers/iio/Kconfig
+++ b/drivers/iio/Kconfig
@@ -4,7 +4,6 @@
4 4
5menuconfig IIO 5menuconfig IIO
6 tristate "Industrial I/O support" 6 tristate "Industrial I/O support"
7 select ANON_INODES
8 help 7 help
9 The industrial I/O subsystem provides a unified framework for 8 The industrial I/O subsystem provides a unified framework for
10 drivers for many different types of embedded sensors using a 9 drivers for many different types of embedded sensors using a
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index a1fb840de45d..d318bab25860 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -25,7 +25,6 @@ config INFINIBAND_USER_MAD
25 25
26config INFINIBAND_USER_ACCESS 26config INFINIBAND_USER_ACCESS
27 tristate "InfiniBand userspace access (verbs and CM)" 27 tristate "InfiniBand userspace access (verbs and CM)"
28 select ANON_INODES
29 depends on MMU 28 depends on MMU
30 ---help--- 29 ---help---
31 Userspace InfiniBand access support. This enables the 30 Userspace InfiniBand access support. This enables the
diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 9de5ed38da83..3798d77d131c 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -22,7 +22,6 @@ menuconfig VFIO
22 tristate "VFIO Non-Privileged userspace driver framework" 22 tristate "VFIO Non-Privileged userspace driver framework"
23 depends on IOMMU_API 23 depends on IOMMU_API
24 select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64) 24 select VFIO_IOMMU_TYPE1 if (X86 || S390 || ARM || ARM64)
25 select ANON_INODES
26 help 25 help
27 VFIO provides a framework for secure userspace device drivers. 26 VFIO provides a framework for secure userspace device drivers.
28 See Documentation/vfio.txt for more details. 27 See Documentation/vfio.txt for more details.
diff --git a/fs/Makefile b/fs/Makefile
index 427fec226fae..35945f8139e6 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -25,7 +25,7 @@ obj-$(CONFIG_PROC_FS) += proc_namespace.o
25 25
26obj-y += notify/ 26obj-y += notify/
27obj-$(CONFIG_EPOLL) += eventpoll.o 27obj-$(CONFIG_EPOLL) += eventpoll.o
28obj-$(CONFIG_ANON_INODES) += anon_inodes.o 28obj-y += anon_inodes.o
29obj-$(CONFIG_SIGNALFD) += signalfd.o 29obj-$(CONFIG_SIGNALFD) += signalfd.o
30obj-$(CONFIG_TIMERFD) += timerfd.o 30obj-$(CONFIG_TIMERFD) += timerfd.o
31obj-$(CONFIG_EVENTFD) += eventfd.o 31obj-$(CONFIG_EVENTFD) += eventfd.o
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig
index 735bfb2e9190..521dc91d2cb5 100644
--- a/fs/notify/fanotify/Kconfig
+++ b/fs/notify/fanotify/Kconfig
@@ -1,7 +1,6 @@
1config FANOTIFY 1config FANOTIFY
2 bool "Filesystem wide access notification" 2 bool "Filesystem wide access notification"
3 select FSNOTIFY 3 select FSNOTIFY
4 select ANON_INODES
5 select EXPORTFS 4 select EXPORTFS
6 default n 5 default n
7 ---help--- 6 ---help---
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index b981fc0c8379..0161c74e76e2 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -1,6 +1,5 @@
1config INOTIFY_USER 1config INOTIFY_USER
2 bool "Inotify support for userspace" 2 bool "Inotify support for userspace"
3 select ANON_INODES
4 select FSNOTIFY 3 select FSNOTIFY
5 default y 4 default y
6 ---help--- 5 ---help---
diff --git a/include/linux/pid.h b/include/linux/pid.h
index b6f4ba16065a..3c8ef5a199ca 100644
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -66,6 +66,8 @@ struct pid
66 66
67extern struct pid init_struct_pid; 67extern struct pid init_struct_pid;
68 68
69extern const struct file_operations pidfd_fops;
70
69static inline struct pid *get_pid(struct pid *pid) 71static inline struct pid *get_pid(struct pid *pid)
70{ 72{
71 if (pid) 73 if (pid)
diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h
index 22627f80063e..ed4ee170bee2 100644
--- a/include/uapi/linux/sched.h
+++ b/include/uapi/linux/sched.h
@@ -10,6 +10,7 @@
10#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ 10#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
11#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ 11#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
12#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ 12#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
13#define CLONE_PIDFD 0x00001000 /* set if a pidfd should be placed in parent */
13#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ 14#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
14#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ 15#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
15#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ 16#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
diff --git a/init/Kconfig b/init/Kconfig
index 4592bf7997c0..be8f97e37a76 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1171,9 +1171,6 @@ config LD_DEAD_CODE_DATA_ELIMINATION
1171config SYSCTL 1171config SYSCTL
1172 bool 1172 bool
1173 1173
1174config ANON_INODES
1175 bool
1176
1177config HAVE_UID16 1174config HAVE_UID16
1178 bool 1175 bool
1179 1176
@@ -1378,14 +1375,12 @@ config HAVE_FUTEX_CMPXCHG
1378config EPOLL 1375config EPOLL
1379 bool "Enable eventpoll support" if EXPERT 1376 bool "Enable eventpoll support" if EXPERT
1380 default y 1377 default y
1381 select ANON_INODES
1382 help 1378 help
1383 Disabling this option will cause the kernel to be built without 1379 Disabling this option will cause the kernel to be built without
1384 support for epoll family of system calls. 1380 support for epoll family of system calls.
1385 1381
1386config SIGNALFD 1382config SIGNALFD
1387 bool "Enable signalfd() system call" if EXPERT 1383 bool "Enable signalfd() system call" if EXPERT
1388 select ANON_INODES
1389 default y 1384 default y
1390 help 1385 help
1391 Enable the signalfd() system call that allows to receive signals 1386 Enable the signalfd() system call that allows to receive signals
@@ -1395,7 +1390,6 @@ config SIGNALFD
1395 1390
1396config TIMERFD 1391config TIMERFD
1397 bool "Enable timerfd() system call" if EXPERT 1392 bool "Enable timerfd() system call" if EXPERT
1398 select ANON_INODES
1399 default y 1393 default y
1400 help 1394 help
1401 Enable the timerfd() system call that allows to receive timer 1395 Enable the timerfd() system call that allows to receive timer
@@ -1405,7 +1399,6 @@ config TIMERFD
1405 1399
1406config EVENTFD 1400config EVENTFD
1407 bool "Enable eventfd() system call" if EXPERT 1401 bool "Enable eventfd() system call" if EXPERT
1408 select ANON_INODES
1409 default y 1402 default y
1410 help 1403 help
1411 Enable the eventfd() system call that allows to receive both 1404 Enable the eventfd() system call that allows to receive both
@@ -1516,7 +1509,6 @@ config KALLSYMS_BASE_RELATIVE
1516# syscall, maps, verifier 1509# syscall, maps, verifier
1517config BPF_SYSCALL 1510config BPF_SYSCALL
1518 bool "Enable bpf() system call" 1511 bool "Enable bpf() system call"
1519 select ANON_INODES
1520 select BPF 1512 select BPF
1521 select IRQ_WORK 1513 select IRQ_WORK
1522 default n 1514 default n
@@ -1533,7 +1525,6 @@ config BPF_JIT_ALWAYS_ON
1533 1525
1534config USERFAULTFD 1526config USERFAULTFD
1535 bool "Enable userfaultfd() system call" 1527 bool "Enable userfaultfd() system call"
1536 select ANON_INODES
1537 depends on MMU 1528 depends on MMU
1538 help 1529 help
1539 Enable the userfaultfd() system call that allows to intercept and 1530 Enable the userfaultfd() system call that allows to intercept and
@@ -1600,7 +1591,6 @@ config PERF_EVENTS
1600 bool "Kernel performance events and counters" 1591 bool "Kernel performance events and counters"
1601 default y if PROFILING 1592 default y if PROFILING
1602 depends on HAVE_PERF_EVENTS 1593 depends on HAVE_PERF_EVENTS
1603 select ANON_INODES
1604 select IRQ_WORK 1594 select IRQ_WORK
1605 select SRCU 1595 select SRCU
1606 help 1596 help
diff --git a/kernel/fork.c b/kernel/fork.c
index fbe9dfcd8680..8b03d93ba068 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,6 +11,7 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/anon_inodes.h>
14#include <linux/slab.h> 15#include <linux/slab.h>
15#include <linux/sched/autogroup.h> 16#include <linux/sched/autogroup.h>
16#include <linux/sched/mm.h> 17#include <linux/sched/mm.h>
@@ -21,6 +22,7 @@
21#include <linux/sched/task.h> 22#include <linux/sched/task.h>
22#include <linux/sched/task_stack.h> 23#include <linux/sched/task_stack.h>
23#include <linux/sched/cputime.h> 24#include <linux/sched/cputime.h>
25#include <linux/seq_file.h>
24#include <linux/rtmutex.h> 26#include <linux/rtmutex.h>
25#include <linux/init.h> 27#include <linux/init.h>
26#include <linux/unistd.h> 28#include <linux/unistd.h>
@@ -1670,6 +1672,58 @@ static inline void rcu_copy_process(struct task_struct *p)
1670#endif /* #ifdef CONFIG_TASKS_RCU */ 1672#endif /* #ifdef CONFIG_TASKS_RCU */
1671} 1673}
1672 1674
1675static int pidfd_release(struct inode *inode, struct file *file)
1676{
1677 struct pid *pid = file->private_data;
1678
1679 file->private_data = NULL;
1680 put_pid(pid);
1681 return 0;
1682}
1683
1684#ifdef CONFIG_PROC_FS
1685static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
1686{
1687 struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
1688 struct pid *pid = f->private_data;
1689
1690 seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
1691 seq_putc(m, '\n');
1692}
1693#endif
1694
1695const struct file_operations pidfd_fops = {
1696 .release = pidfd_release,
1697#ifdef CONFIG_PROC_FS
1698 .show_fdinfo = pidfd_show_fdinfo,
1699#endif
1700};
1701
1702/**
1703 * pidfd_create() - Create a new pid file descriptor.
1704 *
1705 * @pid: struct pid that the pidfd will reference
1706 *
1707 * This creates a new pid file descriptor with the O_CLOEXEC flag set.
1708 *
1709 * Note, that this function can only be called after the fd table has
1710 * been unshared to avoid leaking the pidfd to the new process.
1711 *
1712 * Return: On success, a cloexec pidfd is returned.
1713 * On error, a negative errno number will be returned.
1714 */
1715static int pidfd_create(struct pid *pid)
1716{
1717 int fd;
1718
1719 fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
1720 O_RDWR | O_CLOEXEC);
1721 if (fd < 0)
1722 put_pid(pid);
1723
1724 return fd;
1725}
1726
1673/* 1727/*
1674 * This creates a new process as a copy of the old one, 1728 * This creates a new process as a copy of the old one,
1675 * but does not actually start it yet. 1729 * but does not actually start it yet.
@@ -1682,13 +1736,14 @@ static __latent_entropy struct task_struct *copy_process(
1682 unsigned long clone_flags, 1736 unsigned long clone_flags,
1683 unsigned long stack_start, 1737 unsigned long stack_start,
1684 unsigned long stack_size, 1738 unsigned long stack_size,
1739 int __user *parent_tidptr,
1685 int __user *child_tidptr, 1740 int __user *child_tidptr,
1686 struct pid *pid, 1741 struct pid *pid,
1687 int trace, 1742 int trace,
1688 unsigned long tls, 1743 unsigned long tls,
1689 int node) 1744 int node)
1690{ 1745{
1691 int retval; 1746 int pidfd = -1, retval;
1692 struct task_struct *p; 1747 struct task_struct *p;
1693 struct multiprocess_signals delayed; 1748 struct multiprocess_signals delayed;
1694 1749
@@ -1738,6 +1793,31 @@ static __latent_entropy struct task_struct *copy_process(
1738 return ERR_PTR(-EINVAL); 1793 return ERR_PTR(-EINVAL);
1739 } 1794 }
1740 1795
1796 if (clone_flags & CLONE_PIDFD) {
1797 int reserved;
1798
1799 /*
1800 * - CLONE_PARENT_SETTID is useless for pidfds and also
1801 * parent_tidptr is used to return pidfds.
1802 * - CLONE_DETACHED is blocked so that we can potentially
1803 * reuse it later for CLONE_PIDFD.
1804 * - CLONE_THREAD is blocked until someone really needs it.
1805 */
1806 if (clone_flags &
1807 (CLONE_DETACHED | CLONE_PARENT_SETTID | CLONE_THREAD))
1808 return ERR_PTR(-EINVAL);
1809
1810 /*
1811 * Verify that parent_tidptr is sane so we can potentially
1812 * reuse it later.
1813 */
1814 if (get_user(reserved, parent_tidptr))
1815 return ERR_PTR(-EFAULT);
1816
1817 if (reserved != 0)
1818 return ERR_PTR(-EINVAL);
1819 }
1820
1741 /* 1821 /*
1742 * Force any signals received before this point to be delivered 1822 * Force any signals received before this point to be delivered
1743 * before the fork happens. Collect up signals sent to multiple 1823 * before the fork happens. Collect up signals sent to multiple
@@ -1944,6 +2024,22 @@ static __latent_entropy struct task_struct *copy_process(
1944 } 2024 }
1945 } 2025 }
1946 2026
2027 /*
2028 * This has to happen after we've potentially unshared the file
2029 * descriptor table (so that the pidfd doesn't leak into the child
2030 * if the fd table isn't shared).
2031 */
2032 if (clone_flags & CLONE_PIDFD) {
2033 retval = pidfd_create(pid);
2034 if (retval < 0)
2035 goto bad_fork_free_pid;
2036
2037 pidfd = retval;
2038 retval = put_user(pidfd, parent_tidptr);
2039 if (retval)
2040 goto bad_fork_put_pidfd;
2041 }
2042
1947#ifdef CONFIG_BLOCK 2043#ifdef CONFIG_BLOCK
1948 p->plug = NULL; 2044 p->plug = NULL;
1949#endif 2045#endif
@@ -2004,7 +2100,7 @@ static __latent_entropy struct task_struct *copy_process(
2004 */ 2100 */
2005 retval = cgroup_can_fork(p); 2101 retval = cgroup_can_fork(p);
2006 if (retval) 2102 if (retval)
2007 goto bad_fork_free_pid; 2103 goto bad_fork_put_pidfd;
2008 2104
2009 /* 2105 /*
2010 * From this point on we must avoid any synchronous user-space 2106 * From this point on we must avoid any synchronous user-space
@@ -2119,6 +2215,9 @@ bad_fork_cancel_cgroup:
2119 spin_unlock(&current->sighand->siglock); 2215 spin_unlock(&current->sighand->siglock);
2120 write_unlock_irq(&tasklist_lock); 2216 write_unlock_irq(&tasklist_lock);
2121 cgroup_cancel_fork(p); 2217 cgroup_cancel_fork(p);
2218bad_fork_put_pidfd:
2219 if (clone_flags & CLONE_PIDFD)
2220 ksys_close(pidfd);
2122bad_fork_free_pid: 2221bad_fork_free_pid:
2123 cgroup_threadgroup_change_end(current); 2222 cgroup_threadgroup_change_end(current);
2124 if (pid != &init_struct_pid) 2223 if (pid != &init_struct_pid)
@@ -2184,7 +2283,7 @@ static inline void init_idle_pids(struct task_struct *idle)
2184struct task_struct *fork_idle(int cpu) 2283struct task_struct *fork_idle(int cpu)
2185{ 2284{
2186 struct task_struct *task; 2285 struct task_struct *task;
2187 task = copy_process(CLONE_VM, 0, 0, NULL, &init_struct_pid, 0, 0, 2286 task = copy_process(CLONE_VM, 0, 0, NULL, NULL, &init_struct_pid, 0, 0,
2188 cpu_to_node(cpu)); 2287 cpu_to_node(cpu));
2189 if (!IS_ERR(task)) { 2288 if (!IS_ERR(task)) {
2190 init_idle_pids(task); 2289 init_idle_pids(task);
@@ -2236,7 +2335,7 @@ long _do_fork(unsigned long clone_flags,
2236 trace = 0; 2335 trace = 0;
2237 } 2336 }
2238 2337
2239 p = copy_process(clone_flags, stack_start, stack_size, 2338 p = copy_process(clone_flags, stack_start, stack_size, parent_tidptr,
2240 child_tidptr, NULL, trace, tls, NUMA_NO_NODE); 2339 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
2241 add_latent_entropy(); 2340 add_latent_entropy();
2242 2341
diff --git a/kernel/signal.c b/kernel/signal.c
index 227ba170298e..cd83cc376767 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3513,7 +3513,6 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
3513 return kill_something_info(sig, &info, pid); 3513 return kill_something_info(sig, &info, pid);
3514} 3514}
3515 3515
3516#ifdef CONFIG_PROC_FS
3517/* 3516/*
3518 * Verify that the signaler and signalee either are in the same pid namespace 3517 * Verify that the signaler and signalee either are in the same pid namespace
3519 * or that the signaler's pid namespace is an ancestor of the signalee's pid 3518 * or that the signaler's pid namespace is an ancestor of the signalee's pid
@@ -3550,6 +3549,14 @@ static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo, siginfo_t *info)
3550 return copy_siginfo_from_user(kinfo, info); 3549 return copy_siginfo_from_user(kinfo, info);
3551} 3550}
3552 3551
3552static struct pid *pidfd_to_pid(const struct file *file)
3553{
3554 if (file->f_op == &pidfd_fops)
3555 return file->private_data;
3556
3557 return tgid_pidfd_to_pid(file);
3558}
3559
3553/** 3560/**
3554 * sys_pidfd_send_signal - send a signal to a process through a task file 3561 * sys_pidfd_send_signal - send a signal to a process through a task file
3555 * descriptor 3562 * descriptor
@@ -3586,7 +3593,7 @@ SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
3586 return -EBADF; 3593 return -EBADF;
3587 3594
3588 /* Is this a pidfd? */ 3595 /* Is this a pidfd? */
3589 pid = tgid_pidfd_to_pid(f.file); 3596 pid = pidfd_to_pid(f.file);
3590 if (IS_ERR(pid)) { 3597 if (IS_ERR(pid)) {
3591 ret = PTR_ERR(pid); 3598 ret = PTR_ERR(pid);
3592 goto err; 3599 goto err;
@@ -3620,7 +3627,6 @@ err:
3620 fdput(f); 3627 fdput(f);
3621 return ret; 3628 return ret;
3622} 3629}
3623#endif /* CONFIG_PROC_FS */
3624 3630
3625static int 3631static int
3626do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) 3632do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index d21f4befaea4..4d9ae5ea6caf 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -167,9 +167,6 @@ COND_SYSCALL(syslog);
167 167
168/* kernel/sched/core.c */ 168/* kernel/sched/core.c */
169 169
170/* kernel/signal.c */
171COND_SYSCALL(pidfd_send_signal);
172
173/* kernel/sys.c */ 170/* kernel/sys.c */
174COND_SYSCALL(setregid); 171COND_SYSCALL(setregid);
175COND_SYSCALL(setgid); 172COND_SYSCALL(setgid);
diff --git a/samples/Makefile b/samples/Makefile
index b1142a958811..fadadb1c3b05 100644
--- a/samples/Makefile
+++ b/samples/Makefile
@@ -3,4 +3,4 @@
3obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \ 3obj-$(CONFIG_SAMPLES) += kobject/ kprobes/ trace_events/ livepatch/ \
4 hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \ 4 hw_breakpoint/ kfifo/ kdb/ hidraw/ rpmsg/ seccomp/ \
5 configfs/ connector/ v4l/ trace_printk/ \ 5 configfs/ connector/ v4l/ trace_printk/ \
6 vfio-mdev/ statx/ qmi/ binderfs/ 6 vfio-mdev/ statx/ qmi/ binderfs/ pidfd/
diff --git a/samples/pidfd/Makefile b/samples/pidfd/Makefile
new file mode 100644
index 000000000000..0ff97784177a
--- /dev/null
+++ b/samples/pidfd/Makefile
@@ -0,0 +1,6 @@
1# SPDX-License-Identifier: GPL-2.0
2
3hostprogs-y := pidfd-metadata
4always := $(hostprogs-y)
5HOSTCFLAGS_pidfd-metadata.o += -I$(objtree)/usr/include
6all: pidfd-metadata
diff --git a/samples/pidfd/pidfd-metadata.c b/samples/pidfd/pidfd-metadata.c
new file mode 100644
index 000000000000..640f5f757c57
--- /dev/null
+++ b/samples/pidfd/pidfd-metadata.c
@@ -0,0 +1,112 @@
1// SPDX-License-Identifier: GPL-2.0
2
3#define _GNU_SOURCE
4#include <err.h>
5#include <errno.h>
6#include <fcntl.h>
7#include <inttypes.h>
8#include <limits.h>
9#include <sched.h>
10#include <signal.h>
11#include <stdio.h>
12#include <stdlib.h>
13#include <string.h>
14#include <sys/stat.h>
15#include <sys/syscall.h>
16#include <sys/types.h>
17#include <sys/wait.h>
18#include <unistd.h>
19
20#ifndef CLONE_PIDFD
21#define CLONE_PIDFD 0x00001000
22#endif
23
24static int do_child(void *args)
25{
26 printf("%d\n", getpid());
27 _exit(EXIT_SUCCESS);
28}
29
30static pid_t pidfd_clone(int flags, int *pidfd)
31{
32 size_t stack_size = 1024;
33 char *stack[1024] = { 0 };
34
35#ifdef __ia64__
36 return __clone2(do_child, stack, stack_size, flags | SIGCHLD, NULL, pidfd);
37#else
38 return clone(do_child, stack + stack_size, flags | SIGCHLD, NULL, pidfd);
39#endif
40}
41
42static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
43 unsigned int flags)
44{
45 return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
46}
47
48static int pidfd_metadata_fd(pid_t pid, int pidfd)
49{
50 int procfd, ret;
51 char path[100];
52
53 snprintf(path, sizeof(path), "/proc/%d", pid);
54 procfd = open(path, O_DIRECTORY | O_RDONLY | O_CLOEXEC);
55 if (procfd < 0) {
56 warn("Failed to open %s\n", path);
57 return -1;
58 }
59
60 /*
61 * Verify that the pid has not been recycled and our /proc/<pid> handle
62 * is still valid.
63 */
64 ret = sys_pidfd_send_signal(pidfd, 0, NULL, 0);
65 if (ret < 0) {
66 switch (errno) {
67 case EPERM:
68 /* Process exists, just not allowed to signal it. */
69 break;
70 default:
71 warn("Failed to signal process\n");
72 close(procfd);
73 procfd = -1;
74 }
75 }
76
77 return procfd;
78}
79
80int main(int argc, char *argv[])
81{
82 int pidfd = 0, ret = EXIT_FAILURE;
83 char buf[4096] = { 0 };
84 pid_t pid;
85 int procfd, statusfd;
86 ssize_t bytes;
87
88 pid = pidfd_clone(CLONE_PIDFD, &pidfd);
89 if (pid < 0)
90 exit(ret);
91
92 procfd = pidfd_metadata_fd(pid, pidfd);
93 close(pidfd);
94 if (procfd < 0)
95 goto out;
96
97 statusfd = openat(procfd, "status", O_RDONLY | O_CLOEXEC);
98 close(procfd);
99 if (statusfd < 0)
100 goto out;
101
102 bytes = read(statusfd, buf, sizeof(buf));
103 if (bytes > 0)
104 bytes = write(STDOUT_FILENO, buf, bytes);
105 close(statusfd);
106 ret = EXIT_SUCCESS;
107
108out:
109 (void)wait(NULL);
110
111 exit(ret);
112}