aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristopher Yeoh <cyeoh@au1.ibm.com>2011-10-31 20:06:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-10-31 20:30:44 -0400
commitfcf634098c00dd9cd247447368495f0b79be12d1 (patch)
tree77fc98cd461bd52ba3b14e833d54a115ffbbd7bc
parent32ea845d5bafc37b7406bea1aee3005407cb0900 (diff)
Cross Memory Attach
The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - Does not allow for specifying iovecs for both src and dest; even assuming preadv or pwritev were implemented, either the area read from or the area written to would need to be contiguous. - Currently mem_read allows only processes which are currently ptrace'ing the target and are still able to ptrace the target to read from the target. This check could possibly be moved to the open call, but it's not clear exactly what race this restriction is stopping (the reason appears to have been lost) - Having to send the fd of /proc/self/mem via SCM_RIGHTS on a unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other - Doesn't allow for some future uses of the interface that we would like to consider adding (see below) - Interestingly, reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily) As mentioned previously, use of vmsplice instead was considered, but it has problems. Since you need the reader and writer working co-operatively, if the pipe is not drained then you block, which requires some wrapping to do non-blocking on the send side or polling on the receive side. In all-to-all communication it requires ordering, otherwise you can deadlock. 
And in the example of many MPI tasks writing to one MPI task, vmsplice serialises the copying. There are some cases of MPI collectives where even a single copy interface does not get us the performance gain we could achieve. For example, in an MPI_Reduce, rather than copy the data from the source we would like to instead use it directly in a mathop (say the reduce is doing a sum), as this would save us doing a copy. We don't need to keep a copy of the data from the source. I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - e.g. one could specify the math operation and type, and the kernel, rather than just copying the data, would apply the specified operation between the source and destination and store the result in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra-process messaging which is not MPI), this interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so, in addition to this being useful for OpenMPI, it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc; other architectures should mainly (I think) just need to add syscall numbers for process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. 
For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: James Morris <jmorris@namei.org> Cc: <linux-man@vger.kernel.org> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--arch/powerpc/include/asm/systbl.h2
-rw-r--r--arch/powerpc/include/asm/unistd.h4
-rw-r--r--arch/x86/ia32/ia32entry.S2
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--fs/aio.c4
-rw-r--r--fs/compat.c7
-rw-r--r--fs/read_write.c8
-rw-r--r--include/linux/compat.h3
-rw-r--r--include/linux/fs.h7
-rw-r--r--include/linux/syscalls.h13
-rw-r--r--kernel/sys_ni.c4
-rw-r--r--mm/Makefile3
-rw-r--r--mm/process_vm_access.c496
-rw-r--r--security/keys/compat.c2
-rw-r--r--security/keys/keyctl.c2
17 files changed, 550 insertions, 17 deletions
diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h
index fa0d27a400de..559ae1ee6706 100644
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -354,3 +354,5 @@ COMPAT_SYS_SPU(clock_adjtime)
354SYSCALL_SPU(syncfs) 354SYSCALL_SPU(syncfs)
355COMPAT_SYS_SPU(sendmmsg) 355COMPAT_SYS_SPU(sendmmsg)
356SYSCALL_SPU(setns) 356SYSCALL_SPU(setns)
357COMPAT_SYS(process_vm_readv)
358COMPAT_SYS(process_vm_writev)
diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h
index b8b3f599362b..d3d1b5efd7eb 100644
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -373,10 +373,12 @@
373#define __NR_syncfs 348 373#define __NR_syncfs 348
374#define __NR_sendmmsg 349 374#define __NR_sendmmsg 349
375#define __NR_setns 350 375#define __NR_setns 350
376#define __NR_process_vm_readv 351
377#define __NR_process_vm_writev 352
376 378
377#ifdef __KERNEL__ 379#ifdef __KERNEL__
378 380
379#define __NR_syscalls 351 381#define __NR_syscalls 353
380 382
381#define __NR__exit __NR_exit 383#define __NR__exit __NR_exit
382#define NR_syscalls __NR_syscalls 384#define NR_syscalls __NR_syscalls
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 54edb207ff3a..a6253ec1b284 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -850,4 +850,6 @@ ia32_sys_call_table:
850 .quad sys_syncfs 850 .quad sys_syncfs
851 .quad compat_sys_sendmmsg /* 345 */ 851 .quad compat_sys_sendmmsg /* 345 */
852 .quad sys_setns 852 .quad sys_setns
853 .quad compat_sys_process_vm_readv
854 .quad compat_sys_process_vm_writev
853ia32_syscall_end: 855ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 593485b38ab3..599c77d38f33 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -352,10 +352,12 @@
352#define __NR_syncfs 344 352#define __NR_syncfs 344
353#define __NR_sendmmsg 345 353#define __NR_sendmmsg 345
354#define __NR_setns 346 354#define __NR_setns 346
355#define __NR_process_vm_readv 347
356#define __NR_process_vm_writev 348
355 357
356#ifdef __KERNEL__ 358#ifdef __KERNEL__
357 359
358#define NR_syscalls 347 360#define NR_syscalls 349
359 361
360#define __ARCH_WANT_IPC_PARSE_VERSION 362#define __ARCH_WANT_IPC_PARSE_VERSION
361#define __ARCH_WANT_OLD_READDIR 363#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 0a6ba337a2eb..0431f193c3f2 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg)
682__SYSCALL(__NR_setns, sys_setns) 682__SYSCALL(__NR_setns, sys_setns)
683#define __NR_getcpu 309 683#define __NR_getcpu 309
684__SYSCALL(__NR_getcpu, sys_getcpu) 684__SYSCALL(__NR_getcpu, sys_getcpu)
685#define __NR_process_vm_readv 310
686__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv)
687#define __NR_process_vm_writev 311
688__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev)
685 689
686#ifndef __NO_STUBS 690#ifndef __NO_STUBS
687#define __ARCH_WANT_OLD_READDIR 691#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index bc19be332bc9..9a0e31293920 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -346,3 +346,5 @@ ENTRY(sys_call_table)
346 .long sys_syncfs 346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */ 347 .long sys_sendmmsg /* 345 */
348 .long sys_setns 348 .long sys_setns
349 .long sys_process_vm_readv
350 .long sys_process_vm_writev
diff --git a/fs/aio.c b/fs/aio.c
index e29ec485af25..632b235f4fbe 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -1387,13 +1387,13 @@ static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
1387 ret = compat_rw_copy_check_uvector(type, 1387 ret = compat_rw_copy_check_uvector(type,
1388 (struct compat_iovec __user *)kiocb->ki_buf, 1388 (struct compat_iovec __user *)kiocb->ki_buf,
1389 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1389 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1390 &kiocb->ki_iovec); 1390 &kiocb->ki_iovec, 1);
1391 else 1391 else
1392#endif 1392#endif
1393 ret = rw_copy_check_uvector(type, 1393 ret = rw_copy_check_uvector(type,
1394 (struct iovec __user *)kiocb->ki_buf, 1394 (struct iovec __user *)kiocb->ki_buf,
1395 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec, 1395 kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
1396 &kiocb->ki_iovec); 1396 &kiocb->ki_iovec, 1);
1397 if (ret < 0) 1397 if (ret < 0)
1398 goto out; 1398 goto out;
1399 1399
diff --git a/fs/compat.c b/fs/compat.c
index 302e761bd0aa..c98787536bb8 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -546,7 +546,7 @@ out:
546ssize_t compat_rw_copy_check_uvector(int type, 546ssize_t compat_rw_copy_check_uvector(int type,
547 const struct compat_iovec __user *uvector, unsigned long nr_segs, 547 const struct compat_iovec __user *uvector, unsigned long nr_segs,
548 unsigned long fast_segs, struct iovec *fast_pointer, 548 unsigned long fast_segs, struct iovec *fast_pointer,
549 struct iovec **ret_pointer) 549 struct iovec **ret_pointer, int check_access)
550{ 550{
551 compat_ssize_t tot_len; 551 compat_ssize_t tot_len;
552 struct iovec *iov = *ret_pointer = fast_pointer; 552 struct iovec *iov = *ret_pointer = fast_pointer;
@@ -593,7 +593,8 @@ ssize_t compat_rw_copy_check_uvector(int type,
593 } 593 }
594 if (len < 0) /* size_t not fitting in compat_ssize_t .. */ 594 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
595 goto out; 595 goto out;
596 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) { 596 if (check_access &&
597 !access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
597 ret = -EFAULT; 598 ret = -EFAULT;
598 goto out; 599 goto out;
599 } 600 }
@@ -1107,7 +1108,7 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1107 goto out; 1108 goto out;
1108 1109
1109 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs, 1110 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1110 UIO_FASTIOV, iovstack, &iov); 1111 UIO_FASTIOV, iovstack, &iov, 1);
1111 if (tot_len == 0) { 1112 if (tot_len == 0) {
1112 ret = 0; 1113 ret = 0;
1113 goto out; 1114 goto out;
diff --git a/fs/read_write.c b/fs/read_write.c
index dfd125798791..5ad4248b0cd8 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -633,7 +633,8 @@ ssize_t do_loop_readv_writev(struct file *filp, struct iovec *iov,
633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 633ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
634 unsigned long nr_segs, unsigned long fast_segs, 634 unsigned long nr_segs, unsigned long fast_segs,
635 struct iovec *fast_pointer, 635 struct iovec *fast_pointer,
636 struct iovec **ret_pointer) 636 struct iovec **ret_pointer,
637 int check_access)
637{ 638{
638 unsigned long seg; 639 unsigned long seg;
639 ssize_t ret; 640 ssize_t ret;
@@ -689,7 +690,8 @@ ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
689 ret = -EINVAL; 690 ret = -EINVAL;
690 goto out; 691 goto out;
691 } 692 }
692 if (unlikely(!access_ok(vrfy_dir(type), buf, len))) { 693 if (check_access
694 && unlikely(!access_ok(vrfy_dir(type), buf, len))) {
693 ret = -EFAULT; 695 ret = -EFAULT;
694 goto out; 696 goto out;
695 } 697 }
@@ -721,7 +723,7 @@ static ssize_t do_readv_writev(int type, struct file *file,
721 } 723 }
722 724
723 ret = rw_copy_check_uvector(type, uvector, nr_segs, 725 ret = rw_copy_check_uvector(type, uvector, nr_segs,
724 ARRAY_SIZE(iovstack), iovstack, &iov); 726 ARRAY_SIZE(iovstack), iovstack, &iov, 1);
725 if (ret <= 0) 727 if (ret <= 0)
726 goto out; 728 goto out;
727 729
diff --git a/include/linux/compat.h b/include/linux/compat.h
index c6e7523bf765..154bf5683015 100644
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -547,7 +547,8 @@ extern ssize_t compat_rw_copy_check_uvector(int type,
547 const struct compat_iovec __user *uvector, 547 const struct compat_iovec __user *uvector,
548 unsigned long nr_segs, 548 unsigned long nr_segs,
549 unsigned long fast_segs, struct iovec *fast_pointer, 549 unsigned long fast_segs, struct iovec *fast_pointer,
550 struct iovec **ret_pointer); 550 struct iovec **ret_pointer,
551 int check_access);
551 552
552extern void __user *compat_alloc_user_space(unsigned long len); 553extern void __user *compat_alloc_user_space(unsigned long len);
553 554
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 14493a2d5a03..87b4c6b9692d 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1633,9 +1633,10 @@ struct inode_operations {
1633struct seq_file; 1633struct seq_file;
1634 1634
1635ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector, 1635ssize_t rw_copy_check_uvector(int type, const struct iovec __user * uvector,
1636 unsigned long nr_segs, unsigned long fast_segs, 1636 unsigned long nr_segs, unsigned long fast_segs,
1637 struct iovec *fast_pointer, 1637 struct iovec *fast_pointer,
1638 struct iovec **ret_pointer); 1638 struct iovec **ret_pointer,
1639 int check_access);
1639 1640
1640extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *); 1641extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
1641extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *); 1642extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index 1ff0ec2a5e8d..86a24b1166d1 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -844,4 +844,17 @@ asmlinkage long sys_open_by_handle_at(int mountdirfd,
844 struct file_handle __user *handle, 844 struct file_handle __user *handle,
845 int flags); 845 int flags);
846asmlinkage long sys_setns(int fd, int nstype); 846asmlinkage long sys_setns(int fd, int nstype);
847asmlinkage long sys_process_vm_readv(pid_t pid,
848 const struct iovec __user *lvec,
849 unsigned long liovcnt,
850 const struct iovec __user *rvec,
851 unsigned long riovcnt,
852 unsigned long flags);
853asmlinkage long sys_process_vm_writev(pid_t pid,
854 const struct iovec __user *lvec,
855 unsigned long liovcnt,
856 const struct iovec __user *rvec,
857 unsigned long riovcnt,
858 unsigned long flags);
859
847#endif 860#endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index a9a5de07c4f1..47bfa16430d7 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -145,6 +145,10 @@ cond_syscall(sys_io_submit);
145cond_syscall(sys_io_cancel); 145cond_syscall(sys_io_cancel);
146cond_syscall(sys_io_getevents); 146cond_syscall(sys_io_getevents);
147cond_syscall(sys_syslog); 147cond_syscall(sys_syslog);
148cond_syscall(sys_process_vm_readv);
149cond_syscall(sys_process_vm_writev);
150cond_syscall(compat_sys_process_vm_readv);
151cond_syscall(compat_sys_process_vm_writev);
148 152
149/* arch-specific weak syscall entries */ 153/* arch-specific weak syscall entries */
150cond_syscall(sys_pciconfig_read); 154cond_syscall(sys_pciconfig_read);
diff --git a/mm/Makefile b/mm/Makefile
index 836e4163c1bf..50ec00ef2a0e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
5mmu-y := nommu.o 5mmu-y := nommu.o
6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ 6mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ 7 mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
8 vmalloc.o pagewalk.o pgtable-generic.o 8 vmalloc.o pagewalk.o pgtable-generic.o \
9 process_vm_access.o
9 10
10obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ 11obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
11 maccess.o page_alloc.o page-writeback.o \ 12 maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 000000000000..e920aa3ce104
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
1/*
2 * linux/mm/process_vm_access.c
3 *
4 * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/uio.h>
14#include <linux/sched.h>
15#include <linux/highmem.h>
16#include <linux/ptrace.h>
17#include <linux/slab.h>
18#include <linux/syscalls.h>
19
20#ifdef CONFIG_COMPAT
21#include <linux/compat.h>
22#endif
23
24/**
25 * process_vm_rw_pages - read/write pages from task specified
26 * @task: task to read/write from
27 * @mm: mm for task
28 * @process_pages: struct pages area that can store at least
29 * nr_pages_to_copy struct page pointers
30 * @pa: address of page in task to start copying from/to
31 * @start_offset: offset in page to start copying from/to
32 * @len: number of bytes to copy
33 * @lvec: iovec array specifying where to copy to/from
34 * @lvec_cnt: number of elements in iovec array
35 * @lvec_current: index in iovec array we are up to
36 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
37 * @vm_write: 0 means copy from, 1 means copy to
38 * @nr_pages_to_copy: number of pages to copy
39 * @bytes_copied: returns number of bytes successfully copied
40 * Returns 0 on success, error code otherwise
41 */
42static int process_vm_rw_pages(struct task_struct *task,
43 struct mm_struct *mm,
44 struct page **process_pages,
45 unsigned long pa,
46 unsigned long start_offset,
47 unsigned long len,
48 const struct iovec *lvec,
49 unsigned long lvec_cnt,
50 unsigned long *lvec_current,
51 size_t *lvec_offset,
52 int vm_write,
53 unsigned int nr_pages_to_copy,
54 ssize_t *bytes_copied)
55{
56 int pages_pinned;
57 void *target_kaddr;
58 int pgs_copied = 0;
59 int j;
60 int ret;
61 ssize_t bytes_to_copy;
62 ssize_t rc = 0;
63
64 *bytes_copied = 0;
65
66 /* Get the pages we're interested in */
67 down_read(&mm->mmap_sem);
68 pages_pinned = get_user_pages(task, mm, pa,
69 nr_pages_to_copy,
70 vm_write, 0, process_pages, NULL);
71 up_read(&mm->mmap_sem);
72
73 if (pages_pinned != nr_pages_to_copy) {
74 rc = -EFAULT;
75 goto end;
76 }
77
78 /* Do the copy for each page */
79 for (pgs_copied = 0;
80 (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
81 pgs_copied++) {
82 /* Make sure we have a non zero length iovec */
83 while (*lvec_current < lvec_cnt
84 && lvec[*lvec_current].iov_len == 0)
85 (*lvec_current)++;
86 if (*lvec_current == lvec_cnt)
87 break;
88
89 /*
90 * Will copy smallest of:
91 * - bytes remaining in page
92 * - bytes remaining in destination iovec
93 */
94 bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
95 len - *bytes_copied);
96 bytes_to_copy = min_t(ssize_t, bytes_to_copy,
97 lvec[*lvec_current].iov_len
98 - *lvec_offset);
99
100 target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
101
102 if (vm_write)
103 ret = copy_from_user(target_kaddr,
104 lvec[*lvec_current].iov_base
105 + *lvec_offset,
106 bytes_to_copy);
107 else
108 ret = copy_to_user(lvec[*lvec_current].iov_base
109 + *lvec_offset,
110 target_kaddr, bytes_to_copy);
111 kunmap(process_pages[pgs_copied]);
112 if (ret) {
113 *bytes_copied += bytes_to_copy - ret;
114 pgs_copied++;
115 rc = -EFAULT;
116 goto end;
117 }
118 *bytes_copied += bytes_to_copy;
119 *lvec_offset += bytes_to_copy;
120 if (*lvec_offset == lvec[*lvec_current].iov_len) {
121 /*
122 * Need to copy remaining part of page into the
123 * next iovec if there are any bytes left in page
124 */
125 (*lvec_current)++;
126 *lvec_offset = 0;
127 start_offset = (start_offset + bytes_to_copy)
128 % PAGE_SIZE;
129 if (start_offset)
130 pgs_copied--;
131 } else {
132 start_offset = 0;
133 }
134 }
135
136end:
137 if (vm_write) {
138 for (j = 0; j < pages_pinned; j++) {
139 if (j < pgs_copied)
140 set_page_dirty_lock(process_pages[j]);
141 put_page(process_pages[j]);
142 }
143 } else {
144 for (j = 0; j < pages_pinned; j++)
145 put_page(process_pages[j]);
146 }
147
148 return rc;
149}
150
151/* Maximum number of pages kmalloc'd to hold struct page's during copy */
152#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
153
154/**
155 * process_vm_rw_single_vec - read/write pages from task specified
156 * @addr: start memory address of target process
157 * @len: size of area to copy to/from
158 * @lvec: iovec array specifying where to copy to/from locally
159 * @lvec_cnt: number of elements in iovec array
160 * @lvec_current: index in iovec array we are up to
161 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
162 * @process_pages: struct pages area that can store at least
163 * nr_pages_to_copy struct page pointers
164 * @mm: mm for task
165 * @task: task to read/write from
166 * @vm_write: 0 means copy from, 1 means copy to
167 * @bytes_copied: returns number of bytes successfully copied
168 * Returns 0 on success or on failure error code
169 */
170static int process_vm_rw_single_vec(unsigned long addr,
171 unsigned long len,
172 const struct iovec *lvec,
173 unsigned long lvec_cnt,
174 unsigned long *lvec_current,
175 size_t *lvec_offset,
176 struct page **process_pages,
177 struct mm_struct *mm,
178 struct task_struct *task,
179 int vm_write,
180 ssize_t *bytes_copied)
181{
182 unsigned long pa = addr & PAGE_MASK;
183 unsigned long start_offset = addr - pa;
184 unsigned long nr_pages;
185 ssize_t bytes_copied_loop;
186 ssize_t rc = 0;
187 unsigned long nr_pages_copied = 0;
188 unsigned long nr_pages_to_copy;
189 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
190 / sizeof(struct pages *);
191
192 *bytes_copied = 0;
193
194 /* Work out address and page range required */
195 if (len == 0)
196 return 0;
197 nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
198
199 while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
200 nr_pages_to_copy = min(nr_pages - nr_pages_copied,
201 max_pages_per_loop);
202
203 rc = process_vm_rw_pages(task, mm, process_pages, pa,
204 start_offset, len,
205 lvec, lvec_cnt,
206 lvec_current, lvec_offset,
207 vm_write, nr_pages_to_copy,
208 &bytes_copied_loop);
209 start_offset = 0;
210 *bytes_copied += bytes_copied_loop;
211
212 if (rc < 0) {
213 return rc;
214 } else {
215 len -= bytes_copied_loop;
216 nr_pages_copied += nr_pages_to_copy;
217 pa += nr_pages_to_copy * PAGE_SIZE;
218 }
219 }
220
221 return rc;
222}
223
224/* Maximum number of entries for process pages array
225 which lives on stack */
226#define PVM_MAX_PP_ARRAY_COUNT 16
227
228/**
229 * process_vm_rw_core - core of reading/writing pages from task specified
230 * @pid: PID of process to read/write from/to
231 * @lvec: iovec array specifying where to copy to/from locally
232 * @liovcnt: size of lvec array
233 * @rvec: iovec array specifying where to copy to/from in the other process
234 * @riovcnt: size of rvec array
235 * @flags: currently unused
236 * @vm_write: 0 if reading from other process, 1 if writing to other process
237 * Returns the number of bytes read/written or error code. May
238 * return less bytes than expected if an error occurs during the copying
239 * process.
240 */
241static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
242 unsigned long liovcnt,
243 const struct iovec *rvec,
244 unsigned long riovcnt,
245 unsigned long flags, int vm_write)
246{
247 struct task_struct *task;
248 struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
249 struct page **process_pages = pp_stack;
250 struct mm_struct *mm;
251 unsigned long i;
252 ssize_t rc = 0;
253 ssize_t bytes_copied_loop;
254 ssize_t bytes_copied = 0;
255 unsigned long nr_pages = 0;
256 unsigned long nr_pages_iov;
257 unsigned long iov_l_curr_idx = 0;
258 size_t iov_l_curr_offset = 0;
259 ssize_t iov_len;
260
261 /*
262 * Work out how many pages of struct pages we're going to need
263 * when eventually calling get_user_pages
264 */
265 for (i = 0; i < riovcnt; i++) {
266 iov_len = rvec[i].iov_len;
267 if (iov_len > 0) {
268 nr_pages_iov = ((unsigned long)rvec[i].iov_base
269 + iov_len)
270 / PAGE_SIZE - (unsigned long)rvec[i].iov_base
271 / PAGE_SIZE + 1;
272 nr_pages = max(nr_pages, nr_pages_iov);
273 }
274 }
275
276 if (nr_pages == 0)
277 return 0;
278
279 if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
280 /* For reliability don't try to kmalloc more than
281 2 pages worth */
282 process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
283 sizeof(struct pages *)*nr_pages),
284 GFP_KERNEL);
285
286 if (!process_pages)
287 return -ENOMEM;
288 }
289
290 /* Get process information */
291 rcu_read_lock();
292 task = find_task_by_vpid(pid);
293 if (task)
294 get_task_struct(task);
295 rcu_read_unlock();
296 if (!task) {
297 rc = -ESRCH;
298 goto free_proc_pages;
299 }
300
301 task_lock(task);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
303 task_unlock(task);
304 rc = -EPERM;
305 goto put_task_struct;
306 }
307 mm = task->mm;
308
309 if (!mm || (task->flags & PF_KTHREAD)) {
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct;
313 }
314
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
321 lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
322 process_pages, mm, task, vm_write, &bytes_copied_loop);
323 bytes_copied += bytes_copied_loop;
324 if (rc != 0) {
325 /* If we have managed to copy any data at all then
326 we return the number of bytes copied. Otherwise
327 we return the error code */
328 if (bytes_copied)
329 rc = bytes_copied;
330 goto put_mm;
331 }
332 }
333
334 rc = bytes_copied;
335put_mm:
336 mmput(mm);
337
338put_task_struct:
339 put_task_struct(task);
340
341free_proc_pages:
342 if (process_pages != pp_stack)
343 kfree(process_pages);
344 return rc;
345}
346
347/**
348 * process_vm_rw - check iovecs before calling core routine
349 * @pid: PID of process to read/write from/to
350 * @lvec: iovec array specifying where to copy to/from locally
351 * @liovcnt: size of lvec array
352 * @rvec: iovec array specifying where to copy to/from in the other process
353 * @riovcnt: size of rvec array
354 * @flags: currently unused
355 * @vm_write: 0 if reading from other process, 1 if writing to other process
356 * Returns the number of bytes read/written or error code. May
357 * return less bytes than expected if an error occurs during the copying
358 * process.
359 */
360static ssize_t process_vm_rw(pid_t pid,
361 const struct iovec __user *lvec,
362 unsigned long liovcnt,
363 const struct iovec __user *rvec,
364 unsigned long riovcnt,
365 unsigned long flags, int vm_write)
366{
367 struct iovec iovstack_l[UIO_FASTIOV];
368 struct iovec iovstack_r[UIO_FASTIOV];
369 struct iovec *iov_l = iovstack_l;
370 struct iovec *iov_r = iovstack_r;
371 ssize_t rc;
372
373 if (flags != 0)
374 return -EINVAL;
375
376 /* Check iovecs */
377 if (vm_write)
378 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
379 iovstack_l, &iov_l, 1);
380 else
381 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
382 iovstack_l, &iov_l, 1);
383 if (rc <= 0)
384 goto free_iovecs;
385
386 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
387 iovstack_r, &iov_r, 0);
388 if (rc <= 0)
389 goto free_iovecs;
390
391 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
392 vm_write);
393
394free_iovecs:
395 if (iov_r != iovstack_r)
396 kfree(iov_r);
397 if (iov_l != iovstack_l)
398 kfree(iov_l);
399
400 return rc;
401}
402
403SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
404 unsigned long, liovcnt, const struct iovec __user *, rvec,
405 unsigned long, riovcnt, unsigned long, flags)
406{
407 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
408}
409
410SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
411 const struct iovec __user *, lvec,
412 unsigned long, liovcnt, const struct iovec __user *, rvec,
413 unsigned long, riovcnt, unsigned long, flags)
414{
415 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
416}
417
418#ifdef CONFIG_COMPAT
419
420asmlinkage ssize_t
421compat_process_vm_rw(compat_pid_t pid,
422 const struct compat_iovec __user *lvec,
423 unsigned long liovcnt,
424 const struct compat_iovec __user *rvec,
425 unsigned long riovcnt,
426 unsigned long flags, int vm_write)
427{
428 struct iovec iovstack_l[UIO_FASTIOV];
429 struct iovec iovstack_r[UIO_FASTIOV];
430 struct iovec *iov_l = iovstack_l;
431 struct iovec *iov_r = iovstack_r;
432 ssize_t rc = -EFAULT;
433
434 if (flags != 0)
435 return -EINVAL;
436
437 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
438 goto out;
439
440 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
441 goto out;
442
443 if (vm_write)
444 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
445 UIO_FASTIOV, iovstack_l,
446 &iov_l, 1);
447 else
448 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
449 UIO_FASTIOV, iovstack_l,
450 &iov_l, 1);
451 if (rc <= 0)
452 goto free_iovecs;
453 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
454 UIO_FASTIOV, iovstack_r,
455 &iov_r, 0);
456 if (rc <= 0)
457 goto free_iovecs;
458
459 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
460 vm_write);
461
462free_iovecs:
463 if (iov_r != iovstack_r)
464 kfree(iov_r);
465 if (iov_l != iovstack_l)
466 kfree(iov_l);
467
468out:
469 return rc;
470}
471
472asmlinkage ssize_t
473compat_sys_process_vm_readv(compat_pid_t pid,
474 const struct compat_iovec __user *lvec,
475 unsigned long liovcnt,
476 const struct compat_iovec __user *rvec,
477 unsigned long riovcnt,
478 unsigned long flags)
479{
480 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
481 riovcnt, flags, 0);
482}
483
484asmlinkage ssize_t
485compat_sys_process_vm_writev(compat_pid_t pid,
486 const struct compat_iovec __user *lvec,
487 unsigned long liovcnt,
488 const struct compat_iovec __user *rvec,
489 unsigned long riovcnt,
490 unsigned long flags)
491{
492 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
493 riovcnt, flags, 1);
494}
495
496#endif
diff --git a/security/keys/compat.c b/security/keys/compat.c
index 338b510e9027..4c48e13448f8 100644
--- a/security/keys/compat.c
+++ b/security/keys/compat.c
@@ -38,7 +38,7 @@ long compat_keyctl_instantiate_key_iov(
38 38
39 ret = compat_rw_copy_check_uvector(WRITE, _payload_iov, ioc, 39 ret = compat_rw_copy_check_uvector(WRITE, _payload_iov, ioc,
40 ARRAY_SIZE(iovstack), 40 ARRAY_SIZE(iovstack),
41 iovstack, &iov); 41 iovstack, &iov, 1);
42 if (ret < 0) 42 if (ret < 0)
43 return ret; 43 return ret;
44 if (ret == 0) 44 if (ret == 0)
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index eca51918c951..0b3f5d72af1c 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -1065,7 +1065,7 @@ long keyctl_instantiate_key_iov(key_serial_t id,
1065 goto no_payload; 1065 goto no_payload;
1066 1066
1067 ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc, 1067 ret = rw_copy_check_uvector(WRITE, _payload_iov, ioc,
1068 ARRAY_SIZE(iovstack), iovstack, &iov); 1068 ARRAY_SIZE(iovstack), iovstack, &iov, 1);
1069 if (ret < 0) 1069 if (ret < 0)
1070 return ret; 1070 return ret;
1071 if (ret == 0) 1071 if (ret == 0)