aboutsummaryrefslogtreecommitdiffstats
path: root/mm/process_vm_access.c
diff options
context:
space:
mode:
authorChristopher Yeoh <cyeoh@au1.ibm.com>2011-10-31 20:06:39 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2011-10-31 20:30:44 -0400
commitfcf634098c00dd9cd247447368495f0b79be12d1 (patch)
tree77fc98cd461bd52ba3b14e833d54a115ffbbd7bc /mm/process_vm_access.c
parent32ea845d5bafc37b7406bea1aee3005407cb0900 (diff)
Cross Memory Attach
The basic idea behind cross memory attach is to allow MPI programs doing intra-node communication to do a single copy of the message rather than a double copy of the message via shared memory. The following patch attempts to achieve this by allowing a destination process, given an address and size from a source process, to copy memory directly from the source process into its own address space via a system call. There is also a symmetrical ability to copy from the current process's address space into a destination process's address space. - Use of /proc/pid/mem has been considered, but there are issues with using it: - Does not allow for specifying iovecs for both src and dest, assuming preadv or pwritev was implemented either the area read from or written to would need to be contiguous. - Currently mem_read allows only processes who are currently ptrace'ing the target and are still able to ptrace the target to read from the target. This check could possibly be moved to the open call, but its not clear exactly what race this restriction is stopping (reason appears to have been lost) - Having to send the fd of /proc/self/mem via SCM_RIGHTS on unix domain socket is a bit ugly from a userspace point of view, especially when you may have hundreds if not (eventually) thousands of processes that all need to do this with each other - Doesn't allow for some future use of the interface we would like to consider adding in the future (see below) - Interestingly reading from /proc/pid/mem currently actually involves two copies! (But this could be fixed pretty easily) As mentioned previously use of vmsplice instead was considered, but has problems. Since you need the reader and writer working co-operatively if the pipe is not drained then you block. Which requires some wrapping to do non blocking on the send side or polling on the receive. In all to all communication it requires ordering otherwise you can deadlock. And in the example of many MPI tasks writing to one MPI task vmsplice serialises the copying. There are some cases of MPI collectives where even a single copy interface does not get us the performance gain we could. For example in an MPI_Reduce rather than copy the data from the source we would like to instead use it directly in a mathops (say the reduce is doing a sum) as this would save us doing a copy. We don't need to keep a copy of the data from the source. I haven't implemented this, but I think this interface could in the future do all this through the use of the flags - eg could specify the math operation and type and the kernel rather than just copying the data would apply the specified operation between the source and destination and store it in the destination. Although we don't have a "second user" of the interface (though I've had some nibbles from people who may be interested in using it for intra process messaging which is not MPI). This interface is something which hardware vendors are already doing for their custom drivers to implement fast local communication. And so in addition to this being useful for OpenMPI it would mean the driver maintainers don't have to fix things up when the mm changes. There was some discussion about how much faster a true zero copy would go. Here's a link back to the email with some testing I did on that: http://marc.info/?l=linux-mm&m=130105930902915&w=2 There is a basic man page for the proposed interface here: http://ozlabs.org/~cyeoh/cma/process_vm_readv.txt This has been implemented for x86 and powerpc, other architecture should mainly (I think) just need to add syscall numbers for the process_vm_readv and process_vm_writev. There are 32 bit compatibility versions for 64-bit kernels. For arch maintainers there are some simple tests to be able to quickly verify that the syscalls are working correctly here: http://ozlabs.org/~cyeoh/cma/cma-test-20110718.tgz Signed-off-by: Chris Yeoh <yeohc@au1.ibm.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Arnd Bergmann <arnd@arndb.de> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: James Morris <jmorris@namei.org> Cc: <linux-man@vger.kernel.org> Cc: <linux-arch@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/process_vm_access.c')
-rw-r--r--mm/process_vm_access.c496
1 files changed, 496 insertions, 0 deletions
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 00000000000..e920aa3ce10
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
1/*
2 * linux/mm/process_vm_access.c
3 *
4 * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/uio.h>
14#include <linux/sched.h>
15#include <linux/highmem.h>
16#include <linux/ptrace.h>
17#include <linux/slab.h>
18#include <linux/syscalls.h>
19
20#ifdef CONFIG_COMPAT
21#include <linux/compat.h>
22#endif
23
24/**
25 * process_vm_rw_pages - read/write pages from task specified
26 * @task: task to read/write from
27 * @mm: mm for task
28 * @process_pages: struct pages area that can store at least
29 * nr_pages_to_copy struct page pointers
30 * @pa: address of page in task to start copying from/to
31 * @start_offset: offset in page to start copying from/to
32 * @len: number of bytes to copy
33 * @lvec: iovec array specifying where to copy to/from
34 * @lvec_cnt: number of elements in iovec array
35 * @lvec_current: index in iovec array we are up to
36 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
37 * @vm_write: 0 means copy from, 1 means copy to
38 * @nr_pages_to_copy: number of pages to copy
39 * @bytes_copied: returns number of bytes successfully copied
40 * Returns 0 on success, error code otherwise
41 */
42static int process_vm_rw_pages(struct task_struct *task,
43 struct mm_struct *mm,
44 struct page **process_pages,
45 unsigned long pa,
46 unsigned long start_offset,
47 unsigned long len,
48 const struct iovec *lvec,
49 unsigned long lvec_cnt,
50 unsigned long *lvec_current,
51 size_t *lvec_offset,
52 int vm_write,
53 unsigned int nr_pages_to_copy,
54 ssize_t *bytes_copied)
55{
56 int pages_pinned;
57 void *target_kaddr;
58 int pgs_copied = 0;
59 int j;
60 int ret;
61 ssize_t bytes_to_copy;
62 ssize_t rc = 0;
63
64 *bytes_copied = 0;
65
66 /* Get the pages we're interested in */
67 down_read(&mm->mmap_sem);
68 pages_pinned = get_user_pages(task, mm, pa,
69 nr_pages_to_copy,
70 vm_write, 0, process_pages, NULL);
71 up_read(&mm->mmap_sem);
72
73 if (pages_pinned != nr_pages_to_copy) {
74 rc = -EFAULT;
75 goto end;
76 }
77
78 /* Do the copy for each page */
79 for (pgs_copied = 0;
80 (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
81 pgs_copied++) {
82 /* Make sure we have a non zero length iovec */
83 while (*lvec_current < lvec_cnt
84 && lvec[*lvec_current].iov_len == 0)
85 (*lvec_current)++;
86 if (*lvec_current == lvec_cnt)
87 break;
88
89 /*
90 * Will copy smallest of:
91 * - bytes remaining in page
92 * - bytes remaining in destination iovec
93 */
94 bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
95 len - *bytes_copied);
96 bytes_to_copy = min_t(ssize_t, bytes_to_copy,
97 lvec[*lvec_current].iov_len
98 - *lvec_offset);
99
100 target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
101
102 if (vm_write)
103 ret = copy_from_user(target_kaddr,
104 lvec[*lvec_current].iov_base
105 + *lvec_offset,
106 bytes_to_copy);
107 else
108 ret = copy_to_user(lvec[*lvec_current].iov_base
109 + *lvec_offset,
110 target_kaddr, bytes_to_copy);
111 kunmap(process_pages[pgs_copied]);
112 if (ret) {
113 *bytes_copied += bytes_to_copy - ret;
114 pgs_copied++;
115 rc = -EFAULT;
116 goto end;
117 }
118 *bytes_copied += bytes_to_copy;
119 *lvec_offset += bytes_to_copy;
120 if (*lvec_offset == lvec[*lvec_current].iov_len) {
121 /*
122 * Need to copy remaining part of page into the
123 * next iovec if there are any bytes left in page
124 */
125 (*lvec_current)++;
126 *lvec_offset = 0;
127 start_offset = (start_offset + bytes_to_copy)
128 % PAGE_SIZE;
129 if (start_offset)
130 pgs_copied--;
131 } else {
132 start_offset = 0;
133 }
134 }
135
136end:
137 if (vm_write) {
138 for (j = 0; j < pages_pinned; j++) {
139 if (j < pgs_copied)
140 set_page_dirty_lock(process_pages[j]);
141 put_page(process_pages[j]);
142 }
143 } else {
144 for (j = 0; j < pages_pinned; j++)
145 put_page(process_pages[j]);
146 }
147
148 return rc;
149}
150
151/* Maximum number of pages kmalloc'd to hold struct page's during copy */
152#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
153
154/**
155 * process_vm_rw_single_vec - read/write pages from task specified
156 * @addr: start memory address of target process
157 * @len: size of area to copy to/from
158 * @lvec: iovec array specifying where to copy to/from locally
159 * @lvec_cnt: number of elements in iovec array
160 * @lvec_current: index in iovec array we are up to
161 * @lvec_offset: offset in bytes from current iovec iov_base we are up to
162 * @process_pages: struct pages area that can store at least
163 * nr_pages_to_copy struct page pointers
164 * @mm: mm for task
165 * @task: task to read/write from
166 * @vm_write: 0 means copy from, 1 means copy to
167 * @bytes_copied: returns number of bytes successfully copied
168 * Returns 0 on success or on failure error code
169 */
170static int process_vm_rw_single_vec(unsigned long addr,
171 unsigned long len,
172 const struct iovec *lvec,
173 unsigned long lvec_cnt,
174 unsigned long *lvec_current,
175 size_t *lvec_offset,
176 struct page **process_pages,
177 struct mm_struct *mm,
178 struct task_struct *task,
179 int vm_write,
180 ssize_t *bytes_copied)
181{
182 unsigned long pa = addr & PAGE_MASK;
183 unsigned long start_offset = addr - pa;
184 unsigned long nr_pages;
185 ssize_t bytes_copied_loop;
186 ssize_t rc = 0;
187 unsigned long nr_pages_copied = 0;
188 unsigned long nr_pages_to_copy;
189 unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
190 / sizeof(struct pages *);
191
192 *bytes_copied = 0;
193
194 /* Work out address and page range required */
195 if (len == 0)
196 return 0;
197 nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
198
199 while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
200 nr_pages_to_copy = min(nr_pages - nr_pages_copied,
201 max_pages_per_loop);
202
203 rc = process_vm_rw_pages(task, mm, process_pages, pa,
204 start_offset, len,
205 lvec, lvec_cnt,
206 lvec_current, lvec_offset,
207 vm_write, nr_pages_to_copy,
208 &bytes_copied_loop);
209 start_offset = 0;
210 *bytes_copied += bytes_copied_loop;
211
212 if (rc < 0) {
213 return rc;
214 } else {
215 len -= bytes_copied_loop;
216 nr_pages_copied += nr_pages_to_copy;
217 pa += nr_pages_to_copy * PAGE_SIZE;
218 }
219 }
220
221 return rc;
222}
223
224/* Maximum number of entries for process pages array
225 which lives on stack */
226#define PVM_MAX_PP_ARRAY_COUNT 16
227
228/**
229 * process_vm_rw_core - core of reading/writing pages from task specified
230 * @pid: PID of process to read/write from/to
231 * @lvec: iovec array specifying where to copy to/from locally
232 * @liovcnt: size of lvec array
233 * @rvec: iovec array specifying where to copy to/from in the other process
234 * @riovcnt: size of rvec array
235 * @flags: currently unused
236 * @vm_write: 0 if reading from other process, 1 if writing to other process
237 * Returns the number of bytes read/written or error code. May
238 * return less bytes than expected if an error occurs during the copying
239 * process.
240 */
241static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
242 unsigned long liovcnt,
243 const struct iovec *rvec,
244 unsigned long riovcnt,
245 unsigned long flags, int vm_write)
246{
247 struct task_struct *task;
248 struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
249 struct page **process_pages = pp_stack;
250 struct mm_struct *mm;
251 unsigned long i;
252 ssize_t rc = 0;
253 ssize_t bytes_copied_loop;
254 ssize_t bytes_copied = 0;
255 unsigned long nr_pages = 0;
256 unsigned long nr_pages_iov;
257 unsigned long iov_l_curr_idx = 0;
258 size_t iov_l_curr_offset = 0;
259 ssize_t iov_len;
260
261 /*
262 * Work out how many pages of struct pages we're going to need
263 * when eventually calling get_user_pages
264 */
265 for (i = 0; i < riovcnt; i++) {
266 iov_len = rvec[i].iov_len;
267 if (iov_len > 0) {
268 nr_pages_iov = ((unsigned long)rvec[i].iov_base
269 + iov_len)
270 / PAGE_SIZE - (unsigned long)rvec[i].iov_base
271 / PAGE_SIZE + 1;
272 nr_pages = max(nr_pages, nr_pages_iov);
273 }
274 }
275
276 if (nr_pages == 0)
277 return 0;
278
279 if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
280 /* For reliability don't try to kmalloc more than
281 2 pages worth */
282 process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
283 sizeof(struct pages *)*nr_pages),
284 GFP_KERNEL);
285
286 if (!process_pages)
287 return -ENOMEM;
288 }
289
290 /* Get process information */
291 rcu_read_lock();
292 task = find_task_by_vpid(pid);
293 if (task)
294 get_task_struct(task);
295 rcu_read_unlock();
296 if (!task) {
297 rc = -ESRCH;
298 goto free_proc_pages;
299 }
300
301 task_lock(task);
302 if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
303 task_unlock(task);
304 rc = -EPERM;
305 goto put_task_struct;
306 }
307 mm = task->mm;
308
309 if (!mm || (task->flags & PF_KTHREAD)) {
310 task_unlock(task);
311 rc = -EINVAL;
312 goto put_task_struct;
313 }
314
315 atomic_inc(&mm->mm_users);
316 task_unlock(task);
317
318 for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
319 rc = process_vm_rw_single_vec(
320 (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
321 lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
322 process_pages, mm, task, vm_write, &bytes_copied_loop);
323 bytes_copied += bytes_copied_loop;
324 if (rc != 0) {
325 /* If we have managed to copy any data at all then
326 we return the number of bytes copied. Otherwise
327 we return the error code */
328 if (bytes_copied)
329 rc = bytes_copied;
330 goto put_mm;
331 }
332 }
333
334 rc = bytes_copied;
335put_mm:
336 mmput(mm);
337
338put_task_struct:
339 put_task_struct(task);
340
341free_proc_pages:
342 if (process_pages != pp_stack)
343 kfree(process_pages);
344 return rc;
345}
346
347/**
348 * process_vm_rw - check iovecs before calling core routine
349 * @pid: PID of process to read/write from/to
350 * @lvec: iovec array specifying where to copy to/from locally
351 * @liovcnt: size of lvec array
352 * @rvec: iovec array specifying where to copy to/from in the other process
353 * @riovcnt: size of rvec array
354 * @flags: currently unused
355 * @vm_write: 0 if reading from other process, 1 if writing to other process
356 * Returns the number of bytes read/written or error code. May
357 * return less bytes than expected if an error occurs during the copying
358 * process.
359 */
360static ssize_t process_vm_rw(pid_t pid,
361 const struct iovec __user *lvec,
362 unsigned long liovcnt,
363 const struct iovec __user *rvec,
364 unsigned long riovcnt,
365 unsigned long flags, int vm_write)
366{
367 struct iovec iovstack_l[UIO_FASTIOV];
368 struct iovec iovstack_r[UIO_FASTIOV];
369 struct iovec *iov_l = iovstack_l;
370 struct iovec *iov_r = iovstack_r;
371 ssize_t rc;
372
373 if (flags != 0)
374 return -EINVAL;
375
376 /* Check iovecs */
377 if (vm_write)
378 rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
379 iovstack_l, &iov_l, 1);
380 else
381 rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
382 iovstack_l, &iov_l, 1);
383 if (rc <= 0)
384 goto free_iovecs;
385
386 rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
387 iovstack_r, &iov_r, 0);
388 if (rc <= 0)
389 goto free_iovecs;
390
391 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
392 vm_write);
393
394free_iovecs:
395 if (iov_r != iovstack_r)
396 kfree(iov_r);
397 if (iov_l != iovstack_l)
398 kfree(iov_l);
399
400 return rc;
401}
402
403SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
404 unsigned long, liovcnt, const struct iovec __user *, rvec,
405 unsigned long, riovcnt, unsigned long, flags)
406{
407 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
408}
409
410SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
411 const struct iovec __user *, lvec,
412 unsigned long, liovcnt, const struct iovec __user *, rvec,
413 unsigned long, riovcnt, unsigned long, flags)
414{
415 return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
416}
417
418#ifdef CONFIG_COMPAT
419
420asmlinkage ssize_t
421compat_process_vm_rw(compat_pid_t pid,
422 const struct compat_iovec __user *lvec,
423 unsigned long liovcnt,
424 const struct compat_iovec __user *rvec,
425 unsigned long riovcnt,
426 unsigned long flags, int vm_write)
427{
428 struct iovec iovstack_l[UIO_FASTIOV];
429 struct iovec iovstack_r[UIO_FASTIOV];
430 struct iovec *iov_l = iovstack_l;
431 struct iovec *iov_r = iovstack_r;
432 ssize_t rc = -EFAULT;
433
434 if (flags != 0)
435 return -EINVAL;
436
437 if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
438 goto out;
439
440 if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
441 goto out;
442
443 if (vm_write)
444 rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
445 UIO_FASTIOV, iovstack_l,
446 &iov_l, 1);
447 else
448 rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
449 UIO_FASTIOV, iovstack_l,
450 &iov_l, 1);
451 if (rc <= 0)
452 goto free_iovecs;
453 rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
454 UIO_FASTIOV, iovstack_r,
455 &iov_r, 0);
456 if (rc <= 0)
457 goto free_iovecs;
458
459 rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
460 vm_write);
461
462free_iovecs:
463 if (iov_r != iovstack_r)
464 kfree(iov_r);
465 if (iov_l != iovstack_l)
466 kfree(iov_l);
467
468out:
469 return rc;
470}
471
472asmlinkage ssize_t
473compat_sys_process_vm_readv(compat_pid_t pid,
474 const struct compat_iovec __user *lvec,
475 unsigned long liovcnt,
476 const struct compat_iovec __user *rvec,
477 unsigned long riovcnt,
478 unsigned long flags)
479{
480 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
481 riovcnt, flags, 0);
482}
483
484asmlinkage ssize_t
485compat_sys_process_vm_writev(compat_pid_t pid,
486 const struct compat_iovec __user *lvec,
487 unsigned long liovcnt,
488 const struct compat_iovec __user *rvec,
489 unsigned long riovcnt,
490 unsigned long flags)
491{
492 return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
493 riovcnt, flags, 1);
494}
495
496#endif