aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Pavel Emelyanov <xemul@parallels.com>, 2014-09-18 11:56:17 -0400
committer: Benjamin LaHaise <bcrl@kvack.org>, 2014-12-13 17:49:50 -0500
commit: e4a0d3e720e7e508749c1439b5ba3aff56c92976 (patch)
tree: b2307b592b1caff8529a4f8d392b54dbd5c5547f
parent: b2776bf7149bddd1f4161f14f79520f17fc1d71d (diff)
aio: Make it possible to remap aio ring
There are actually two issues this patch addresses. Let me start with the one I tried to solve in the beginning.

So, in the checkpoint-restore project (criu) we try to dump tasks' state and restore it exactly as it was. One part of a task's state is the set of rings set up with the io_setup() call. There are (almost) no problems in dumping them; the problem is in restoring them -- if I dump a task with an aio ring originally mapped at address A, I want to restore it at exactly the same address A. Unfortunately, io_setup() does not allow for that -- it mmaps the ring at whatever place the mm finds appropriate (it calls do_mmap_pgoff() with a zero address and without the MAP_FIXED flag).

To make restore possible I'm going to mremap() the freshly created ring to address A (the address at which it was seen before the dump). The problem is that the ring's virtual address is passed back to user space as the context ID, and this ID is then used as the search key by all the other io_foo() calls. Reworking this ID to be just some integer doesn't seem to work, as libaio already uses this value as a pointer through which it accesses memory for aio metadata.

So, to make restore work we need to make sure that

a) the ring is mapped at the desired virtual address
b) kioctx->user_id matches this value

Having said that, the patch makes mremap() on the aio region update the kioctx's user_id and mmap_base values.

Here appears the 2nd issue I mentioned at the beginning of this mail. If (regardless of the C/R dances I do) someone creates an io context with io_setup(), then mremap()-s the ring and then destroys the context, the kill_ioctx() routine will call munmap() on the wrong (old) address. This will result in a) the aio ring remaining in memory and b) some other vma getting unexpectedly unmapped.

What do you think?

Signed-off-by: Pavel Emelyanov <xemul@parallels.com>
Acked-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
-rw-r--r-- fs/aio.c 25
-rw-r--r-- include/linux/fs.h 1
-rw-r--r-- mm/mremap.c 3
3 files changed, 28 insertions, 1 deletions
diff --git a/fs/aio.c b/fs/aio.c
index 14b93159ef83..bfab55607a4d 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -286,12 +286,37 @@ static void aio_free_ring(struct kioctx *ctx)
286 286
287static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) 287static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
288{ 288{
289 vma->vm_flags |= VM_DONTEXPAND;
289 vma->vm_ops = &generic_file_vm_ops; 290 vma->vm_ops = &generic_file_vm_ops;
290 return 0; 291 return 0;
291} 292}
292 293
294static void aio_ring_remap(struct file *file, struct vm_area_struct *vma)
295{
296 struct mm_struct *mm = vma->vm_mm;
297 struct kioctx_table *table;
298 int i;
299
300 spin_lock(&mm->ioctx_lock);
301 rcu_read_lock();
302 table = rcu_dereference(mm->ioctx_table);
303 for (i = 0; i < table->nr; i++) {
304 struct kioctx *ctx;
305
306 ctx = table->table[i];
307 if (ctx && ctx->aio_ring_file == file) {
308 ctx->user_id = ctx->mmap_base = vma->vm_start;
309 break;
310 }
311 }
312
313 rcu_read_unlock();
314 spin_unlock(&mm->ioctx_lock);
315}
316
293static const struct file_operations aio_ring_fops = { 317static const struct file_operations aio_ring_fops = {
294 .mmap = aio_ring_mmap, 318 .mmap = aio_ring_mmap,
319 .mremap = aio_ring_remap,
295}; 320};
296 321
297#if IS_ENABLED(CONFIG_MIGRATION) 322#if IS_ENABLED(CONFIG_MIGRATION)
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 9ab779e8a63c..85f378c55c26 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1497,6 +1497,7 @@ struct file_operations {
1497 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); 1497 long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
1498 long (*compat_ioctl) (struct file *, unsigned int, unsigned long); 1498 long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
1499 int (*mmap) (struct file *, struct vm_area_struct *); 1499 int (*mmap) (struct file *, struct vm_area_struct *);
1500 void (*mremap)(struct file *, struct vm_area_struct *);
1500 int (*open) (struct inode *, struct file *); 1501 int (*open) (struct inode *, struct file *);
1501 int (*flush) (struct file *, fl_owner_t id); 1502 int (*flush) (struct file *, fl_owner_t id);
1502 int (*release) (struct inode *, struct file *); 1503 int (*release) (struct inode *, struct file *);
diff --git a/mm/mremap.c b/mm/mremap.c
index b147f66f4c40..c855922497a3 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -288,7 +288,8 @@ static unsigned long move_vma(struct vm_area_struct *vma,
288 old_len = new_len; 288 old_len = new_len;
289 old_addr = new_addr; 289 old_addr = new_addr;
290 new_addr = -ENOMEM; 290 new_addr = -ENOMEM;
291 } 291 } else if (vma->vm_file && vma->vm_file->f_op->mremap)
292 vma->vm_file->f_op->mremap(vma->vm_file, new_vma);
292 293
293 /* Conceal VM_ACCOUNT so old reservation is not undone */ 294 /* Conceal VM_ACCOUNT so old reservation is not undone */
294 if (vm_flags & VM_ACCOUNT) { 295 if (vm_flags & VM_ACCOUNT) {