author		Gu Zheng <guz.fnst@cn.fujitsu.com>	2013-07-16 05:56:16 -0400
committer	Benjamin LaHaise <bcrl@kvack.org>	2013-07-16 09:32:18 -0400
commit		36bc08cc01709b4a9bb563b35aa530241ddc63e3
tree		dc441c213b61e83416f77400cb0492b09d2dbf7a
parent		55708698c5f153f4e390175cdfc395333b2eafbd
fs/aio: Add support to aio ring pages migration
Because an aio job pins its ring pages in memory, migration of those pages fails. To fix this, use an anon inode to manage the aio ring pages and install a migratepage callback in the anon inode's address space operations, so that during memory migration the aio ring pages are moved safely to another memory node.

Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
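[Editor's note] For context, the ring being made migratable here is the one io_setup(2) creates and maps into the caller's address space for the lifetime of the context. A minimal userspace sketch of that lifecycle (illustrative only, not part of the patch; raw syscalls are used since glibc does not wrap the native aio calls):

/*
 * Illustrative only: io_setup() makes the kernel allocate the aio ring
 * and map it into this process. Before this patch, those pages stayed
 * pinned to their original memory node until io_destroy().
 */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	aio_context_t ctx = 0;

	if (syscall(SYS_io_setup, 128, &ctx) < 0) {	/* ring sized for 128 events */
		perror("io_setup");
		return 1;
	}
	/* ... io_submit()/io_getevents() traffic goes through the mapped ring ... */
	syscall(SYS_io_destroy, ctx);			/* tears down the ring */
	return 0;
}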
Diffstat (limited to 'fs')
 fs/aio.c | 119 ++++++++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 108 insertions(+), 11 deletions(-)
diff --git a/fs/aio.c b/fs/aio.c
index 9b5ca1137419..cbd0afe77273 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -35,6 +35,9 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -110,6 +113,7 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
 };
 
 /*------ sysctl variables----*/
@@ -138,15 +142,78 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;
 
-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+			 page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}
 
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n",
+			 current->pid, aio_ring_file->f_inode->i_nlink,
+			 aio_ring_file->f_path.dentry->d_count,
+			 d_unhashed(aio_ring_file->f_path.dentry),
+			 atomic_read(&aio_ring_file->f_inode->i_count));
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
 }
 
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
+}
+
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			   struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty	= aio_set_page_dirty,
+	.migratepage	= aio_migratepage,
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 
+	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
 
-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
+	up_write(&mm->mmap_sem);
+
+	mm_populate(ctx->mmap_base, populate);
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
-	up_write(&mm->mmap_sem);
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
 
 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);
 
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events;	/* trusted copy */
@@ -399,6 +494,8 @@ out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
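[Editor's note] With the migratepage callback in place, a whole-process page migration can now relocate the ring as well. A hedged sketch of triggering that from userspace via libnuma's migrate_pages(3) wrapper (the node numbers are assumptions about the test machine; link with -lnuma):

/*
 * Hedged sketch: ask the kernel to move this task's migratable pages
 * from node 0 to node 1. With aio_migratepage wired up, the aio ring
 * pages move too rather than blocking the migration. Node numbers are
 * illustrative.
 */
#include <numaif.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long old_nodes = 1UL << 0;	/* source: node 0 */
	unsigned long new_nodes = 1UL << 1;	/* target: node 1 */

	if (migrate_pages(getpid(), 8 * sizeof(unsigned long),
			  &old_nodes, &new_nodes) < 0) {
		perror("migrate_pages");
		return 1;
	}
	return 0;
}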