diff options
author | Gu Zheng <guz.fnst@cn.fujitsu.com> | 2013-07-16 05:56:16 -0400 |
---|---|---|
committer | Benjamin LaHaise <bcrl@kvack.org> | 2013-07-16 09:32:18 -0400 |
commit | 36bc08cc01709b4a9bb563b35aa530241ddc63e3 (patch) | |
tree | dc441c213b61e83416f77400cb0492b09d2dbf7a /fs | |
parent | 55708698c5f153f4e390175cdfc395333b2eafbd (diff) |
fs/aio: Add support to aio ring pages migration
Because an aio job pins the ring pages, memory migration of those pages
would fail. To fix this problem we use an anon inode to manage the aio ring
pages, and set up the migratepage callback in the anon inode's address space, so
that during memory migration the aio ring pages can be moved to another memory node safely.
Signed-off-by: Gu Zheng <guz.fnst@cn.fujitsu.com>
Signed-off-by: Benjamin LaHaise <bcrl@kvack.org>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/aio.c | 119 |
1 file changed, 108 insertions, 11 deletions
@@ -35,6 +35,9 @@ | |||
35 | #include <linux/eventfd.h> | 35 | #include <linux/eventfd.h> |
36 | #include <linux/blkdev.h> | 36 | #include <linux/blkdev.h> |
37 | #include <linux/compat.h> | 37 | #include <linux/compat.h> |
38 | #include <linux/anon_inodes.h> | ||
39 | #include <linux/migrate.h> | ||
40 | #include <linux/ramfs.h> | ||
38 | 41 | ||
39 | #include <asm/kmap_types.h> | 42 | #include <asm/kmap_types.h> |
40 | #include <asm/uaccess.h> | 43 | #include <asm/uaccess.h> |
@@ -110,6 +113,7 @@ struct kioctx { | |||
110 | } ____cacheline_aligned_in_smp; | 113 | } ____cacheline_aligned_in_smp; |
111 | 114 | ||
112 | struct page *internal_pages[AIO_RING_PAGES]; | 115 | struct page *internal_pages[AIO_RING_PAGES]; |
116 | struct file *aio_ring_file; | ||
113 | }; | 117 | }; |
114 | 118 | ||
115 | /*------ sysctl variables----*/ | 119 | /*------ sysctl variables----*/ |
@@ -138,15 +142,78 @@ __initcall(aio_setup); | |||
138 | 142 | ||
139 | static void aio_free_ring(struct kioctx *ctx) | 143 | static void aio_free_ring(struct kioctx *ctx) |
140 | { | 144 | { |
141 | long i; | 145 | int i; |
146 | struct file *aio_ring_file = ctx->aio_ring_file; | ||
142 | 147 | ||
143 | for (i = 0; i < ctx->nr_pages; i++) | 148 | for (i = 0; i < ctx->nr_pages; i++) { |
149 | pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i, | ||
150 | page_count(ctx->ring_pages[i])); | ||
144 | put_page(ctx->ring_pages[i]); | 151 | put_page(ctx->ring_pages[i]); |
152 | } | ||
145 | 153 | ||
146 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) | 154 | if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages) |
147 | kfree(ctx->ring_pages); | 155 | kfree(ctx->ring_pages); |
156 | |||
157 | if (aio_ring_file) { | ||
158 | truncate_setsize(aio_ring_file->f_inode, 0); | ||
159 | pr_debug("pid(%d) i_nlink=%u d_count=%d d_unhashed=%d i_count=%d\n", | ||
160 | current->pid, aio_ring_file->f_inode->i_nlink, | ||
161 | aio_ring_file->f_path.dentry->d_count, | ||
162 | d_unhashed(aio_ring_file->f_path.dentry), | ||
163 | atomic_read(&aio_ring_file->f_inode->i_count)); | ||
164 | fput(aio_ring_file); | ||
165 | ctx->aio_ring_file = NULL; | ||
166 | } | ||
148 | } | 167 | } |
149 | 168 | ||
169 | static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma) | ||
170 | { | ||
171 | vma->vm_ops = &generic_file_vm_ops; | ||
172 | return 0; | ||
173 | } | ||
174 | |||
175 | static const struct file_operations aio_ring_fops = { | ||
176 | .mmap = aio_ring_mmap, | ||
177 | }; | ||
178 | |||
179 | static int aio_set_page_dirty(struct page *page) | ||
180 | { | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static int aio_migratepage(struct address_space *mapping, struct page *new, | ||
185 | struct page *old, enum migrate_mode mode) | ||
186 | { | ||
187 | struct kioctx *ctx = mapping->private_data; | ||
188 | unsigned long flags; | ||
189 | unsigned idx = old->index; | ||
190 | int rc; | ||
191 | |||
192 | /* Writeback must be complete */ | ||
193 | BUG_ON(PageWriteback(old)); | ||
194 | put_page(old); | ||
195 | |||
196 | rc = migrate_page_move_mapping(mapping, new, old, NULL, mode); | ||
197 | if (rc != MIGRATEPAGE_SUCCESS) { | ||
198 | get_page(old); | ||
199 | return rc; | ||
200 | } | ||
201 | |||
202 | get_page(new); | ||
203 | |||
204 | spin_lock_irqsave(&ctx->completion_lock, flags); | ||
205 | migrate_page_copy(new, old); | ||
206 | ctx->ring_pages[idx] = new; | ||
207 | spin_unlock_irqrestore(&ctx->completion_lock, flags); | ||
208 | |||
209 | return rc; | ||
210 | } | ||
211 | |||
212 | static const struct address_space_operations aio_ctx_aops = { | ||
213 | .set_page_dirty = aio_set_page_dirty, | ||
214 | .migratepage = aio_migratepage, | ||
215 | }; | ||
216 | |||
150 | static int aio_setup_ring(struct kioctx *ctx) | 217 | static int aio_setup_ring(struct kioctx *ctx) |
151 | { | 218 | { |
152 | struct aio_ring *ring; | 219 | struct aio_ring *ring; |
@@ -154,20 +221,45 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
154 | struct mm_struct *mm = current->mm; | 221 | struct mm_struct *mm = current->mm; |
155 | unsigned long size, populate; | 222 | unsigned long size, populate; |
156 | int nr_pages; | 223 | int nr_pages; |
224 | int i; | ||
225 | struct file *file; | ||
157 | 226 | ||
158 | /* Compensate for the ring buffer's head/tail overlap entry */ | 227 | /* Compensate for the ring buffer's head/tail overlap entry */ |
159 | nr_events += 2; /* 1 is required, 2 for good luck */ | 228 | nr_events += 2; /* 1 is required, 2 for good luck */ |
160 | 229 | ||
161 | size = sizeof(struct aio_ring); | 230 | size = sizeof(struct aio_ring); |
162 | size += sizeof(struct io_event) * nr_events; | 231 | size += sizeof(struct io_event) * nr_events; |
163 | nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT; | ||
164 | 232 | ||
233 | nr_pages = PFN_UP(size); | ||
165 | if (nr_pages < 0) | 234 | if (nr_pages < 0) |
166 | return -EINVAL; | 235 | return -EINVAL; |
167 | 236 | ||
168 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event); | 237 | file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR); |
238 | if (IS_ERR(file)) { | ||
239 | ctx->aio_ring_file = NULL; | ||
240 | return -EAGAIN; | ||
241 | } | ||
242 | |||
243 | file->f_inode->i_mapping->a_ops = &aio_ctx_aops; | ||
244 | file->f_inode->i_mapping->private_data = ctx; | ||
245 | file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages; | ||
246 | |||
247 | for (i = 0; i < nr_pages; i++) { | ||
248 | struct page *page; | ||
249 | page = find_or_create_page(file->f_inode->i_mapping, | ||
250 | i, GFP_HIGHUSER | __GFP_ZERO); | ||
251 | if (!page) | ||
252 | break; | ||
253 | pr_debug("pid(%d) page[%d]->count=%d\n", | ||
254 | current->pid, i, page_count(page)); | ||
255 | SetPageUptodate(page); | ||
256 | SetPageDirty(page); | ||
257 | unlock_page(page); | ||
258 | } | ||
259 | ctx->aio_ring_file = file; | ||
260 | nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) | ||
261 | / sizeof(struct io_event); | ||
169 | 262 | ||
170 | ctx->nr_events = 0; | ||
171 | ctx->ring_pages = ctx->internal_pages; | 263 | ctx->ring_pages = ctx->internal_pages; |
172 | if (nr_pages > AIO_RING_PAGES) { | 264 | if (nr_pages > AIO_RING_PAGES) { |
173 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), | 265 | ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *), |
@@ -178,28 +270,31 @@ static int aio_setup_ring(struct kioctx *ctx) | |||
178 | 270 | ||
179 | ctx->mmap_size = nr_pages * PAGE_SIZE; | 271 | ctx->mmap_size = nr_pages * PAGE_SIZE; |
180 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); | 272 | pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size); |
273 | |||
181 | down_write(&mm->mmap_sem); | 274 | down_write(&mm->mmap_sem); |
182 | ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size, | 275 | ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size, |
183 | PROT_READ|PROT_WRITE, | 276 | PROT_READ | PROT_WRITE, |
184 | MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate); | 277 | MAP_SHARED | MAP_POPULATE, 0, &populate); |
185 | if (IS_ERR((void *)ctx->mmap_base)) { | 278 | if (IS_ERR((void *)ctx->mmap_base)) { |
186 | up_write(&mm->mmap_sem); | 279 | up_write(&mm->mmap_sem); |
187 | ctx->mmap_size = 0; | 280 | ctx->mmap_size = 0; |
188 | aio_free_ring(ctx); | 281 | aio_free_ring(ctx); |
189 | return -EAGAIN; | 282 | return -EAGAIN; |
190 | } | 283 | } |
284 | up_write(&mm->mmap_sem); | ||
285 | |||
286 | mm_populate(ctx->mmap_base, populate); | ||
191 | 287 | ||
192 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); | 288 | pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base); |
193 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, | 289 | ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages, |
194 | 1, 0, ctx->ring_pages, NULL); | 290 | 1, 0, ctx->ring_pages, NULL); |
195 | up_write(&mm->mmap_sem); | 291 | for (i = 0; i < ctx->nr_pages; i++) |
292 | put_page(ctx->ring_pages[i]); | ||
196 | 293 | ||
197 | if (unlikely(ctx->nr_pages != nr_pages)) { | 294 | if (unlikely(ctx->nr_pages != nr_pages)) { |
198 | aio_free_ring(ctx); | 295 | aio_free_ring(ctx); |
199 | return -EAGAIN; | 296 | return -EAGAIN; |
200 | } | 297 | } |
201 | if (populate) | ||
202 | mm_populate(ctx->mmap_base, populate); | ||
203 | 298 | ||
204 | ctx->user_id = ctx->mmap_base; | 299 | ctx->user_id = ctx->mmap_base; |
205 | ctx->nr_events = nr_events; /* trusted copy */ | 300 | ctx->nr_events = nr_events; /* trusted copy */ |
@@ -399,6 +494,8 @@ out_cleanup: | |||
399 | err = -EAGAIN; | 494 | err = -EAGAIN; |
400 | aio_free_ring(ctx); | 495 | aio_free_ring(ctx); |
401 | out_freectx: | 496 | out_freectx: |
497 | if (ctx->aio_ring_file) | ||
498 | fput(ctx->aio_ring_file); | ||
402 | kmem_cache_free(kioctx_cachep, ctx); | 499 | kmem_cache_free(kioctx_cachep, ctx); |
403 | pr_debug("error allocating ioctx %d\n", err); | 500 | pr_debug("error allocating ioctx %d\n", err); |
404 | return ERR_PTR(err); | 501 | return ERR_PTR(err); |