about | summary | refs | log | tree | commit | diff | stats
path: root/fs/ceph/file.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r-- fs/ceph/file.c 264
1 files changed, 66 insertions, 198 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 66e4da6dba22..7d0e4a82d898 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -1,5 +1,6 @@
1#include "ceph_debug.h" 1#include <linux/ceph/ceph_debug.h>
2 2
3#include <linux/module.h>
3#include <linux/sched.h> 4#include <linux/sched.h>
4#include <linux/slab.h> 5#include <linux/slab.h>
5#include <linux/file.h> 6#include <linux/file.h>
@@ -38,8 +39,8 @@
38static struct ceph_mds_request * 39static struct ceph_mds_request *
39prepare_open_request(struct super_block *sb, int flags, int create_mode) 40prepare_open_request(struct super_block *sb, int flags, int create_mode)
40{ 41{
41 struct ceph_client *client = ceph_sb_to_client(sb); 42 struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
42 struct ceph_mds_client *mdsc = &client->mdsc; 43 struct ceph_mds_client *mdsc = fsc->mdsc;
43 struct ceph_mds_request *req; 44 struct ceph_mds_request *req;
44 int want_auth = USE_ANY_MDS; 45 int want_auth = USE_ANY_MDS;
45 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; 46 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
@@ -117,8 +118,8 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
117int ceph_open(struct inode *inode, struct file *file) 118int ceph_open(struct inode *inode, struct file *file)
118{ 119{
119 struct ceph_inode_info *ci = ceph_inode(inode); 120 struct ceph_inode_info *ci = ceph_inode(inode);
120 struct ceph_client *client = ceph_sb_to_client(inode->i_sb); 121 struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
121 struct ceph_mds_client *mdsc = &client->mdsc; 122 struct ceph_mds_client *mdsc = fsc->mdsc;
122 struct ceph_mds_request *req; 123 struct ceph_mds_request *req;
123 struct ceph_file_info *cf = file->private_data; 124 struct ceph_file_info *cf = file->private_data;
124 struct inode *parent_inode = file->f_dentry->d_parent->d_inode; 125 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
@@ -153,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
153 } 154 }
154 155
155 /* 156 /*
156 * No need to block if we have any caps. Update wanted set 157 * No need to block if we have caps on the auth MDS (for
158 * write) or any MDS (for read). Update wanted set
157 * asynchronously. 159 * asynchronously.
158 */ 160 */
159 spin_lock(&inode->i_lock); 161 spin_lock(&inode->i_lock);
160 if (__ceph_is_any_real_caps(ci)) { 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
161 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
162 int issued = __ceph_caps_issued(ci, NULL); 165 int issued = __ceph_caps_issued(ci, NULL);
163 166
@@ -216,8 +219,8 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
216 struct nameidata *nd, int mode, 219 struct nameidata *nd, int mode,
217 int locked_dir) 220 int locked_dir)
218{ 221{
219 struct ceph_client *client = ceph_sb_to_client(dir->i_sb); 222 struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb);
220 struct ceph_mds_client *mdsc = &client->mdsc; 223 struct ceph_mds_client *mdsc = fsc->mdsc;
221 struct file *file = nd->intent.open.file; 224 struct file *file = nd->intent.open.file;
222 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry); 225 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
223 struct ceph_mds_request *req; 226 struct ceph_mds_request *req;
@@ -270,163 +273,6 @@ int ceph_release(struct inode *inode, struct file *file)
270} 273}
271 274
272/* 275/*
273 * build a vector of user pages
274 */
275static struct page **get_direct_page_vector(const char __user *data,
276 int num_pages,
277 loff_t off, size_t len)
278{
279 struct page **pages;
280 int rc;
281
282 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
283 if (!pages)
284 return ERR_PTR(-ENOMEM);
285
286 down_read(&current->mm->mmap_sem);
287 rc = get_user_pages(current, current->mm, (unsigned long)data,
288 num_pages, 0, 0, pages, NULL);
289 up_read(&current->mm->mmap_sem);
290 if (rc < 0)
291 goto fail;
292 return pages;
293
294fail:
295 kfree(pages);
296 return ERR_PTR(rc);
297}
298
299static void put_page_vector(struct page **pages, int num_pages)
300{
301 int i;
302
303 for (i = 0; i < num_pages; i++)
304 put_page(pages[i]);
305 kfree(pages);
306}
307
308void ceph_release_page_vector(struct page **pages, int num_pages)
309{
310 int i;
311
312 for (i = 0; i < num_pages; i++)
313 __free_pages(pages[i], 0);
314 kfree(pages);
315}
316
317/*
318 * allocate a vector new pages
319 */
320static struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
321{
322 struct page **pages;
323 int i;
324
325 pages = kmalloc(sizeof(*pages) * num_pages, flags);
326 if (!pages)
327 return ERR_PTR(-ENOMEM);
328 for (i = 0; i < num_pages; i++) {
329 pages[i] = __page_cache_alloc(flags);
330 if (pages[i] == NULL) {
331 ceph_release_page_vector(pages, i);
332 return ERR_PTR(-ENOMEM);
333 }
334 }
335 return pages;
336}
337
338/*
339 * copy user data into a page vector
340 */
341static int copy_user_to_page_vector(struct page **pages,
342 const char __user *data,
343 loff_t off, size_t len)
344{
345 int i = 0;
346 int po = off & ~PAGE_CACHE_MASK;
347 int left = len;
348 int l, bad;
349
350 while (left > 0) {
351 l = min_t(int, PAGE_CACHE_SIZE-po, left);
352 bad = copy_from_user(page_address(pages[i]) + po, data, l);
353 if (bad == l)
354 return -EFAULT;
355 data += l - bad;
356 left -= l - bad;
357 po += l - bad;
358 if (po == PAGE_CACHE_SIZE) {
359 po = 0;
360 i++;
361 }
362 }
363 return len;
364}
365
366/*
367 * copy user data from a page vector into a user pointer
368 */
369static int copy_page_vector_to_user(struct page **pages, char __user *data,
370 loff_t off, size_t len)
371{
372 int i = 0;
373 int po = off & ~PAGE_CACHE_MASK;
374 int left = len;
375 int l, bad;
376
377 while (left > 0) {
378 l = min_t(int, left, PAGE_CACHE_SIZE-po);
379 bad = copy_to_user(data, page_address(pages[i]) + po, l);
380 if (bad == l)
381 return -EFAULT;
382 data += l - bad;
383 left -= l - bad;
384 if (po) {
385 po += l - bad;
386 if (po == PAGE_CACHE_SIZE)
387 po = 0;
388 }
389 i++;
390 }
391 return len;
392}
393
394/*
395 * Zero an extent within a page vector. Offset is relative to the
396 * start of the first page.
397 */
398static void zero_page_vector_range(int off, int len, struct page **pages)
399{
400 int i = off >> PAGE_CACHE_SHIFT;
401
402 off &= ~PAGE_CACHE_MASK;
403
404 dout("zero_page_vector_page %u~%u\n", off, len);
405
406 /* leading partial page? */
407 if (off) {
408 int end = min((int)PAGE_CACHE_SIZE, off + len);
409 dout("zeroing %d %p head from %d\n", i, pages[i],
410 (int)off);
411 zero_user_segment(pages[i], off, end);
412 len -= (end - off);
413 i++;
414 }
415 while (len >= PAGE_CACHE_SIZE) {
416 dout("zeroing %d %p len=%d\n", i, pages[i], len);
417 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
418 len -= PAGE_CACHE_SIZE;
419 i++;
420 }
421 /* trailing partial page? */
422 if (len) {
423 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
424 zero_user_segment(pages[i], 0, len);
425 }
426}
427
428
429/*
430 * Read a range of bytes striped over one or more objects. Iterate over 276 * Read a range of bytes striped over one or more objects. Iterate over
431 * objects we stripe over. (That's not atomic, but good enough for now.) 277 * objects we stripe over. (That's not atomic, but good enough for now.)
432 * 278 *
@@ -436,11 +282,13 @@ static void zero_page_vector_range(int off, int len, struct page **pages)
436static int striped_read(struct inode *inode, 282static int striped_read(struct inode *inode,
437 u64 off, u64 len, 283 u64 off, u64 len,
438 struct page **pages, int num_pages, 284 struct page **pages, int num_pages,
439 int *checkeof) 285 int *checkeof, bool align_to_pages,
286 unsigned long buf_align)
440{ 287{
441 struct ceph_client *client = ceph_inode_to_client(inode); 288 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
442 struct ceph_inode_info *ci = ceph_inode(inode); 289 struct ceph_inode_info *ci = ceph_inode(inode);
443 u64 pos, this_len; 290 u64 pos, this_len;
291 int io_align, page_align;
444 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 292 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
445 int left, pages_left; 293 int left, pages_left;
446 int read; 294 int read;
@@ -456,14 +304,19 @@ static int striped_read(struct inode *inode,
456 page_pos = pages; 304 page_pos = pages;
457 pages_left = num_pages; 305 pages_left = num_pages;
458 read = 0; 306 read = 0;
307 io_align = off & ~PAGE_MASK;
459 308
460more: 309more:
310 if (align_to_pages)
311 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
312 else
313 page_align = pos & ~PAGE_MASK;
461 this_len = left; 314 this_len = left;
462 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode), 315 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
463 &ci->i_layout, pos, &this_len, 316 &ci->i_layout, pos, &this_len,
464 ci->i_truncate_seq, 317 ci->i_truncate_seq,
465 ci->i_truncate_size, 318 ci->i_truncate_size,
466 page_pos, pages_left); 319 page_pos, pages_left, page_align);
467 hit_stripe = this_len < left; 320 hit_stripe = this_len < left;
468 was_short = ret >= 0 && ret < this_len; 321 was_short = ret >= 0 && ret < this_len;
469 if (ret == -ENOENT) 322 if (ret == -ENOENT)
@@ -477,8 +330,8 @@ more:
477 330
478 if (read < pos - off) { 331 if (read < pos - off) {
479 dout(" zero gap %llu to %llu\n", off + read, pos); 332 dout(" zero gap %llu to %llu\n", off + read, pos);
480 zero_page_vector_range(page_off + read, 333 ceph_zero_page_vector_range(page_off + read,
481 pos - off - read, pages); 334 pos - off - read, pages);
482 } 335 }
483 pos += ret; 336 pos += ret;
484 read = pos - off; 337 read = pos - off;
@@ -495,8 +348,8 @@ more:
495 /* was original extent fully inside i_size? */ 348 /* was original extent fully inside i_size? */
496 if (pos + left <= inode->i_size) { 349 if (pos + left <= inode->i_size) {
497 dout("zero tail\n"); 350 dout("zero tail\n");
498 zero_page_vector_range(page_off + read, len - read, 351 ceph_zero_page_vector_range(page_off + read, len - read,
499 pages); 352 pages);
500 read = len; 353 read = len;
501 goto out; 354 goto out;
502 } 355 }
@@ -524,41 +377,43 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
524 struct inode *inode = file->f_dentry->d_inode; 377 struct inode *inode = file->f_dentry->d_inode;
525 struct page **pages; 378 struct page **pages;
526 u64 off = *poff; 379 u64 off = *poff;
527 int num_pages = calc_pages_for(off, len); 380 int num_pages, ret;
528 int ret;
529 381
530 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 382 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
531 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 383 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
532 384
533 if (file->f_flags & O_DIRECT) { 385 if (file->f_flags & O_DIRECT) {
534 pages = get_direct_page_vector(data, num_pages, off, len); 386 num_pages = calc_pages_for((unsigned long)data, len);
535 387 pages = ceph_get_direct_page_vector(data, num_pages, true);
536 /*
537 * flush any page cache pages in this range. this
538 * will make concurrent normal and O_DIRECT io slow,
539 * but it will at least behave sensibly when they are
540 * in sequence.
541 */
542 } else { 388 } else {
389 num_pages = calc_pages_for(off, len);
543 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 390 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
544 } 391 }
545 if (IS_ERR(pages)) 392 if (IS_ERR(pages))
546 return PTR_ERR(pages); 393 return PTR_ERR(pages);
547 394
395 /*
396 * flush any page cache pages in this range. this
397 * will make concurrent normal and sync io slow,
398 * but it will at least behave sensibly when they are
399 * in sequence.
400 */
548 ret = filemap_write_and_wait(inode->i_mapping); 401 ret = filemap_write_and_wait(inode->i_mapping);
549 if (ret < 0) 402 if (ret < 0)
550 goto done; 403 goto done;
551 404
552 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 405 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
406 file->f_flags & O_DIRECT,
407 (unsigned long)data & ~PAGE_MASK);
553 408
554 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 409 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
555 ret = copy_page_vector_to_user(pages, data, off, ret); 410 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
556 if (ret >= 0) 411 if (ret >= 0)
557 *poff = off + ret; 412 *poff = off + ret;
558 413
559done: 414done:
560 if (file->f_flags & O_DIRECT) 415 if (file->f_flags & O_DIRECT)
561 put_page_vector(pages, num_pages); 416 ceph_put_page_vector(pages, num_pages, true);
562 else 417 else
563 ceph_release_page_vector(pages, num_pages); 418 ceph_release_page_vector(pages, num_pages);
564 dout("sync_read result %d\n", ret); 419 dout("sync_read result %d\n", ret);
@@ -594,7 +449,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
594{ 449{
595 struct inode *inode = file->f_dentry->d_inode; 450 struct inode *inode = file->f_dentry->d_inode;
596 struct ceph_inode_info *ci = ceph_inode(inode); 451 struct ceph_inode_info *ci = ceph_inode(inode);
597 struct ceph_client *client = ceph_inode_to_client(inode); 452 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
598 struct ceph_osd_request *req; 453 struct ceph_osd_request *req;
599 struct page **pages; 454 struct page **pages;
600 int num_pages; 455 int num_pages;
@@ -604,6 +459,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
604 int flags; 459 int flags;
605 int do_sync = 0; 460 int do_sync = 0;
606 int check_caps = 0; 461 int check_caps = 0;
462 int page_align, io_align;
463 unsigned long buf_align;
607 int ret; 464 int ret;
608 struct timespec mtime = CURRENT_TIME; 465 struct timespec mtime = CURRENT_TIME;
609 466
@@ -618,6 +475,9 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
618 else 475 else
619 pos = *offset; 476 pos = *offset;
620 477
478 io_align = pos & ~PAGE_MASK;
479 buf_align = (unsigned long)data & ~PAGE_MASK;
480
621 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 481 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
622 if (ret < 0) 482 if (ret < 0)
623 return ret; 483 return ret;
@@ -642,20 +502,27 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
642 */ 502 */
643more: 503more:
644 len = left; 504 len = left;
645 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout, 505 if (file->f_flags & O_DIRECT) {
506 /* write from beginning of first page, regardless of
507 io alignment */
508 page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
509 num_pages = calc_pages_for((unsigned long)data, len);
510 } else {
511 page_align = pos & ~PAGE_MASK;
512 num_pages = calc_pages_for(pos, len);
513 }
514 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
646 ceph_vino(inode), pos, &len, 515 ceph_vino(inode), pos, &len,
647 CEPH_OSD_OP_WRITE, flags, 516 CEPH_OSD_OP_WRITE, flags,
648 ci->i_snap_realm->cached_context, 517 ci->i_snap_realm->cached_context,
649 do_sync, 518 do_sync,
650 ci->i_truncate_seq, ci->i_truncate_size, 519 ci->i_truncate_seq, ci->i_truncate_size,
651 &mtime, false, 2); 520 &mtime, false, 2, page_align);
652 if (!req) 521 if (!req)
653 return -ENOMEM; 522 return -ENOMEM;
654 523
655 num_pages = calc_pages_for(pos, len);
656
657 if (file->f_flags & O_DIRECT) { 524 if (file->f_flags & O_DIRECT) {
658 pages = get_direct_page_vector(data, num_pages, pos, len); 525 pages = ceph_get_direct_page_vector(data, num_pages, false);
659 if (IS_ERR(pages)) { 526 if (IS_ERR(pages)) {
660 ret = PTR_ERR(pages); 527 ret = PTR_ERR(pages);
661 goto out; 528 goto out;
@@ -673,7 +540,7 @@ more:
673 ret = PTR_ERR(pages); 540 ret = PTR_ERR(pages);
674 goto out; 541 goto out;
675 } 542 }
676 ret = copy_user_to_page_vector(pages, data, pos, len); 543 ret = ceph_copy_user_to_page_vector(pages, data, pos, len);
677 if (ret < 0) { 544 if (ret < 0) {
678 ceph_release_page_vector(pages, num_pages); 545 ceph_release_page_vector(pages, num_pages);
679 goto out; 546 goto out;
@@ -689,7 +556,7 @@ more:
689 req->r_num_pages = num_pages; 556 req->r_num_pages = num_pages;
690 req->r_inode = inode; 557 req->r_inode = inode;
691 558
692 ret = ceph_osdc_start_request(&client->osdc, req, false); 559 ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
693 if (!ret) { 560 if (!ret) {
694 if (req->r_safe_callback) { 561 if (req->r_safe_callback) {
695 /* 562 /*
@@ -701,11 +568,11 @@ more:
701 spin_unlock(&ci->i_unsafe_lock); 568 spin_unlock(&ci->i_unsafe_lock);
702 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR); 569 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
703 } 570 }
704 ret = ceph_osdc_wait_request(&client->osdc, req); 571 ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
705 } 572 }
706 573
707 if (file->f_flags & O_DIRECT) 574 if (file->f_flags & O_DIRECT)
708 put_page_vector(pages, num_pages); 575 ceph_put_page_vector(pages, num_pages, false);
709 else if (file->f_flags & O_SYNC) 576 else if (file->f_flags & O_SYNC)
710 ceph_release_page_vector(pages, num_pages); 577 ceph_release_page_vector(pages, num_pages);
711 578
@@ -814,7 +681,8 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
814 struct ceph_file_info *fi = file->private_data; 681 struct ceph_file_info *fi = file->private_data;
815 struct inode *inode = file->f_dentry->d_inode; 682 struct inode *inode = file->f_dentry->d_inode;
816 struct ceph_inode_info *ci = ceph_inode(inode); 683 struct ceph_inode_info *ci = ceph_inode(inode);
817 struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc; 684 struct ceph_osd_client *osdc =
685 &ceph_sb_to_client(inode->i_sb)->client->osdc;
818 loff_t endoff = pos + iov->iov_len; 686 loff_t endoff = pos + iov->iov_len;
819 int want, got = 0; 687 int want, got = 0;
820 int ret, err; 688 int ret, err;