diff options
author | Sage Weil <sage@newdream.net> | 2010-11-09 15:43:12 -0500 |
---|---|---|
committer | Sage Weil <sage@newdream.net> | 2010-11-09 15:43:12 -0500 |
commit | b7495fc2ff941db6a118a93ab8d61149e3f4cef8 (patch) | |
tree | 231c339d74760e2fa13e5e6f41c10bc28cea51b3 /fs/ceph/file.c | |
parent | e98b6fed84d0f0155d7b398e0dfeac74c792f2d0 (diff) |
ceph: make page alignment explicit in osd interface
We used to infer alignment of IOs within a page based on the file offset,
which assumed they matched. This broke with direct IO that was not aligned
to pages (e.g., 512-byte aligned IO). We were also trusting the alignment
specified in the OSD reply, which could have been adjusted by the server.
Explicitly specify the page alignment when setting up OSD IO requests.
Signed-off-by: Sage Weil <sage@newdream.net>
Diffstat (limited to 'fs/ceph/file.c')
-rw-r--r-- | fs/ceph/file.c | 26 |
1 files changed, 21 insertions, 5 deletions
diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 603fd00af0a6..8d79b8912e31 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c | |||
@@ -282,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file) | |||
282 | static int striped_read(struct inode *inode, | 282 | static int striped_read(struct inode *inode, |
283 | u64 off, u64 len, | 283 | u64 off, u64 len, |
284 | struct page **pages, int num_pages, | 284 | struct page **pages, int num_pages, |
285 | int *checkeof) | 285 | int *checkeof, bool align_to_pages) |
286 | { | 286 | { |
287 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); | 287 | struct ceph_fs_client *fsc = ceph_inode_to_client(inode); |
288 | struct ceph_inode_info *ci = ceph_inode(inode); | 288 | struct ceph_inode_info *ci = ceph_inode(inode); |
289 | u64 pos, this_len; | 289 | u64 pos, this_len; |
290 | int io_align, page_align; | ||
290 | int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ | 291 | int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ |
291 | int left, pages_left; | 292 | int left, pages_left; |
292 | int read; | 293 | int read; |
@@ -302,14 +303,19 @@ static int striped_read(struct inode *inode, | |||
302 | page_pos = pages; | 303 | page_pos = pages; |
303 | pages_left = num_pages; | 304 | pages_left = num_pages; |
304 | read = 0; | 305 | read = 0; |
306 | io_align = off & ~PAGE_MASK; | ||
305 | 307 | ||
306 | more: | 308 | more: |
309 | if (align_to_pages) | ||
310 | page_align = (pos - io_align) & ~PAGE_MASK; | ||
311 | else | ||
312 | page_align = pos & ~PAGE_MASK; | ||
307 | this_len = left; | 313 | this_len = left; |
308 | ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), | 314 | ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), |
309 | &ci->i_layout, pos, &this_len, | 315 | &ci->i_layout, pos, &this_len, |
310 | ci->i_truncate_seq, | 316 | ci->i_truncate_seq, |
311 | ci->i_truncate_size, | 317 | ci->i_truncate_size, |
312 | page_pos, pages_left); | 318 | page_pos, pages_left, page_align); |
313 | hit_stripe = this_len < left; | 319 | hit_stripe = this_len < left; |
314 | was_short = ret >= 0 && ret < this_len; | 320 | was_short = ret >= 0 && ret < this_len; |
315 | if (ret == -ENOENT) | 321 | if (ret == -ENOENT) |
@@ -393,7 +399,8 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data, | |||
393 | if (ret < 0) | 399 | if (ret < 0) |
394 | goto done; | 400 | goto done; |
395 | 401 | ||
396 | ret = striped_read(inode, off, len, pages, num_pages, checkeof); | 402 | ret = striped_read(inode, off, len, pages, num_pages, checkeof, |
403 | file->f_flags & O_DIRECT); | ||
397 | 404 | ||
398 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) | 405 | if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) |
399 | ret = ceph_copy_page_vector_to_user(pages, data, off, ret); | 406 | ret = ceph_copy_page_vector_to_user(pages, data, off, ret); |
@@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, | |||
448 | int flags; | 455 | int flags; |
449 | int do_sync = 0; | 456 | int do_sync = 0; |
450 | int check_caps = 0; | 457 | int check_caps = 0; |
458 | int page_align, io_align; | ||
451 | int ret; | 459 | int ret; |
452 | struct timespec mtime = CURRENT_TIME; | 460 | struct timespec mtime = CURRENT_TIME; |
453 | 461 | ||
@@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, | |||
462 | else | 470 | else |
463 | pos = *offset; | 471 | pos = *offset; |
464 | 472 | ||
473 | io_align = pos & ~PAGE_MASK; | ||
474 | |||
465 | ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); | 475 | ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); |
466 | if (ret < 0) | 476 | if (ret < 0) |
467 | return ret; | 477 | return ret; |
@@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, | |||
486 | */ | 496 | */ |
487 | more: | 497 | more: |
488 | len = left; | 498 | len = left; |
499 | if (file->f_flags & O_DIRECT) | ||
500 | /* write from beginning of first page, regardless of | ||
501 | io alignment */ | ||
502 | page_align = (pos - io_align) & ~PAGE_MASK; | ||
503 | else | ||
504 | page_align = pos & ~PAGE_MASK; | ||
489 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, | 505 | req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, |
490 | ceph_vino(inode), pos, &len, | 506 | ceph_vino(inode), pos, &len, |
491 | CEPH_OSD_OP_WRITE, flags, | 507 | CEPH_OSD_OP_WRITE, flags, |
492 | ci->i_snap_realm->cached_context, | 508 | ci->i_snap_realm->cached_context, |
493 | do_sync, | 509 | do_sync, |
494 | ci->i_truncate_seq, ci->i_truncate_size, | 510 | ci->i_truncate_seq, ci->i_truncate_size, |
495 | &mtime, false, 2); | 511 | &mtime, false, 2, page_align); |
496 | if (!req) | 512 | if (!req) |
497 | return -ENOMEM; | 513 | return -ENOMEM; |
498 | 514 | ||
499 | num_pages = calc_pages_for(pos, len); | 515 | num_pages = calc_pages_for(pos, len); |
500 | 516 | ||
501 | if (file->f_flags & O_DIRECT) { | 517 | if (file->f_flags & O_DIRECT) { |
502 | pages = ceph_get_direct_page_vector(data, num_pages, pos, len); | 518 | pages = ceph_get_direct_page_vector(data, num_pages); |
503 | if (IS_ERR(pages)) { | 519 | if (IS_ERR(pages)) { |
504 | ret = PTR_ERR(pages); | 520 | ret = PTR_ERR(pages); |
505 | goto out; | 521 | goto out; |