aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--fs/ceph/addr.c6
-rw-r--r--fs/ceph/caps.c17
-rw-r--r--fs/ceph/dir.c16
-rw-r--r--fs/ceph/file.c52
-rw-r--r--fs/ceph/inode.c49
-rw-r--r--fs/ceph/mds_client.c7
-rw-r--r--fs/ceph/mds_client.h2
-rw-r--r--fs/ceph/super.h4
-rw-r--r--include/linux/ceph/libceph.h3
-rw-r--r--include/linux/ceph/messenger.h1
-rw-r--r--include/linux/ceph/osd_client.h7
-rw-r--r--net/ceph/messenger.c13
-rw-r--r--net/ceph/osd_client.c25
-rw-r--r--net/ceph/pagevec.c3
14 files changed, 129 insertions, 76 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index e9c874abc9e1..561438b6a50c 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -204,7 +204,7 @@ static int readpage_nounlock(struct file *filp, struct page *page)
204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 204 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
205 page->index << PAGE_CACHE_SHIFT, &len, 205 page->index << PAGE_CACHE_SHIFT, &len,
206 ci->i_truncate_seq, ci->i_truncate_size, 206 ci->i_truncate_seq, ci->i_truncate_size,
207 &page, 1); 207 &page, 1, 0);
208 if (err == -ENOENT) 208 if (err == -ENOENT)
209 err = 0; 209 err = 0;
210 if (err < 0) { 210 if (err < 0) {
@@ -287,7 +287,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout, 287 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
288 offset, &len, 288 offset, &len,
289 ci->i_truncate_seq, ci->i_truncate_size, 289 ci->i_truncate_seq, ci->i_truncate_size,
290 pages, nr_pages); 290 pages, nr_pages, 0);
291 if (rc == -ENOENT) 291 if (rc == -ENOENT)
292 rc = 0; 292 rc = 0;
293 if (rc < 0) 293 if (rc < 0)
@@ -774,7 +774,7 @@ get_more_pages:
774 snapc, do_sync, 774 snapc, do_sync,
775 ci->i_truncate_seq, 775 ci->i_truncate_seq,
776 ci->i_truncate_size, 776 ci->i_truncate_size,
777 &inode->i_mtime, true, 1); 777 &inode->i_mtime, true, 1, 0);
778 max_pages = req->r_num_pages; 778 max_pages = req->r_num_pages;
779 779
780 alloc_page_vec(fsc, req); 780 alloc_page_vec(fsc, req);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 98ab13e2b71d..60d27bc9eb83 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -1430,8 +1430,8 @@ static int try_nonblocking_invalidate(struct inode *inode)
1430 invalidating_gen == ci->i_rdcache_gen) { 1430 invalidating_gen == ci->i_rdcache_gen) {
1431 /* success. */ 1431 /* success. */
1432 dout("try_nonblocking_invalidate %p success\n", inode); 1432 dout("try_nonblocking_invalidate %p success\n", inode);
1433 ci->i_rdcache_gen = 0; 1433 /* save any racing async invalidate some trouble */
1434 ci->i_rdcache_revoking = 0; 1434 ci->i_rdcache_revoking = ci->i_rdcache_gen - 1;
1435 return 0; 1435 return 0;
1436 } 1436 }
1437 dout("try_nonblocking_invalidate %p failed\n", inode); 1437 dout("try_nonblocking_invalidate %p failed\n", inode);
@@ -2273,8 +2273,7 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2273{ 2273{
2274 struct ceph_inode_info *ci = ceph_inode(inode); 2274 struct ceph_inode_info *ci = ceph_inode(inode);
2275 int mds = session->s_mds; 2275 int mds = session->s_mds;
2276 unsigned seq = le32_to_cpu(grant->seq); 2276 int seq = le32_to_cpu(grant->seq);
2277 unsigned issue_seq = le32_to_cpu(grant->issue_seq);
2278 int newcaps = le32_to_cpu(grant->caps); 2277 int newcaps = le32_to_cpu(grant->caps);
2279 int issued, implemented, used, wanted, dirty; 2278 int issued, implemented, used, wanted, dirty;
2280 u64 size = le64_to_cpu(grant->size); 2279 u64 size = le64_to_cpu(grant->size);
@@ -2286,8 +2285,8 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2286 int revoked_rdcache = 0; 2285 int revoked_rdcache = 0;
2287 int queue_invalidate = 0; 2286 int queue_invalidate = 0;
2288 2287
2289 dout("handle_cap_grant inode %p cap %p mds%d seq %u/%u %s\n", 2288 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2290 inode, cap, mds, seq, issue_seq, ceph_cap_string(newcaps)); 2289 inode, cap, mds, seq, ceph_cap_string(newcaps));
2291 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, 2290 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2292 inode->i_size); 2291 inode->i_size);
2293 2292
@@ -2383,7 +2382,6 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2383 } 2382 }
2384 2383
2385 cap->seq = seq; 2384 cap->seq = seq;
2386 cap->issue_seq = issue_seq;
2387 2385
2388 /* file layout may have changed */ 2386 /* file layout may have changed */
2389 ci->i_layout = grant->layout; 2387 ci->i_layout = grant->layout;
@@ -2691,6 +2689,11 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
2691 NULL /* no caps context */); 2689 NULL /* no caps context */);
2692 try_flush_caps(inode, session, NULL); 2690 try_flush_caps(inode, session, NULL);
2693 up_read(&mdsc->snap_rwsem); 2691 up_read(&mdsc->snap_rwsem);
2692
2693 /* make sure we re-request max_size, if necessary */
2694 spin_lock(&inode->i_lock);
2695 ci->i_requested_max_size = 0;
2696 spin_unlock(&inode->i_lock);
2694} 2697}
2695 2698
2696/* 2699/*
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index e0a2dc6fcafc..7d447af84ec4 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -336,7 +336,10 @@ more:
336 if (req->r_reply_info.dir_end) { 336 if (req->r_reply_info.dir_end) {
337 kfree(fi->last_name); 337 kfree(fi->last_name);
338 fi->last_name = NULL; 338 fi->last_name = NULL;
339 fi->next_offset = 2; 339 if (ceph_frag_is_rightmost(frag))
340 fi->next_offset = 2;
341 else
342 fi->next_offset = 0;
340 } else { 343 } else {
341 rinfo = &req->r_reply_info; 344 rinfo = &req->r_reply_info;
342 err = note_last_dentry(fi, 345 err = note_last_dentry(fi,
@@ -355,18 +358,22 @@ more:
355 u64 pos = ceph_make_fpos(frag, off); 358 u64 pos = ceph_make_fpos(frag, off);
356 struct ceph_mds_reply_inode *in = 359 struct ceph_mds_reply_inode *in =
357 rinfo->dir_in[off - fi->offset].in; 360 rinfo->dir_in[off - fi->offset].in;
361 struct ceph_vino vino;
362 ino_t ino;
363
358 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n", 364 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
359 off, off - fi->offset, rinfo->dir_nr, pos, 365 off, off - fi->offset, rinfo->dir_nr, pos,
360 rinfo->dir_dname_len[off - fi->offset], 366 rinfo->dir_dname_len[off - fi->offset],
361 rinfo->dir_dname[off - fi->offset], in); 367 rinfo->dir_dname[off - fi->offset], in);
362 BUG_ON(!in); 368 BUG_ON(!in);
363 ftype = le32_to_cpu(in->mode) >> 12; 369 ftype = le32_to_cpu(in->mode) >> 12;
370 vino.ino = le64_to_cpu(in->ino);
371 vino.snap = le64_to_cpu(in->snapid);
372 ino = ceph_vino_to_ino(vino);
364 if (filldir(dirent, 373 if (filldir(dirent,
365 rinfo->dir_dname[off - fi->offset], 374 rinfo->dir_dname[off - fi->offset],
366 rinfo->dir_dname_len[off - fi->offset], 375 rinfo->dir_dname_len[off - fi->offset],
367 pos, 376 pos, ino, ftype) < 0) {
368 le64_to_cpu(in->ino),
369 ftype) < 0) {
370 dout("filldir stopping us...\n"); 377 dout("filldir stopping us...\n");
371 return 0; 378 return 0;
372 } 379 }
@@ -414,6 +421,7 @@ static void reset_readdir(struct ceph_file_info *fi)
414 fi->last_readdir = NULL; 421 fi->last_readdir = NULL;
415 } 422 }
416 kfree(fi->last_name); 423 kfree(fi->last_name);
424 fi->last_name = NULL;
417 fi->next_offset = 2; /* compensate for . and .. */ 425 fi->next_offset = 2; /* compensate for . and .. */
418 if (fi->dentry) { 426 if (fi->dentry) {
419 dput(fi->dentry); 427 dput(fi->dentry);
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index e77c28cf3690..8d79b8912e31 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -154,11 +154,13 @@ int ceph_open(struct inode *inode, struct file *file)
154 } 154 }
155 155
156 /* 156 /*
157 * No need to block if we have any caps. Update wanted set 157 * No need to block if we have caps on the auth MDS (for
158 * write) or any MDS (for read). Update wanted set
158 * asynchronously. 159 * asynchronously.
159 */ 160 */
160 spin_lock(&inode->i_lock); 161 spin_lock(&inode->i_lock);
161 if (__ceph_is_any_real_caps(ci)) { 162 if (__ceph_is_any_real_caps(ci) &&
163 (((fmode & CEPH_FILE_MODE_WR) == 0) || ci->i_auth_cap)) {
162 int mds_wanted = __ceph_caps_mds_wanted(ci); 164 int mds_wanted = __ceph_caps_mds_wanted(ci);
163 int issued = __ceph_caps_issued(ci, NULL); 165 int issued = __ceph_caps_issued(ci, NULL);
164 166
@@ -280,11 +282,12 @@ int ceph_release(struct inode *inode, struct file *file)
280static int striped_read(struct inode *inode, 282static int striped_read(struct inode *inode,
281 u64 off, u64 len, 283 u64 off, u64 len,
282 struct page **pages, int num_pages, 284 struct page **pages, int num_pages,
283 int *checkeof) 285 int *checkeof, bool align_to_pages)
284{ 286{
285 struct ceph_fs_client *fsc = ceph_inode_to_client(inode); 287 struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
286 struct ceph_inode_info *ci = ceph_inode(inode); 288 struct ceph_inode_info *ci = ceph_inode(inode);
287 u64 pos, this_len; 289 u64 pos, this_len;
290 int io_align, page_align;
288 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */ 291 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
289 int left, pages_left; 292 int left, pages_left;
290 int read; 293 int read;
@@ -300,14 +303,19 @@ static int striped_read(struct inode *inode,
300 page_pos = pages; 303 page_pos = pages;
301 pages_left = num_pages; 304 pages_left = num_pages;
302 read = 0; 305 read = 0;
306 io_align = off & ~PAGE_MASK;
303 307
304more: 308more:
309 if (align_to_pages)
310 page_align = (pos - io_align) & ~PAGE_MASK;
311 else
312 page_align = pos & ~PAGE_MASK;
305 this_len = left; 313 this_len = left;
306 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), 314 ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
307 &ci->i_layout, pos, &this_len, 315 &ci->i_layout, pos, &this_len,
308 ci->i_truncate_seq, 316 ci->i_truncate_seq,
309 ci->i_truncate_size, 317 ci->i_truncate_size,
310 page_pos, pages_left); 318 page_pos, pages_left, page_align);
311 hit_stripe = this_len < left; 319 hit_stripe = this_len < left;
312 was_short = ret >= 0 && ret < this_len; 320 was_short = ret >= 0 && ret < this_len;
313 if (ret == -ENOENT) 321 if (ret == -ENOENT)
@@ -374,26 +382,25 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
374 dout("sync_read on file %p %llu~%u %s\n", file, off, len, 382 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
375 (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); 383 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
376 384
377 if (file->f_flags & O_DIRECT) { 385 if (file->f_flags & O_DIRECT)
378 pages = ceph_get_direct_page_vector(data, num_pages, off, len); 386 pages = ceph_get_direct_page_vector(data, num_pages);
379 387 else
380 /*
381 * flush any page cache pages in this range. this
382 * will make concurrent normal and O_DIRECT io slow,
383 * but it will at least behave sensibly when they are
384 * in sequence.
385 */
386 } else {
387 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); 388 pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
388 }
389 if (IS_ERR(pages)) 389 if (IS_ERR(pages))
390 return PTR_ERR(pages); 390 return PTR_ERR(pages);
391 391
392 /*
393 * flush any page cache pages in this range. this
394 * will make concurrent normal and sync io slow,
395 * but it will at least behave sensibly when they are
396 * in sequence.
397 */
392 ret = filemap_write_and_wait(inode->i_mapping); 398 ret = filemap_write_and_wait(inode->i_mapping);
393 if (ret < 0) 399 if (ret < 0)
394 goto done; 400 goto done;
395 401
396 ret = striped_read(inode, off, len, pages, num_pages, checkeof); 402 ret = striped_read(inode, off, len, pages, num_pages, checkeof,
403 file->f_flags & O_DIRECT);
397 404
398 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0) 405 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
399 ret = ceph_copy_page_vector_to_user(pages, data, off, ret); 406 ret = ceph_copy_page_vector_to_user(pages, data, off, ret);
@@ -448,6 +455,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
448 int flags; 455 int flags;
449 int do_sync = 0; 456 int do_sync = 0;
450 int check_caps = 0; 457 int check_caps = 0;
458 int page_align, io_align;
451 int ret; 459 int ret;
452 struct timespec mtime = CURRENT_TIME; 460 struct timespec mtime = CURRENT_TIME;
453 461
@@ -462,6 +470,8 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
462 else 470 else
463 pos = *offset; 471 pos = *offset;
464 472
473 io_align = pos & ~PAGE_MASK;
474
465 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left); 475 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
466 if (ret < 0) 476 if (ret < 0)
467 return ret; 477 return ret;
@@ -486,20 +496,26 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
486 */ 496 */
487more: 497more:
488 len = left; 498 len = left;
499 if (file->f_flags & O_DIRECT)
500 /* write from beginning of first page, regardless of
501 io alignment */
502 page_align = (pos - io_align) & ~PAGE_MASK;
503 else
504 page_align = pos & ~PAGE_MASK;
489 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, 505 req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
490 ceph_vino(inode), pos, &len, 506 ceph_vino(inode), pos, &len,
491 CEPH_OSD_OP_WRITE, flags, 507 CEPH_OSD_OP_WRITE, flags,
492 ci->i_snap_realm->cached_context, 508 ci->i_snap_realm->cached_context,
493 do_sync, 509 do_sync,
494 ci->i_truncate_seq, ci->i_truncate_size, 510 ci->i_truncate_seq, ci->i_truncate_size,
495 &mtime, false, 2); 511 &mtime, false, 2, page_align);
496 if (!req) 512 if (!req)
497 return -ENOMEM; 513 return -ENOMEM;
498 514
499 num_pages = calc_pages_for(pos, len); 515 num_pages = calc_pages_for(pos, len);
500 516
501 if (file->f_flags & O_DIRECT) { 517 if (file->f_flags & O_DIRECT) {
502 pages = ceph_get_direct_page_vector(data, num_pages, pos, len); 518 pages = ceph_get_direct_page_vector(data, num_pages);
503 if (IS_ERR(pages)) { 519 if (IS_ERR(pages)) {
504 ret = PTR_ERR(pages); 520 ret = PTR_ERR(pages);
505 goto out; 521 goto out;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 524b80be4482..bf1286588f26 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -470,7 +470,9 @@ void ceph_fill_file_time(struct inode *inode, int issued,
470 470
471 if (issued & (CEPH_CAP_FILE_EXCL| 471 if (issued & (CEPH_CAP_FILE_EXCL|
472 CEPH_CAP_FILE_WR| 472 CEPH_CAP_FILE_WR|
473 CEPH_CAP_FILE_BUFFER)) { 473 CEPH_CAP_FILE_BUFFER|
474 CEPH_CAP_AUTH_EXCL|
475 CEPH_CAP_XATTR_EXCL)) {
474 if (timespec_compare(ctime, &inode->i_ctime) > 0) { 476 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
475 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n", 477 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
476 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec, 478 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
@@ -510,7 +512,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
510 warn = 1; 512 warn = 1;
511 } 513 }
512 } else { 514 } else {
513 /* we have no write caps; whatever the MDS says is true */ 515 /* we have no write|excl caps; whatever the MDS says is true */
514 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) { 516 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
515 inode->i_ctime = *ctime; 517 inode->i_ctime = *ctime;
516 inode->i_mtime = *mtime; 518 inode->i_mtime = *mtime;
@@ -566,12 +568,17 @@ static int fill_inode(struct inode *inode,
566 568
567 /* 569 /*
568 * provided version will be odd if inode value is projected, 570 * provided version will be odd if inode value is projected,
569 * even if stable. skip the update if we have a newer info 571 * even if stable. skip the update if we have newer stable
570 * (e.g., due to inode info racing form multiple MDSs), or if 572 * info (ours>=theirs, e.g. due to racing mds replies), unless
571 * we are getting projected (unstable) inode info. 573 * we are getting projected (unstable) info (in which case the
574 * version is odd, and we want ours>theirs).
575 * us them
576 * 2 2 skip
577 * 3 2 skip
578 * 3 3 update
572 */ 579 */
573 if (le64_to_cpu(info->version) > 0 && 580 if (le64_to_cpu(info->version) > 0 &&
574 (ci->i_version & ~1) > le64_to_cpu(info->version)) 581 (ci->i_version & ~1) >= le64_to_cpu(info->version))
575 goto no_change; 582 goto no_change;
576 583
577 issued = __ceph_caps_issued(ci, &implemented); 584 issued = __ceph_caps_issued(ci, &implemented);
@@ -605,7 +612,14 @@ static int fill_inode(struct inode *inode,
605 le32_to_cpu(info->time_warp_seq), 612 le32_to_cpu(info->time_warp_seq),
606 &ctime, &mtime, &atime); 613 &ctime, &mtime, &atime);
607 614
608 ci->i_max_size = le64_to_cpu(info->max_size); 615 /* only update max_size on auth cap */
616 if ((info->cap.flags & CEPH_CAP_FLAG_AUTH) &&
617 ci->i_max_size != le64_to_cpu(info->max_size)) {
618 dout("max_size %lld -> %llu\n", ci->i_max_size,
619 le64_to_cpu(info->max_size));
620 ci->i_max_size = le64_to_cpu(info->max_size);
621 }
622
609 ci->i_layout = info->layout; 623 ci->i_layout = info->layout;
610 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; 624 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
611 625
@@ -1054,7 +1068,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
1054 ininfo = rinfo->targeti.in; 1068 ininfo = rinfo->targeti.in;
1055 vino.ino = le64_to_cpu(ininfo->ino); 1069 vino.ino = le64_to_cpu(ininfo->ino);
1056 vino.snap = le64_to_cpu(ininfo->snapid); 1070 vino.snap = le64_to_cpu(ininfo->snapid);
1057 if (!dn->d_inode) { 1071 in = dn->d_inode;
1072 if (!in) {
1058 in = ceph_get_inode(sb, vino); 1073 in = ceph_get_inode(sb, vino);
1059 if (IS_ERR(in)) { 1074 if (IS_ERR(in)) {
1060 pr_err("fill_trace bad get_inode " 1075 pr_err("fill_trace bad get_inode "
@@ -1385,11 +1400,8 @@ static void ceph_invalidate_work(struct work_struct *work)
1385 spin_lock(&inode->i_lock); 1400 spin_lock(&inode->i_lock);
1386 dout("invalidate_pages %p gen %d revoking %d\n", inode, 1401 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1387 ci->i_rdcache_gen, ci->i_rdcache_revoking); 1402 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1388 if (ci->i_rdcache_gen == 0 || 1403 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1389 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1390 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1391 /* nevermind! */ 1404 /* nevermind! */
1392 ci->i_rdcache_revoking = 0;
1393 spin_unlock(&inode->i_lock); 1405 spin_unlock(&inode->i_lock);
1394 goto out; 1406 goto out;
1395 } 1407 }
@@ -1399,15 +1411,16 @@ static void ceph_invalidate_work(struct work_struct *work)
1399 ceph_invalidate_nondirty_pages(inode->i_mapping); 1411 ceph_invalidate_nondirty_pages(inode->i_mapping);
1400 1412
1401 spin_lock(&inode->i_lock); 1413 spin_lock(&inode->i_lock);
1402 if (orig_gen == ci->i_rdcache_gen) { 1414 if (orig_gen == ci->i_rdcache_gen &&
1415 orig_gen == ci->i_rdcache_revoking) {
1403 dout("invalidate_pages %p gen %d successful\n", inode, 1416 dout("invalidate_pages %p gen %d successful\n", inode,
1404 ci->i_rdcache_gen); 1417 ci->i_rdcache_gen);
1405 ci->i_rdcache_gen = 0; 1418 ci->i_rdcache_revoking--;
1406 ci->i_rdcache_revoking = 0;
1407 check = 1; 1419 check = 1;
1408 } else { 1420 } else {
1409 dout("invalidate_pages %p gen %d raced, gen now %d\n", 1421 dout("invalidate_pages %p gen %d raced, now %d revoking %d\n",
1410 inode, orig_gen, ci->i_rdcache_gen); 1422 inode, orig_gen, ci->i_rdcache_gen,
1423 ci->i_rdcache_revoking);
1411 } 1424 }
1412 spin_unlock(&inode->i_lock); 1425 spin_unlock(&inode->i_lock);
1413 1426
@@ -1738,7 +1751,7 @@ int ceph_do_getattr(struct inode *inode, int mask)
1738 return 0; 1751 return 0;
1739 } 1752 }
1740 1753
1741 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask)); 1754 dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode);
1742 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) 1755 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1743 return 0; 1756 return 0;
1744 1757
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 7799cac2b629..098b18508479 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -528,6 +528,9 @@ static void __register_request(struct ceph_mds_client *mdsc,
528 ceph_mdsc_get_request(req); 528 ceph_mdsc_get_request(req);
529 __insert_request(mdsc, req); 529 __insert_request(mdsc, req);
530 530
531 req->r_uid = current_fsuid();
532 req->r_gid = current_fsgid();
533
531 if (dir) { 534 if (dir) {
532 struct ceph_inode_info *ci = ceph_inode(dir); 535 struct ceph_inode_info *ci = ceph_inode(dir);
533 536
@@ -1587,8 +1590,8 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1587 1590
1588 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch); 1591 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1589 head->op = cpu_to_le32(req->r_op); 1592 head->op = cpu_to_le32(req->r_op);
1590 head->caller_uid = cpu_to_le32(current_fsuid()); 1593 head->caller_uid = cpu_to_le32(req->r_uid);
1591 head->caller_gid = cpu_to_le32(current_fsgid()); 1594 head->caller_gid = cpu_to_le32(req->r_gid);
1592 head->args = req->r_args; 1595 head->args = req->r_args;
1593 1596
1594 ceph_encode_filepath(&p, end, ino1, path1); 1597 ceph_encode_filepath(&p, end, ino1, path1);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d66d63c72355..9341fd4f1432 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -170,6 +170,8 @@ struct ceph_mds_request {
170 170
171 union ceph_mds_request_args r_args; 171 union ceph_mds_request_args r_args;
172 int r_fmode; /* file mode, if expecting cap */ 172 int r_fmode; /* file mode, if expecting cap */
173 uid_t r_uid;
174 gid_t r_gid;
173 175
174 /* for choosing which mds to send this request to */ 176 /* for choosing which mds to send this request to */
175 int r_direct_mode; 177 int r_direct_mode;
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 1886294e12f7..7f01728a4657 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -293,9 +293,7 @@ struct ceph_inode_info {
293 int i_rd_ref, i_rdcache_ref, i_wr_ref; 293 int i_rd_ref, i_rdcache_ref, i_wr_ref;
294 int i_wrbuffer_ref, i_wrbuffer_ref_head; 294 int i_wrbuffer_ref, i_wrbuffer_ref_head;
295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */ 295 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
296 u32 i_rdcache_gen; /* we increment this each time we get 296 u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */
297 FILE_CACHE. If it's non-zero, we
298 _may_ have cached pages. */
299 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ 297 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
300 298
301 struct list_head i_unsafe_writes; /* uncommitted sync writes */ 299 struct list_head i_unsafe_writes; /* uncommitted sync writes */
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index f22b2e941686..9e76d35670d2 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -227,8 +227,7 @@ extern int ceph_open_session(struct ceph_client *client);
227extern void ceph_release_page_vector(struct page **pages, int num_pages); 227extern void ceph_release_page_vector(struct page **pages, int num_pages);
228 228
229extern struct page **ceph_get_direct_page_vector(const char __user *data, 229extern struct page **ceph_get_direct_page_vector(const char __user *data,
230 int num_pages, 230 int num_pages);
231 loff_t off, size_t len);
232extern void ceph_put_page_vector(struct page **pages, int num_pages); 231extern void ceph_put_page_vector(struct page **pages, int num_pages);
233extern void ceph_release_page_vector(struct page **pages, int num_pages); 232extern void ceph_release_page_vector(struct page **pages, int num_pages);
234extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); 233extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index 5956d62c3057..a108b425fee2 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -82,6 +82,7 @@ struct ceph_msg {
82 struct ceph_buffer *middle; 82 struct ceph_buffer *middle;
83 struct page **pages; /* data payload. NOT OWNER. */ 83 struct page **pages; /* data payload. NOT OWNER. */
84 unsigned nr_pages; /* size of page array */ 84 unsigned nr_pages; /* size of page array */
85 unsigned page_alignment; /* io offset in first page */
85 struct ceph_pagelist *pagelist; /* instead of pages */ 86 struct ceph_pagelist *pagelist; /* instead of pages */
86 struct list_head list_head; 87 struct list_head list_head;
87 struct kref kref; 88 struct kref kref;
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 6c91fb032c39..a1af29648fb5 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -79,6 +79,7 @@ struct ceph_osd_request {
79 struct ceph_file_layout r_file_layout; 79 struct ceph_file_layout r_file_layout;
80 struct ceph_snap_context *r_snapc; /* snap context for writes */ 80 struct ceph_snap_context *r_snapc; /* snap context for writes */
81 unsigned r_num_pages; /* size of page array (follows) */ 81 unsigned r_num_pages; /* size of page array (follows) */
82 unsigned r_page_alignment; /* io offset in first page */
82 struct page **r_pages; /* pages for data payload */ 83 struct page **r_pages; /* pages for data payload */
83 int r_pages_from_pool; 84 int r_pages_from_pool;
84 int r_own_pages; /* if true, i own page list */ 85 int r_own_pages; /* if true, i own page list */
@@ -194,7 +195,8 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
194 int do_sync, u32 truncate_seq, 195 int do_sync, u32 truncate_seq,
195 u64 truncate_size, 196 u64 truncate_size,
196 struct timespec *mtime, 197 struct timespec *mtime,
197 bool use_mempool, int num_reply); 198 bool use_mempool, int num_reply,
199 int page_align);
198 200
199static inline void ceph_osdc_get_request(struct ceph_osd_request *req) 201static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
200{ 202{
@@ -218,7 +220,8 @@ extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
218 struct ceph_file_layout *layout, 220 struct ceph_file_layout *layout,
219 u64 off, u64 *plen, 221 u64 off, u64 *plen,
220 u32 truncate_seq, u64 truncate_size, 222 u32 truncate_seq, u64 truncate_size,
221 struct page **pages, int nr_pages); 223 struct page **pages, int nr_pages,
224 int page_align);
222 225
223extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, 226extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
224 struct ceph_vino vino, 227 struct ceph_vino vino,
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index 0e8157ee5d43..1c7a2ec4f3cc 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -540,8 +540,7 @@ static void prepare_write_message(struct ceph_connection *con)
540 /* initialize page iterator */ 540 /* initialize page iterator */
541 con->out_msg_pos.page = 0; 541 con->out_msg_pos.page = 0;
542 if (m->pages) 542 if (m->pages)
543 con->out_msg_pos.page_pos = 543 con->out_msg_pos.page_pos = m->page_alignment;
544 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
545 else 544 else
546 con->out_msg_pos.page_pos = 0; 545 con->out_msg_pos.page_pos = 0;
547 con->out_msg_pos.data_pos = 0; 546 con->out_msg_pos.data_pos = 0;
@@ -1491,7 +1490,7 @@ static int read_partial_message(struct ceph_connection *con)
1491 struct ceph_msg *m = con->in_msg; 1490 struct ceph_msg *m = con->in_msg;
1492 int ret; 1491 int ret;
1493 int to, left; 1492 int to, left;
1494 unsigned front_len, middle_len, data_len, data_off; 1493 unsigned front_len, middle_len, data_len;
1495 int datacrc = con->msgr->nocrc; 1494 int datacrc = con->msgr->nocrc;
1496 int skip; 1495 int skip;
1497 u64 seq; 1496 u64 seq;
@@ -1527,19 +1526,17 @@ static int read_partial_message(struct ceph_connection *con)
1527 data_len = le32_to_cpu(con->in_hdr.data_len); 1526 data_len = le32_to_cpu(con->in_hdr.data_len);
1528 if (data_len > CEPH_MSG_MAX_DATA_LEN) 1527 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1529 return -EIO; 1528 return -EIO;
1530 data_off = le16_to_cpu(con->in_hdr.data_off);
1531 1529
1532 /* verify seq# */ 1530 /* verify seq# */
1533 seq = le64_to_cpu(con->in_hdr.seq); 1531 seq = le64_to_cpu(con->in_hdr.seq);
1534 if ((s64)seq - (s64)con->in_seq < 1) { 1532 if ((s64)seq - (s64)con->in_seq < 1) {
1535 pr_info("skipping %s%lld %s seq %lld, expected %lld\n", 1533 pr_info("skipping %s%lld %s seq %lld expected %lld\n",
1536 ENTITY_NAME(con->peer_name), 1534 ENTITY_NAME(con->peer_name),
1537 ceph_pr_addr(&con->peer_addr.in_addr), 1535 ceph_pr_addr(&con->peer_addr.in_addr),
1538 seq, con->in_seq + 1); 1536 seq, con->in_seq + 1);
1539 con->in_base_pos = -front_len - middle_len - data_len - 1537 con->in_base_pos = -front_len - middle_len - data_len -
1540 sizeof(m->footer); 1538 sizeof(m->footer);
1541 con->in_tag = CEPH_MSGR_TAG_READY; 1539 con->in_tag = CEPH_MSGR_TAG_READY;
1542 con->in_seq++;
1543 return 0; 1540 return 0;
1544 } else if ((s64)seq - (s64)con->in_seq > 1) { 1541 } else if ((s64)seq - (s64)con->in_seq > 1) {
1545 pr_err("read_partial_message bad seq %lld expected %lld\n", 1542 pr_err("read_partial_message bad seq %lld expected %lld\n",
@@ -1576,7 +1573,7 @@ static int read_partial_message(struct ceph_connection *con)
1576 1573
1577 con->in_msg_pos.page = 0; 1574 con->in_msg_pos.page = 0;
1578 if (m->pages) 1575 if (m->pages)
1579 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK; 1576 con->in_msg_pos.page_pos = m->page_alignment;
1580 else 1577 else
1581 con->in_msg_pos.page_pos = 0; 1578 con->in_msg_pos.page_pos = 0;
1582 con->in_msg_pos.data_pos = 0; 1579 con->in_msg_pos.data_pos = 0;
@@ -2301,6 +2298,7 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
2301 2298
2302 /* data */ 2299 /* data */
2303 m->nr_pages = 0; 2300 m->nr_pages = 0;
2301 m->page_alignment = 0;
2304 m->pages = NULL; 2302 m->pages = NULL;
2305 m->pagelist = NULL; 2303 m->pagelist = NULL;
2306 m->bio = NULL; 2304 m->bio = NULL;
@@ -2370,6 +2368,7 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2370 type, front_len); 2368 type, front_len);
2371 return NULL; 2369 return NULL;
2372 } 2370 }
2371 msg->page_alignment = le16_to_cpu(hdr->data_off);
2373 } 2372 }
2374 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr)); 2373 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2375 2374
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 79391994b3ed..3e20a122ffa2 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -71,6 +71,7 @@ void ceph_calc_raw_layout(struct ceph_osd_client *osdc,
71 op->extent.length = objlen; 71 op->extent.length = objlen;
72 } 72 }
73 req->r_num_pages = calc_pages_for(off, *plen); 73 req->r_num_pages = calc_pages_for(off, *plen);
74 req->r_page_alignment = off & ~PAGE_MASK;
74 if (op->op == CEPH_OSD_OP_WRITE) 75 if (op->op == CEPH_OSD_OP_WRITE)
75 op->payload_len = *plen; 76 op->payload_len = *plen;
76 77
@@ -390,6 +391,8 @@ void ceph_osdc_build_request(struct ceph_osd_request *req,
390 req->r_request->hdr.data_len = cpu_to_le32(data_len); 391 req->r_request->hdr.data_len = cpu_to_le32(data_len);
391 } 392 }
392 393
394 req->r_request->page_alignment = req->r_page_alignment;
395
393 BUG_ON(p > msg->front.iov_base + msg->front.iov_len); 396 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
394 msg_size = p - msg->front.iov_base; 397 msg_size = p - msg->front.iov_base;
395 msg->front.iov_len = msg_size; 398 msg->front.iov_len = msg_size;
@@ -419,7 +422,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
419 u32 truncate_seq, 422 u32 truncate_seq,
420 u64 truncate_size, 423 u64 truncate_size,
421 struct timespec *mtime, 424 struct timespec *mtime,
422 bool use_mempool, int num_reply) 425 bool use_mempool, int num_reply,
426 int page_align)
423{ 427{
424 struct ceph_osd_req_op ops[3]; 428 struct ceph_osd_req_op ops[3];
425 struct ceph_osd_request *req; 429 struct ceph_osd_request *req;
@@ -447,6 +451,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
447 calc_layout(osdc, vino, layout, off, plen, req, ops); 451 calc_layout(osdc, vino, layout, off, plen, req, ops);
448 req->r_file_layout = *layout; /* keep a copy */ 452 req->r_file_layout = *layout; /* keep a copy */
449 453
454 /* in case it differs from natural alignment that calc_layout
455 filled in for us */
456 req->r_page_alignment = page_align;
457
450 ceph_osdc_build_request(req, off, plen, ops, 458 ceph_osdc_build_request(req, off, plen, ops,
451 snapc, 459 snapc,
452 mtime, 460 mtime,
@@ -1489,7 +1497,7 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1489 struct ceph_vino vino, struct ceph_file_layout *layout, 1497 struct ceph_vino vino, struct ceph_file_layout *layout,
1490 u64 off, u64 *plen, 1498 u64 off, u64 *plen,
1491 u32 truncate_seq, u64 truncate_size, 1499 u32 truncate_seq, u64 truncate_size,
1492 struct page **pages, int num_pages) 1500 struct page **pages, int num_pages, int page_align)
1493{ 1501{
1494 struct ceph_osd_request *req; 1502 struct ceph_osd_request *req;
1495 int rc = 0; 1503 int rc = 0;
@@ -1499,15 +1507,15 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1499 req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 1507 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1500 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, 1508 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1501 NULL, 0, truncate_seq, truncate_size, NULL, 1509 NULL, 0, truncate_seq, truncate_size, NULL,
1502 false, 1); 1510 false, 1, page_align);
1503 if (!req) 1511 if (!req)
1504 return -ENOMEM; 1512 return -ENOMEM;
1505 1513
1506 /* it may be a short read due to an object boundary */ 1514 /* it may be a short read due to an object boundary */
1507 req->r_pages = pages; 1515 req->r_pages = pages;
1508 1516
1509 dout("readpages final extent is %llu~%llu (%d pages)\n", 1517 dout("readpages final extent is %llu~%llu (%d pages align %d)\n",
1510 off, *plen, req->r_num_pages); 1518 off, *plen, req->r_num_pages, page_align);
1511 1519
1512 rc = ceph_osdc_start_request(osdc, req, false); 1520 rc = ceph_osdc_start_request(osdc, req, false);
1513 if (!rc) 1521 if (!rc)
@@ -1533,6 +1541,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1533{ 1541{
1534 struct ceph_osd_request *req; 1542 struct ceph_osd_request *req;
1535 int rc = 0; 1543 int rc = 0;
1544 int page_align = off & ~PAGE_MASK;
1536 1545
1537 BUG_ON(vino.snap != CEPH_NOSNAP); 1546 BUG_ON(vino.snap != CEPH_NOSNAP);
1538 req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 1547 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
@@ -1541,7 +1550,7 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1541 CEPH_OSD_FLAG_WRITE, 1550 CEPH_OSD_FLAG_WRITE,
1542 snapc, do_sync, 1551 snapc, do_sync,
1543 truncate_seq, truncate_size, mtime, 1552 truncate_seq, truncate_size, mtime,
1544 nofail, 1); 1553 nofail, 1, page_align);
1545 if (!req) 1554 if (!req)
1546 return -ENOMEM; 1555 return -ENOMEM;
1547 1556
@@ -1638,8 +1647,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1638 m = ceph_msg_get(req->r_reply); 1647 m = ceph_msg_get(req->r_reply);
1639 1648
1640 if (data_len > 0) { 1649 if (data_len > 0) {
1641 unsigned data_off = le16_to_cpu(hdr->data_off); 1650 int want = calc_pages_for(req->r_page_alignment, data_len);
1642 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1643 1651
1644 if (unlikely(req->r_num_pages < want)) { 1652 if (unlikely(req->r_num_pages < want)) {
1645 pr_warning("tid %lld reply %d > expected %d pages\n", 1653 pr_warning("tid %lld reply %d > expected %d pages\n",
@@ -1651,6 +1659,7 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
1651 } 1659 }
1652 m->pages = req->r_pages; 1660 m->pages = req->r_pages;
1653 m->nr_pages = req->r_num_pages; 1661 m->nr_pages = req->r_num_pages;
1662 m->page_alignment = req->r_page_alignment;
1654#ifdef CONFIG_BLOCK 1663#ifdef CONFIG_BLOCK
1655 m->bio = req->r_bio; 1664 m->bio = req->r_bio;
1656#endif 1665#endif
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index 54caf0687155..ac34feeb2b3a 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -13,8 +13,7 @@
13 * build a vector of user pages 13 * build a vector of user pages
14 */ 14 */
15struct page **ceph_get_direct_page_vector(const char __user *data, 15struct page **ceph_get_direct_page_vector(const char __user *data,
16 int num_pages, 16 int num_pages)
17 loff_t off, size_t len)
18{ 17{
19 struct page **pages; 18 struct page **pages;
20 int rc; 19 int rc;