aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorSteve French <sfrench@us.ibm.com>2010-05-13 18:19:32 -0400
committerSteve French <sfrench@us.ibm.com>2010-05-13 18:19:32 -0400
commitbaa456331738b4e76a92318b62b354377a30ad80 (patch)
tree75c828a7c8a9f1b5f7f41b2e53271eafb7e561ef /fs/ceph
parentaa3e5572c538d753dce11bf93532a75f95d22b40 (diff)
parent4462dc02842698f173f518c1f5ce79c0fb89395a (diff)
Merge branch 'master' of /pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: fs/cifs/inode.c
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/addr.c8
-rw-r--r--fs/ceph/auth.c1
-rw-r--r--fs/ceph/auth_none.h2
-rw-r--r--fs/ceph/auth_x.c32
-rw-r--r--fs/ceph/caps.c21
-rw-r--r--fs/ceph/dir.c9
-rw-r--r--fs/ceph/file.c3
-rw-r--r--fs/ceph/inode.c8
-rw-r--r--fs/ceph/mds_client.c34
-rw-r--r--fs/ceph/messenger.c39
-rw-r--r--fs/ceph/messenger.h1
-rw-r--r--fs/ceph/osd_client.c26
-rw-r--r--fs/ceph/osd_client.h3
-rw-r--r--fs/ceph/osdmap.c29
-rw-r--r--fs/ceph/osdmap.h2
-rw-r--r--fs/ceph/rados.h1
-rw-r--r--fs/ceph/snap.c24
-rw-r--r--fs/ceph/super.c30
-rw-r--r--fs/ceph/super.h1
19 files changed, 187 insertions, 87 deletions
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 412593703d1e..a9005d862ed4 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -504,12 +504,11 @@ static void writepages_finish(struct ceph_osd_request *req,
504 int i; 504 int i;
505 struct ceph_snap_context *snapc = req->r_snapc; 505 struct ceph_snap_context *snapc = req->r_snapc;
506 struct address_space *mapping = inode->i_mapping; 506 struct address_space *mapping = inode->i_mapping;
507 struct writeback_control *wbc = req->r_wbc;
508 __s32 rc = -EIO; 507 __s32 rc = -EIO;
509 u64 bytes = 0; 508 u64 bytes = 0;
510 struct ceph_client *client = ceph_inode_to_client(inode); 509 struct ceph_client *client = ceph_inode_to_client(inode);
511 long writeback_stat; 510 long writeback_stat;
512 unsigned issued = __ceph_caps_issued(ci, NULL); 511 unsigned issued = ceph_caps_issued(ci);
513 512
514 /* parse reply */ 513 /* parse reply */
515 replyhead = msg->front.iov_base; 514 replyhead = msg->front.iov_base;
@@ -546,10 +545,6 @@ static void writepages_finish(struct ceph_osd_request *req,
546 clear_bdi_congested(&client->backing_dev_info, 545 clear_bdi_congested(&client->backing_dev_info,
547 BLK_RW_ASYNC); 546 BLK_RW_ASYNC);
548 547
549 if (i >= wrote) {
550 dout("inode %p skipping page %p\n", inode, page);
551 wbc->pages_skipped++;
552 }
553 ceph_put_snap_context((void *)page->private); 548 ceph_put_snap_context((void *)page->private);
554 page->private = 0; 549 page->private = 0;
555 ClearPagePrivate(page); 550 ClearPagePrivate(page);
@@ -799,7 +794,6 @@ get_more_pages:
799 alloc_page_vec(client, req); 794 alloc_page_vec(client, req);
800 req->r_callback = writepages_finish; 795 req->r_callback = writepages_finish;
801 req->r_inode = inode; 796 req->r_inode = inode;
802 req->r_wbc = wbc;
803 } 797 }
804 798
805 /* note position of first page in pvec */ 799 /* note position of first page in pvec */
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index f6394b94b866..818afe72e6c7 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h> 4#include <linux/slab.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/slab.h>
6 7
7#include "types.h" 8#include "types.h"
8#include "auth_none.h" 9#include "auth_none.h"
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
index 56c05533a31c..8164df1a08be 100644
--- a/fs/ceph/auth_none.h
+++ b/fs/ceph/auth_none.h
@@ -1,6 +1,8 @@
1#ifndef _FS_CEPH_AUTH_NONE_H 1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H 2#define _FS_CEPH_AUTH_NONE_H
3 3
4#include <linux/slab.h>
5
4#include "auth.h" 6#include "auth.h"
5 7
6/* 8/*
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index d9001a4dc8cc..fee5a08da881 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -12,8 +12,6 @@
12#include "auth.h" 12#include "auth.h"
13#include "decode.h" 13#include "decode.h"
14 14
15struct kmem_cache *ceph_x_ticketbuf_cachep;
16
17#define TEMP_TICKET_BUF_LEN 256 15#define TEMP_TICKET_BUF_LEN 256
18 16
19static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); 17static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
@@ -131,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
131 char *ticket_buf; 129 char *ticket_buf;
132 u8 struct_v; 130 u8 struct_v;
133 131
134 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC); 132 dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
135 if (!dbuf) 133 if (!dbuf)
136 return -ENOMEM; 134 return -ENOMEM;
137 135
138 ret = -ENOMEM; 136 ret = -ENOMEM;
139 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, 137 ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
140 GFP_NOFS | GFP_ATOMIC);
141 if (!ticket_buf) 138 if (!ticket_buf)
142 goto out_dbuf; 139 goto out_dbuf;
143 140
@@ -251,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
251 248
252 ret = 0; 249 ret = 0;
253out: 250out:
254 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf); 251 kfree(ticket_buf);
255out_dbuf: 252out_dbuf:
256 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf); 253 kfree(dbuf);
257 return ret; 254 return ret;
258 255
259bad: 256bad:
@@ -605,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
605 remove_ticket_handler(ac, th); 602 remove_ticket_handler(ac, th);
606 } 603 }
607 604
608 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
609
610 kfree(ac->private); 605 kfree(ac->private);
611 ac->private = NULL; 606 ac->private = NULL;
612} 607}
@@ -641,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac)
641 int ret; 636 int ret;
642 637
643 dout("ceph_x_init %p\n", ac); 638 dout("ceph_x_init %p\n", ac);
639 ret = -ENOMEM;
644 xi = kzalloc(sizeof(*xi), GFP_NOFS); 640 xi = kzalloc(sizeof(*xi), GFP_NOFS);
645 if (!xi) 641 if (!xi)
646 return -ENOMEM; 642 goto out;
647 643
648 ret = -ENOMEM;
649 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
650 TEMP_TICKET_BUF_LEN, 8,
651 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
652 NULL);
653 if (!ceph_x_ticketbuf_cachep)
654 goto done_nomem;
655 ret = -EINVAL; 644 ret = -EINVAL;
656 if (!ac->secret) { 645 if (!ac->secret) {
657 pr_err("no secret set (for auth_x protocol)\n"); 646 pr_err("no secret set (for auth_x protocol)\n");
658 goto done_nomem; 647 goto out_nomem;
659 } 648 }
660 649
661 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); 650 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
662 if (ret) 651 if (ret)
663 goto done_nomem; 652 goto out_nomem;
664 653
665 xi->starting = true; 654 xi->starting = true;
666 xi->ticket_handlers = RB_ROOT; 655 xi->ticket_handlers = RB_ROOT;
@@ -670,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac)
670 ac->ops = &ceph_x_ops; 659 ac->ops = &ceph_x_ops;
671 return 0; 660 return 0;
672 661
673done_nomem: 662out_nomem:
674 kfree(xi); 663 kfree(xi);
675 if (ceph_x_ticketbuf_cachep) 664out:
676 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
677 return ret; 665 return ret;
678} 666}
679 667
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index aa2239fa9a3b..d9400534b279 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -858,6 +858,8 @@ static int __ceph_is_any_caps(struct ceph_inode_info *ci)
858} 858}
859 859
860/* 860/*
861 * Remove a cap. Take steps to deal with a racing iterate_session_caps.
862 *
861 * caller should hold i_lock. 863 * caller should hold i_lock.
862 * caller will not hold session s_mutex if called from destroy_inode. 864 * caller will not hold session s_mutex if called from destroy_inode.
863 */ 865 */
@@ -866,15 +868,10 @@ void __ceph_remove_cap(struct ceph_cap *cap)
866 struct ceph_mds_session *session = cap->session; 868 struct ceph_mds_session *session = cap->session;
867 struct ceph_inode_info *ci = cap->ci; 869 struct ceph_inode_info *ci = cap->ci;
868 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc; 870 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
871 int removed = 0;
869 872
870 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode); 873 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
871 874
872 /* remove from inode list */
873 rb_erase(&cap->ci_node, &ci->i_caps);
874 cap->ci = NULL;
875 if (ci->i_auth_cap == cap)
876 ci->i_auth_cap = NULL;
877
878 /* remove from session list */ 875 /* remove from session list */
879 spin_lock(&session->s_cap_lock); 876 spin_lock(&session->s_cap_lock);
880 if (session->s_cap_iterator == cap) { 877 if (session->s_cap_iterator == cap) {
@@ -885,10 +882,18 @@ void __ceph_remove_cap(struct ceph_cap *cap)
885 list_del_init(&cap->session_caps); 882 list_del_init(&cap->session_caps);
886 session->s_nr_caps--; 883 session->s_nr_caps--;
887 cap->session = NULL; 884 cap->session = NULL;
885 removed = 1;
888 } 886 }
887 /* protect backpointer with s_cap_lock: see iterate_session_caps */
888 cap->ci = NULL;
889 spin_unlock(&session->s_cap_lock); 889 spin_unlock(&session->s_cap_lock);
890 890
891 if (cap->session == NULL) 891 /* remove from inode list */
892 rb_erase(&cap->ci_node, &ci->i_caps);
893 if (ci->i_auth_cap == cap)
894 ci->i_auth_cap = NULL;
895
896 if (removed)
892 ceph_put_cap(cap); 897 ceph_put_cap(cap);
893 898
894 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) { 899 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
@@ -1861,8 +1866,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1861 } else { 1866 } else {
1862 pr_err("%p auth cap %p not mds%d ???\n", inode, 1867 pr_err("%p auth cap %p not mds%d ???\n", inode,
1863 cap, session->s_mds); 1868 cap, session->s_mds);
1864 spin_unlock(&inode->i_lock);
1865 } 1869 }
1870 spin_unlock(&inode->i_lock);
1866 } 1871 }
1867} 1872}
1868 1873
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index ea8ee2e526aa..650d2db5ed26 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -880,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
880 * do_request, above). If there is no trace, we need 880 * do_request, above). If there is no trace, we need
881 * to do it here. 881 * to do it here.
882 */ 882 */
883
884 /* d_move screws up d_subdirs order */
885 ceph_i_clear(new_dir, CEPH_I_COMPLETE);
886
883 d_move(old_dentry, new_dentry); 887 d_move(old_dentry, new_dentry);
888
889 /* ensure target dentry is invalidated, despite
890 rehashing bug in vfs_rename_dir */
891 new_dentry->d_time = jiffies;
892 ceph_dentry(new_dentry)->lease_shared_gen = 0;
884 } 893 }
885 ceph_mdsc_put_request(req); 894 ceph_mdsc_put_request(req);
886 return err; 895 return err;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4add3d5da2c1..ed6f19721d6e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -665,7 +665,8 @@ more:
665 * throw out any page cache pages in this range. this 665 * throw out any page cache pages in this range. this
666 * may block. 666 * may block.
667 */ 667 */
668 truncate_inode_pages_range(inode->i_mapping, pos, pos+len); 668 truncate_inode_pages_range(inode->i_mapping, pos,
669 (pos+len) | (PAGE_CACHE_SIZE-1));
669 } else { 670 } else {
670 pages = alloc_page_vector(num_pages); 671 pages = alloc_page_vector(num_pages);
671 if (IS_ERR(pages)) { 672 if (IS_ERR(pages)) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 26f883c275e8..85b4d2ffdeba 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -733,6 +733,10 @@ no_change:
733 __ceph_get_fmode(ci, cap_fmode); 733 __ceph_get_fmode(ci, cap_fmode);
734 spin_unlock(&inode->i_lock); 734 spin_unlock(&inode->i_lock);
735 } 735 }
736 } else if (cap_fmode >= 0) {
737 pr_warning("mds issued no caps on %llx.%llx\n",
738 ceph_vinop(inode));
739 __ceph_get_fmode(ci, cap_fmode);
736 } 740 }
737 741
738 /* update delegation info? */ 742 /* update delegation info? */
@@ -997,6 +1001,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
997 dn, dn->d_name.len, dn->d_name.name); 1001 dn, dn->d_name.len, dn->d_name.name);
998 dout("fill_trace doing d_move %p -> %p\n", 1002 dout("fill_trace doing d_move %p -> %p\n",
999 req->r_old_dentry, dn); 1003 req->r_old_dentry, dn);
1004
1005 /* d_move screws up d_subdirs order */
1006 ceph_i_clear(dir, CEPH_I_COMPLETE);
1007
1000 d_move(req->r_old_dentry, dn); 1008 d_move(req->r_old_dentry, dn);
1001 dout(" src %p '%.*s' dst %p '%.*s'\n", 1009 dout(" src %p '%.*s' dst %p '%.*s'\n",
1002 req->r_old_dentry, 1010 req->r_old_dentry,
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 60a9a4ae47be..24561a557e01 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -736,9 +736,10 @@ static void cleanup_cap_releases(struct ceph_mds_session *session)
736} 736}
737 737
738/* 738/*
739 * Helper to safely iterate over all caps associated with a session. 739 * Helper to safely iterate over all caps associated with a session, with
740 * special care taken to handle a racing __ceph_remove_cap().
740 * 741 *
741 * caller must hold session s_mutex 742 * Caller must hold session s_mutex.
742 */ 743 */
743static int iterate_session_caps(struct ceph_mds_session *session, 744static int iterate_session_caps(struct ceph_mds_session *session,
744 int (*cb)(struct inode *, struct ceph_cap *, 745 int (*cb)(struct inode *, struct ceph_cap *,
@@ -2136,7 +2137,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2136 struct ceph_mds_session *session = NULL; 2137 struct ceph_mds_session *session = NULL;
2137 struct ceph_msg *reply; 2138 struct ceph_msg *reply;
2138 struct rb_node *p; 2139 struct rb_node *p;
2139 int err; 2140 int err = -ENOMEM;
2140 struct ceph_pagelist *pagelist; 2141 struct ceph_pagelist *pagelist;
2141 2142
2142 pr_info("reconnect to recovering mds%d\n", mds); 2143 pr_info("reconnect to recovering mds%d\n", mds);
@@ -2185,7 +2186,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2185 goto fail; 2186 goto fail;
2186 err = iterate_session_caps(session, encode_caps_cb, pagelist); 2187 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2187 if (err < 0) 2188 if (err < 0)
2188 goto out; 2189 goto fail;
2189 2190
2190 /* 2191 /*
2191 * snaprealms. we provide mds with the ino, seq (version), and 2192 * snaprealms. we provide mds with the ino, seq (version), and
@@ -2213,28 +2214,31 @@ send:
2213 reply->nr_pages = calc_pages_for(0, pagelist->length); 2214 reply->nr_pages = calc_pages_for(0, pagelist->length);
2214 ceph_con_send(&session->s_con, reply); 2215 ceph_con_send(&session->s_con, reply);
2215 2216
2216 if (session) { 2217 session->s_state = CEPH_MDS_SESSION_OPEN;
2217 session->s_state = CEPH_MDS_SESSION_OPEN; 2218 mutex_unlock(&session->s_mutex);
2218 __wake_requests(mdsc, &session->s_waiting); 2219
2219 } 2220 mutex_lock(&mdsc->mutex);
2221 __wake_requests(mdsc, &session->s_waiting);
2222 mutex_unlock(&mdsc->mutex);
2223
2224 ceph_put_mds_session(session);
2220 2225
2221out:
2222 up_read(&mdsc->snap_rwsem); 2226 up_read(&mdsc->snap_rwsem);
2223 if (session) {
2224 mutex_unlock(&session->s_mutex);
2225 ceph_put_mds_session(session);
2226 }
2227 mutex_lock(&mdsc->mutex); 2227 mutex_lock(&mdsc->mutex);
2228 return; 2228 return;
2229 2229
2230fail: 2230fail:
2231 ceph_msg_put(reply); 2231 ceph_msg_put(reply);
2232 up_read(&mdsc->snap_rwsem);
2233 mutex_unlock(&session->s_mutex);
2234 ceph_put_mds_session(session);
2232fail_nomsg: 2235fail_nomsg:
2233 ceph_pagelist_release(pagelist); 2236 ceph_pagelist_release(pagelist);
2234 kfree(pagelist); 2237 kfree(pagelist);
2235fail_nopagelist: 2238fail_nopagelist:
2236 pr_err("ENOMEM preparing reconnect for mds%d\n", mds); 2239 pr_err("error %d preparing reconnect for mds%d\n", err, mds);
2237 goto out; 2240 mutex_lock(&mdsc->mutex);
2241 return;
2238} 2242}
2239 2243
2240 2244
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cdaaa131add3..cd4fadb6491a 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -492,7 +492,14 @@ static void prepare_write_message(struct ceph_connection *con)
492 list_move_tail(&m->list_head, &con->out_sent); 492 list_move_tail(&m->list_head, &con->out_sent);
493 } 493 }
494 494
495 m->hdr.seq = cpu_to_le64(++con->out_seq); 495 /*
496 * only assign outgoing seq # if we haven't sent this message
497 * yet. if it is requeued, resend with it's original seq.
498 */
499 if (m->needs_out_seq) {
500 m->hdr.seq = cpu_to_le64(++con->out_seq);
501 m->needs_out_seq = false;
502 }
496 503
497 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n", 504 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
498 m, con->out_seq, le16_to_cpu(m->hdr.type), 505 m, con->out_seq, le16_to_cpu(m->hdr.type),
@@ -1334,6 +1341,7 @@ static int read_partial_message(struct ceph_connection *con)
1334 unsigned front_len, middle_len, data_len, data_off; 1341 unsigned front_len, middle_len, data_len, data_off;
1335 int datacrc = con->msgr->nocrc; 1342 int datacrc = con->msgr->nocrc;
1336 int skip; 1343 int skip;
1344 u64 seq;
1337 1345
1338 dout("read_partial_message con %p msg %p\n", con, m); 1346 dout("read_partial_message con %p msg %p\n", con, m);
1339 1347
@@ -1368,6 +1376,25 @@ static int read_partial_message(struct ceph_connection *con)
1368 return -EIO; 1376 return -EIO;
1369 data_off = le16_to_cpu(con->in_hdr.data_off); 1377 data_off = le16_to_cpu(con->in_hdr.data_off);
1370 1378
1379 /* verify seq# */
1380 seq = le64_to_cpu(con->in_hdr.seq);
1381 if ((s64)seq - (s64)con->in_seq < 1) {
1382 pr_info("skipping %s%lld %s seq %lld, expected %lld\n",
1383 ENTITY_NAME(con->peer_name),
1384 pr_addr(&con->peer_addr.in_addr),
1385 seq, con->in_seq + 1);
1386 con->in_base_pos = -front_len - middle_len - data_len -
1387 sizeof(m->footer);
1388 con->in_tag = CEPH_MSGR_TAG_READY;
1389 con->in_seq++;
1390 return 0;
1391 } else if ((s64)seq - (s64)con->in_seq > 1) {
1392 pr_err("read_partial_message bad seq %lld expected %lld\n",
1393 seq, con->in_seq + 1);
1394 con->error_msg = "bad message sequence # for incoming message";
1395 return -EBADMSG;
1396 }
1397
1371 /* allocate message? */ 1398 /* allocate message? */
1372 if (!con->in_msg) { 1399 if (!con->in_msg) {
1373 dout("got hdr type %d front %d data %d\n", con->in_hdr.type, 1400 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
@@ -1379,6 +1406,7 @@ static int read_partial_message(struct ceph_connection *con)
1379 con->in_base_pos = -front_len - middle_len - data_len - 1406 con->in_base_pos = -front_len - middle_len - data_len -
1380 sizeof(m->footer); 1407 sizeof(m->footer);
1381 con->in_tag = CEPH_MSGR_TAG_READY; 1408 con->in_tag = CEPH_MSGR_TAG_READY;
1409 con->in_seq++;
1382 return 0; 1410 return 0;
1383 } 1411 }
1384 if (IS_ERR(con->in_msg)) { 1412 if (IS_ERR(con->in_msg)) {
@@ -1965,6 +1993,8 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1965 1993
1966 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len)); 1994 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1967 1995
1996 msg->needs_out_seq = true;
1997
1968 /* queue */ 1998 /* queue */
1969 mutex_lock(&con->mutex); 1999 mutex_lock(&con->mutex);
1970 BUG_ON(!list_empty(&msg->list_head)); 2000 BUG_ON(!list_empty(&msg->list_head));
@@ -2030,6 +2060,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2030 ceph_msg_put(con->in_msg); 2060 ceph_msg_put(con->in_msg);
2031 con->in_msg = NULL; 2061 con->in_msg = NULL;
2032 con->in_tag = CEPH_MSGR_TAG_READY; 2062 con->in_tag = CEPH_MSGR_TAG_READY;
2063 con->in_seq++;
2033 } else { 2064 } else {
2034 dout("con_revoke_pages %p msg %p pages %p no-op\n", 2065 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2035 con, con->in_msg, msg); 2066 con, con->in_msg, msg);
@@ -2063,15 +2094,19 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
2063 kref_init(&m->kref); 2094 kref_init(&m->kref);
2064 INIT_LIST_HEAD(&m->list_head); 2095 INIT_LIST_HEAD(&m->list_head);
2065 2096
2097 m->hdr.tid = 0;
2066 m->hdr.type = cpu_to_le16(type); 2098 m->hdr.type = cpu_to_le16(type);
2099 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2100 m->hdr.version = 0;
2067 m->hdr.front_len = cpu_to_le32(front_len); 2101 m->hdr.front_len = cpu_to_le32(front_len);
2068 m->hdr.middle_len = 0; 2102 m->hdr.middle_len = 0;
2069 m->hdr.data_len = cpu_to_le32(page_len); 2103 m->hdr.data_len = cpu_to_le32(page_len);
2070 m->hdr.data_off = cpu_to_le16(page_off); 2104 m->hdr.data_off = cpu_to_le16(page_off);
2071 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT); 2105 m->hdr.reserved = 0;
2072 m->footer.front_crc = 0; 2106 m->footer.front_crc = 0;
2073 m->footer.middle_crc = 0; 2107 m->footer.middle_crc = 0;
2074 m->footer.data_crc = 0; 2108 m->footer.data_crc = 0;
2109 m->footer.flags = 0;
2075 m->front_max = front_len; 2110 m->front_max = front_len;
2076 m->front_is_vmalloc = false; 2111 m->front_is_vmalloc = false;
2077 m->more_to_follow = false; 2112 m->more_to_follow = false;
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a343dae73cdc..a5caf91cc971 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -86,6 +86,7 @@ struct ceph_msg {
86 struct kref kref; 86 struct kref kref;
87 bool front_is_vmalloc; 87 bool front_is_vmalloc;
88 bool more_to_follow; 88 bool more_to_follow;
89 bool needs_out_seq;
89 int front_max; 90 int front_max;
90 91
91 struct ceph_msgpool *pool; 92 struct ceph_msgpool *pool;
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index c7b4dedaace6..3514f71ff85f 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -565,7 +565,8 @@ static int __map_osds(struct ceph_osd_client *osdc,
565{ 565{
566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base; 566 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
567 struct ceph_pg pgid; 567 struct ceph_pg pgid;
568 int o = -1; 568 int acting[CEPH_PG_MAX_SIZE];
569 int o = -1, num = 0;
569 int err; 570 int err;
570 571
571 dout("map_osds %p tid %lld\n", req, req->r_tid); 572 dout("map_osds %p tid %lld\n", req, req->r_tid);
@@ -576,10 +577,16 @@ static int __map_osds(struct ceph_osd_client *osdc,
576 pgid = reqhead->layout.ol_pgid; 577 pgid = reqhead->layout.ol_pgid;
577 req->r_pgid = pgid; 578 req->r_pgid = pgid;
578 579
579 o = ceph_calc_pg_primary(osdc->osdmap, pgid); 580 err = ceph_calc_pg_acting(osdc->osdmap, pgid, acting);
581 if (err > 0) {
582 o = acting[0];
583 num = err;
584 }
580 585
581 if ((req->r_osd && req->r_osd->o_osd == o && 586 if ((req->r_osd && req->r_osd->o_osd == o &&
582 req->r_sent >= req->r_osd->o_incarnation) || 587 req->r_sent >= req->r_osd->o_incarnation &&
588 req->r_num_pg_osds == num &&
589 memcmp(req->r_pg_osds, acting, sizeof(acting[0])*num) == 0) ||
583 (req->r_osd == NULL && o == -1)) 590 (req->r_osd == NULL && o == -1))
584 return 0; /* no change */ 591 return 0; /* no change */
585 592
@@ -587,6 +594,10 @@ static int __map_osds(struct ceph_osd_client *osdc,
587 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o, 594 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
588 req->r_osd ? req->r_osd->o_osd : -1); 595 req->r_osd ? req->r_osd->o_osd : -1);
589 596
597 /* record full pg acting set */
598 memcpy(req->r_pg_osds, acting, sizeof(acting[0]) * num);
599 req->r_num_pg_osds = num;
600
590 if (req->r_osd) { 601 if (req->r_osd) {
591 __cancel_request(req); 602 __cancel_request(req);
592 list_del_init(&req->r_osd_item); 603 list_del_init(&req->r_osd_item);
@@ -612,7 +623,7 @@ static int __map_osds(struct ceph_osd_client *osdc,
612 __remove_osd_from_lru(req->r_osd); 623 __remove_osd_from_lru(req->r_osd);
613 list_add(&req->r_osd_item, &req->r_osd->o_requests); 624 list_add(&req->r_osd_item, &req->r_osd->o_requests);
614 } 625 }
615 err = 1; /* osd changed */ 626 err = 1; /* osd or pg changed */
616 627
617out: 628out:
618 return err; 629 return err;
@@ -779,16 +790,18 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
779 struct ceph_osd_request *req; 790 struct ceph_osd_request *req;
780 u64 tid; 791 u64 tid;
781 int numops, object_len, flags; 792 int numops, object_len, flags;
793 s32 result;
782 794
783 tid = le64_to_cpu(msg->hdr.tid); 795 tid = le64_to_cpu(msg->hdr.tid);
784 if (msg->front.iov_len < sizeof(*rhead)) 796 if (msg->front.iov_len < sizeof(*rhead))
785 goto bad; 797 goto bad;
786 numops = le32_to_cpu(rhead->num_ops); 798 numops = le32_to_cpu(rhead->num_ops);
787 object_len = le32_to_cpu(rhead->object_len); 799 object_len = le32_to_cpu(rhead->object_len);
800 result = le32_to_cpu(rhead->result);
788 if (msg->front.iov_len != sizeof(*rhead) + object_len + 801 if (msg->front.iov_len != sizeof(*rhead) + object_len +
789 numops * sizeof(struct ceph_osd_op)) 802 numops * sizeof(struct ceph_osd_op))
790 goto bad; 803 goto bad;
791 dout("handle_reply %p tid %llu\n", msg, tid); 804 dout("handle_reply %p tid %llu result %d\n", msg, tid, (int)result);
792 805
793 /* lookup */ 806 /* lookup */
794 mutex_lock(&osdc->request_mutex); 807 mutex_lock(&osdc->request_mutex);
@@ -834,7 +847,8 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
834 dout("handle_reply tid %llu flags %d\n", tid, flags); 847 dout("handle_reply tid %llu flags %d\n", tid, flags);
835 848
836 /* either this is a read, or we got the safe response */ 849 /* either this is a read, or we got the safe response */
837 if ((flags & CEPH_OSD_FLAG_ONDISK) || 850 if (result < 0 ||
851 (flags & CEPH_OSD_FLAG_ONDISK) ||
838 ((flags & CEPH_OSD_FLAG_WRITE) == 0)) 852 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
839 __unregister_request(osdc, req); 853 __unregister_request(osdc, req);
840 854
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
index b0759911e7c3..ce776989ef6a 100644
--- a/fs/ceph/osd_client.h
+++ b/fs/ceph/osd_client.h
@@ -48,6 +48,8 @@ struct ceph_osd_request {
48 struct list_head r_osd_item; 48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd; 49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid; 50 struct ceph_pg r_pgid;
51 int r_pg_osds[CEPH_PG_MAX_SIZE];
52 int r_num_pg_osds;
51 53
52 struct ceph_connection *r_con_filling_msg; 54 struct ceph_connection *r_con_filling_msg;
53 55
@@ -66,7 +68,6 @@ struct ceph_osd_request {
66 struct list_head r_unsafe_item; 68 struct list_head r_unsafe_item;
67 69
68 struct inode *r_inode; /* for use by callbacks */ 70 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70 71
71 char r_oid[40]; /* object name */ 72 char r_oid[40]; /* object name */
72 int r_oid_len; 73 int r_oid_len;
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index 2e2c15eed82a..cfdd8f4388b7 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -1041,12 +1041,33 @@ static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1041} 1041}
1042 1042
1043/* 1043/*
1044 * Return acting set for given pgid.
1045 */
1046int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
1047 int *acting)
1048{
1049 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1050 int i, o, num = CEPH_PG_MAX_SIZE;
1051
1052 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1053 if (!osds)
1054 return -1;
1055
1056 /* primary is first up osd */
1057 o = 0;
1058 for (i = 0; i < num; i++)
1059 if (ceph_osd_is_up(osdmap, osds[i]))
1060 acting[o++] = osds[i];
1061 return o;
1062}
1063
1064/*
1044 * Return primary osd for given pgid, or -1 if none. 1065 * Return primary osd for given pgid, or -1 if none.
1045 */ 1066 */
1046int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid) 1067int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1047{ 1068{
1048 int rawosds[10], *osds; 1069 int rawosds[CEPH_PG_MAX_SIZE], *osds;
1049 int i, num = ARRAY_SIZE(rawosds); 1070 int i, num = CEPH_PG_MAX_SIZE;
1050 1071
1051 osds = calc_pg_raw(osdmap, pgid, rawosds, &num); 1072 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1052 if (!osds) 1073 if (!osds)
@@ -1054,9 +1075,7 @@ int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1054 1075
1055 /* primary is first up osd */ 1076 /* primary is first up osd */
1056 for (i = 0; i < num; i++) 1077 for (i = 0; i < num; i++)
1057 if (ceph_osd_is_up(osdmap, osds[i])) { 1078 if (ceph_osd_is_up(osdmap, osds[i]))
1058 return osds[i]; 1079 return osds[i];
1059 break;
1060 }
1061 return -1; 1080 return -1;
1062} 1081}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
index 8bc9f1e4f562..970b547e510d 100644
--- a/fs/ceph/osdmap.h
+++ b/fs/ceph/osdmap.h
@@ -120,6 +120,8 @@ extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
120 const char *oid, 120 const char *oid,
121 struct ceph_file_layout *fl, 121 struct ceph_file_layout *fl,
122 struct ceph_osdmap *osdmap); 122 struct ceph_osdmap *osdmap);
123extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
124 int *acting);
123extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, 125extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
124 struct ceph_pg pgid); 126 struct ceph_pg pgid);
125 127
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index a1fc1d017b58..fd56451a871f 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -58,6 +58,7 @@ struct ceph_timespec {
58#define CEPH_PG_LAYOUT_LINEAR 2 58#define CEPH_PG_LAYOUT_LINEAR 2
59#define CEPH_PG_LAYOUT_HYBRID 3 59#define CEPH_PG_LAYOUT_HYBRID 3
60 60
61#define CEPH_PG_MAX_SIZE 16 /* max # osds in a single pg */
61 62
62/* 63/*
63 * placement group. 64 * placement group.
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 2b881262ef67..d5114db70453 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -869,16 +869,20 @@ skip_inode:
869 continue; 869 continue;
870 ci = ceph_inode(inode); 870 ci = ceph_inode(inode);
871 spin_lock(&inode->i_lock); 871 spin_lock(&inode->i_lock);
872 if (!ci->i_snap_realm) 872 if (list_empty(&ci->i_snap_realm_item)) {
873 goto split_skip_inode; 873 struct ceph_snap_realm *oldrealm =
874 ceph_put_snap_realm(mdsc, ci->i_snap_realm); 874 ci->i_snap_realm;
875 spin_lock(&realm->inodes_with_caps_lock); 875
876 list_add(&ci->i_snap_realm_item, 876 dout(" moving %p to split realm %llx %p\n",
877 &realm->inodes_with_caps); 877 inode, realm->ino, realm);
878 ci->i_snap_realm = realm; 878 spin_lock(&realm->inodes_with_caps_lock);
879 spin_unlock(&realm->inodes_with_caps_lock); 879 list_add(&ci->i_snap_realm_item,
880 ceph_get_snap_realm(mdsc, realm); 880 &realm->inodes_with_caps);
881split_skip_inode: 881 ci->i_snap_realm = realm;
882 spin_unlock(&realm->inodes_with_caps_lock);
883 ceph_get_snap_realm(mdsc, realm);
884 ceph_put_snap_realm(mdsc, oldrealm);
885 }
882 spin_unlock(&inode->i_lock); 886 spin_unlock(&inode->i_lock);
883 iput(inode); 887 iput(inode);
884 } 888 }
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 75d02eaa1279..110857ba9269 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -47,10 +47,20 @@ const char *ceph_file_part(const char *s, int len)
47 */ 47 */
48static void ceph_put_super(struct super_block *s) 48static void ceph_put_super(struct super_block *s)
49{ 49{
50 struct ceph_client *cl = ceph_client(s); 50 struct ceph_client *client = ceph_sb_to_client(s);
51 51
52 dout("put_super\n"); 52 dout("put_super\n");
53 ceph_mdsc_close_sessions(&cl->mdsc); 53 ceph_mdsc_close_sessions(&client->mdsc);
54
55 /*
56 * ensure we release the bdi before put_anon_super releases
57 * the device name.
58 */
59 if (s->s_bdi == &client->backing_dev_info) {
60 bdi_unregister(&client->backing_dev_info);
61 s->s_bdi = NULL;
62 }
63
54 return; 64 return;
55} 65}
56 66
@@ -636,6 +646,8 @@ static void ceph_destroy_client(struct ceph_client *client)
636 destroy_workqueue(client->pg_inv_wq); 646 destroy_workqueue(client->pg_inv_wq);
637 destroy_workqueue(client->trunc_wq); 647 destroy_workqueue(client->trunc_wq);
638 648
649 bdi_destroy(&client->backing_dev_info);
650
639 if (client->msgr) 651 if (client->msgr)
640 ceph_messenger_destroy(client->msgr); 652 ceph_messenger_destroy(client->msgr);
641 mempool_destroy(client->wb_pagevec_pool); 653 mempool_destroy(client->wb_pagevec_pool);
@@ -876,14 +888,14 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
876{ 888{
877 int err; 889 int err;
878 890
879 sb->s_bdi = &client->backing_dev_info;
880
881 /* set ra_pages based on rsize mount option? */ 891 /* set ra_pages based on rsize mount option? */
882 if (client->mount_args->rsize >= PAGE_CACHE_SIZE) 892 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
883 client->backing_dev_info.ra_pages = 893 client->backing_dev_info.ra_pages =
884 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1) 894 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
885 >> PAGE_SHIFT; 895 >> PAGE_SHIFT;
886 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev); 896 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
897 if (!err)
898 sb->s_bdi = &client->backing_dev_info;
887 return err; 899 return err;
888} 900}
889 901
@@ -957,9 +969,6 @@ static void ceph_kill_sb(struct super_block *s)
957 dout("kill_sb %p\n", s); 969 dout("kill_sb %p\n", s);
958 ceph_mdsc_pre_umount(&client->mdsc); 970 ceph_mdsc_pre_umount(&client->mdsc);
959 kill_anon_super(s); /* will call put_super after sb is r/o */ 971 kill_anon_super(s); /* will call put_super after sb is r/o */
960 if (s->s_bdi == &client->backing_dev_info)
961 bdi_unregister(&client->backing_dev_info);
962 bdi_destroy(&client->backing_dev_info);
963 ceph_destroy_client(client); 972 ceph_destroy_client(client);
964} 973}
965 974
@@ -996,9 +1005,10 @@ static int __init init_ceph(void)
996 if (ret) 1005 if (ret)
997 goto out_icache; 1006 goto out_icache;
998 1007
999 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", 1008 pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n",
1000 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, 1009 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL,
1001 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); 1010 CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
1011 CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
1002 return 0; 1012 return 0;
1003 1013
1004out_icache: 1014out_icache:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e30dfbb056c3..13513b80d87f 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,6 +10,7 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
13#include <linux/wait.h> 14#include <linux/wait.h>
14#include <linux/writeback.h> 15#include <linux/writeback.h>
15#include <linux/slab.h> 16#include <linux/slab.h>