about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- drivers/block/rbd.c 19
-rw-r--r-- fs/ceph/Makefile 23
-rw-r--r-- fs/ceph/debugfs.c 9
-rw-r--r-- fs/ceph/dir.c 20
-rw-r--r-- fs/ceph/export.c 2
-rw-r--r-- fs/ceph/inode.c 4
-rw-r--r-- fs/ceph/mds_client.c 56
-rw-r--r-- fs/ceph/mds_client.h 2
-rw-r--r-- fs/ceph/super.c 13
-rw-r--r-- fs/ceph/super.h 2
-rw-r--r-- include/linux/ceph/ceph_fs.h 16
-rw-r--r-- include/linux/ceph/messenger.h 5
-rw-r--r-- net/ceph/ceph_hash.c 3
-rw-r--r-- net/ceph/messenger.c 46
-rw-r--r-- net/ceph/osdmap.c 4
15 files changed, 116 insertions, 108 deletions
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 008d4a00b50d..e1e38b11f48a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1790,18 +1790,29 @@ static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count)
1790 1790
1791 rc = rbd_bus_add_dev(rbd_dev); 1791 rc = rbd_bus_add_dev(rbd_dev);
1792 if (rc) 1792 if (rc)
1793 goto err_out_disk; 1793 goto err_out_blkdev;
1794
1794 /* set up and announce blkdev mapping */ 1795 /* set up and announce blkdev mapping */
1795 rc = rbd_init_disk(rbd_dev); 1796 rc = rbd_init_disk(rbd_dev);
1796 if (rc) 1797 if (rc)
1797 goto err_out_blkdev; 1798 goto err_out_bus;
1798 1799
1799 return count; 1800 return count;
1800 1801
1802err_out_bus:
1803 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
1804 list_del_init(&rbd_dev->node);
1805 mutex_unlock(&ctl_mutex);
1806
1807 /* this will also clean up rest of rbd_dev stuff */
1808
1809 rbd_bus_del_dev(rbd_dev);
1810 kfree(options);
1811 kfree(mon_dev_name);
1812 return rc;
1813
1801err_out_blkdev: 1814err_out_blkdev:
1802 unregister_blkdev(rbd_dev->major, rbd_dev->name); 1815 unregister_blkdev(rbd_dev->major, rbd_dev->name);
1803err_out_disk:
1804 rbd_free_disk(rbd_dev);
1805err_out_client: 1816err_out_client:
1806 rbd_put_client(rbd_dev); 1817 rbd_put_client(rbd_dev);
1807 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 1818 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
index 9e6c4f2e8ff1..bd352125e829 100644
--- a/fs/ceph/Makefile
+++ b/fs/ceph/Makefile
@@ -2,31 +2,10 @@
2# Makefile for CEPH filesystem. 2# Makefile for CEPH filesystem.
3# 3#
4 4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o 5obj-$(CONFIG_CEPH_FS) += ceph.o
8 6
9ceph-objs := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \ 7ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \ 8 export.o caps.o snap.o xattr.o \
11 mds_client.o mdsmap.o strings.o ceph_frag.o \ 9 mds_client.o mdsmap.o strings.o ceph_frag.o \
12 debugfs.o 10 debugfs.o
13 11
14else
15#Otherwise we were called directly from the command
16# line; invoke the kernel build system.
17
18KERNELDIR ?= /lib/modules/$(shell uname -r)/build
19PWD := $(shell pwd)
20
21default: all
22
23all:
24 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
25
26modules_install:
27 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
28
29clean:
30 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
31
32endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index 7ae1b3d55b58..08f65faac112 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -60,10 +60,13 @@ static int mdsc_show(struct seq_file *s, void *p)
60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) { 60 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
61 req = rb_entry(rp, struct ceph_mds_request, r_node); 61 req = rb_entry(rp, struct ceph_mds_request, r_node);
62 62
63 if (req->r_request) 63 if (req->r_request && req->r_session)
64 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds); 64 seq_printf(s, "%lld\tmds%d\t", req->r_tid,
65 else 65 req->r_session->s_mds);
66 else if (!req->r_request)
66 seq_printf(s, "%lld\t(no request)\t", req->r_tid); 67 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
68 else
69 seq_printf(s, "%lld\t(no session)\t", req->r_tid);
67 70
68 seq_printf(s, "%s", ceph_mds_op_name(req->r_op)); 71 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
69 72
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index fa7ca04ee816..0bc68de8edd7 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -1224,6 +1224,26 @@ void ceph_dentry_lru_del(struct dentry *dn)
1224 } 1224 }
1225} 1225}
1226 1226
1227/*
1228 * Return name hash for a given dentry. This is dependent on
1229 * the parent directory's hash function.
1230 */
1231unsigned ceph_dentry_hash(struct dentry *dn)
1232{
1233 struct inode *dir = dn->d_parent->d_inode;
1234 struct ceph_inode_info *dci = ceph_inode(dir);
1235
1236 switch (dci->i_dir_layout.dl_dir_hash) {
1237 case 0: /* for backward compat */
1238 case CEPH_STR_HASH_LINUX:
1239 return dn->d_name.hash;
1240
1241 default:
1242 return ceph_str_hash(dci->i_dir_layout.dl_dir_hash,
1243 dn->d_name.name, dn->d_name.len);
1244 }
1245}
1246
1227const struct file_operations ceph_dir_fops = { 1247const struct file_operations ceph_dir_fops = {
1228 .read = ceph_read_dir, 1248 .read = ceph_read_dir,
1229 .readdir = ceph_readdir, 1249 .readdir = ceph_readdir,
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 2297d9426992..e41056174bf8 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -59,7 +59,7 @@ static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
59 dout("encode_fh %p connectable\n", dentry); 59 dout("encode_fh %p connectable\n", dentry);
60 cfh->ino = ceph_ino(dentry->d_inode); 60 cfh->ino = ceph_ino(dentry->d_inode);
61 cfh->parent_ino = ceph_ino(parent->d_inode); 61 cfh->parent_ino = ceph_ino(parent->d_inode);
62 cfh->parent_name_hash = parent->d_name.hash; 62 cfh->parent_name_hash = ceph_dentry_hash(parent);
63 *max_len = connected_handle_length; 63 *max_len = connected_handle_length;
64 type = 2; 64 type = 2;
65 } else if (*max_len >= handle_length) { 65 } else if (*max_len >= handle_length) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index e61de4f7b99d..e835eff551e3 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -297,6 +297,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
297 ci->i_release_count = 0; 297 ci->i_release_count = 0;
298 ci->i_symlink = NULL; 298 ci->i_symlink = NULL;
299 299
300 memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
301
300 ci->i_fragtree = RB_ROOT; 302 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex); 303 mutex_init(&ci->i_fragtree_mutex);
302 304
@@ -689,6 +691,8 @@ static int fill_inode(struct inode *inode,
689 inode->i_op = &ceph_dir_iops; 691 inode->i_op = &ceph_dir_iops;
690 inode->i_fop = &ceph_dir_fops; 692 inode->i_fop = &ceph_dir_fops;
691 693
694 ci->i_dir_layout = iinfo->dir_layout;
695
692 ci->i_files = le64_to_cpu(info->files); 696 ci->i_files = le64_to_cpu(info->files);
693 ci->i_subdirs = le64_to_cpu(info->subdirs); 697 ci->i_subdirs = le64_to_cpu(info->subdirs);
694 ci->i_rbytes = le64_to_cpu(info->rbytes); 698 ci->i_rbytes = le64_to_cpu(info->rbytes);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index a50fca1e03be..1e30d194a8e3 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -60,7 +60,8 @@ static const struct ceph_connection_operations mds_con_ops;
60 * parse individual inode info 60 * parse individual inode info
61 */ 61 */
62static int parse_reply_info_in(void **p, void *end, 62static int parse_reply_info_in(void **p, void *end,
63 struct ceph_mds_reply_info_in *info) 63 struct ceph_mds_reply_info_in *info,
64 int features)
64{ 65{
65 int err = -EIO; 66 int err = -EIO;
66 67
@@ -74,6 +75,12 @@ static int parse_reply_info_in(void **p, void *end,
74 info->symlink = *p; 75 info->symlink = *p;
75 *p += info->symlink_len; 76 *p += info->symlink_len;
76 77
78 if (features & CEPH_FEATURE_DIRLAYOUTHASH)
79 ceph_decode_copy_safe(p, end, &info->dir_layout,
80 sizeof(info->dir_layout), bad);
81 else
82 memset(&info->dir_layout, 0, sizeof(info->dir_layout));
83
77 ceph_decode_32_safe(p, end, info->xattr_len, bad); 84 ceph_decode_32_safe(p, end, info->xattr_len, bad);
78 ceph_decode_need(p, end, info->xattr_len, bad); 85 ceph_decode_need(p, end, info->xattr_len, bad);
79 info->xattr_data = *p; 86 info->xattr_data = *p;
@@ -88,12 +95,13 @@ bad:
88 * target inode. 95 * target inode.
89 */ 96 */
90static int parse_reply_info_trace(void **p, void *end, 97static int parse_reply_info_trace(void **p, void *end,
91 struct ceph_mds_reply_info_parsed *info) 98 struct ceph_mds_reply_info_parsed *info,
99 int features)
92{ 100{
93 int err; 101 int err;
94 102
95 if (info->head->is_dentry) { 103 if (info->head->is_dentry) {
96 err = parse_reply_info_in(p, end, &info->diri); 104 err = parse_reply_info_in(p, end, &info->diri, features);
97 if (err < 0) 105 if (err < 0)
98 goto out_bad; 106 goto out_bad;
99 107
@@ -114,7 +122,7 @@ static int parse_reply_info_trace(void **p, void *end,
114 } 122 }
115 123
116 if (info->head->is_target) { 124 if (info->head->is_target) {
117 err = parse_reply_info_in(p, end, &info->targeti); 125 err = parse_reply_info_in(p, end, &info->targeti, features);
118 if (err < 0) 126 if (err < 0)
119 goto out_bad; 127 goto out_bad;
120 } 128 }
@@ -134,7 +142,8 @@ out_bad:
134 * parse readdir results 142 * parse readdir results
135 */ 143 */
136static int parse_reply_info_dir(void **p, void *end, 144static int parse_reply_info_dir(void **p, void *end,
137 struct ceph_mds_reply_info_parsed *info) 145 struct ceph_mds_reply_info_parsed *info,
146 int features)
138{ 147{
139 u32 num, i = 0; 148 u32 num, i = 0;
140 int err; 149 int err;
@@ -182,7 +191,7 @@ static int parse_reply_info_dir(void **p, void *end,
182 *p += sizeof(struct ceph_mds_reply_lease); 191 *p += sizeof(struct ceph_mds_reply_lease);
183 192
184 /* inode */ 193 /* inode */
185 err = parse_reply_info_in(p, end, &info->dir_in[i]); 194 err = parse_reply_info_in(p, end, &info->dir_in[i], features);
186 if (err < 0) 195 if (err < 0)
187 goto out_bad; 196 goto out_bad;
188 i++; 197 i++;
@@ -205,7 +214,8 @@ out_bad:
205 * parse fcntl F_GETLK results 214 * parse fcntl F_GETLK results
206 */ 215 */
207static int parse_reply_info_filelock(void **p, void *end, 216static int parse_reply_info_filelock(void **p, void *end,
208 struct ceph_mds_reply_info_parsed *info) 217 struct ceph_mds_reply_info_parsed *info,
218 int features)
209{ 219{
210 if (*p + sizeof(*info->filelock_reply) > end) 220 if (*p + sizeof(*info->filelock_reply) > end)
211 goto bad; 221 goto bad;
@@ -225,19 +235,21 @@ bad:
225 * parse extra results 235 * parse extra results
226 */ 236 */
227static int parse_reply_info_extra(void **p, void *end, 237static int parse_reply_info_extra(void **p, void *end,
228 struct ceph_mds_reply_info_parsed *info) 238 struct ceph_mds_reply_info_parsed *info,
239 int features)
229{ 240{
230 if (info->head->op == CEPH_MDS_OP_GETFILELOCK) 241 if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
231 return parse_reply_info_filelock(p, end, info); 242 return parse_reply_info_filelock(p, end, info, features);
232 else 243 else
233 return parse_reply_info_dir(p, end, info); 244 return parse_reply_info_dir(p, end, info, features);
234} 245}
235 246
236/* 247/*
237 * parse entire mds reply 248 * parse entire mds reply
238 */ 249 */
239static int parse_reply_info(struct ceph_msg *msg, 250static int parse_reply_info(struct ceph_msg *msg,
240 struct ceph_mds_reply_info_parsed *info) 251 struct ceph_mds_reply_info_parsed *info,
252 int features)
241{ 253{
242 void *p, *end; 254 void *p, *end;
243 u32 len; 255 u32 len;
@@ -250,7 +262,7 @@ static int parse_reply_info(struct ceph_msg *msg,
250 /* trace */ 262 /* trace */
251 ceph_decode_32_safe(&p, end, len, bad); 263 ceph_decode_32_safe(&p, end, len, bad);
252 if (len > 0) { 264 if (len > 0) {
253 err = parse_reply_info_trace(&p, p+len, info); 265 err = parse_reply_info_trace(&p, p+len, info, features);
254 if (err < 0) 266 if (err < 0)
255 goto out_bad; 267 goto out_bad;
256 } 268 }
@@ -258,7 +270,7 @@ static int parse_reply_info(struct ceph_msg *msg,
258 /* extra */ 270 /* extra */
259 ceph_decode_32_safe(&p, end, len, bad); 271 ceph_decode_32_safe(&p, end, len, bad);
260 if (len > 0) { 272 if (len > 0) {
261 err = parse_reply_info_extra(&p, p+len, info); 273 err = parse_reply_info_extra(&p, p+len, info, features);
262 if (err < 0) 274 if (err < 0)
263 goto out_bad; 275 goto out_bad;
264 } 276 }
@@ -654,7 +666,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
654 } else { 666 } else {
655 /* dir + name */ 667 /* dir + name */
656 inode = dir; 668 inode = dir;
657 hash = req->r_dentry->d_name.hash; 669 hash = ceph_dentry_hash(req->r_dentry);
658 is_hash = true; 670 is_hash = true;
659 } 671 }
660 } 672 }
@@ -1693,7 +1705,6 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
1693 struct ceph_msg *msg; 1705 struct ceph_msg *msg;
1694 int flags = 0; 1706 int flags = 0;
1695 1707
1696 req->r_mds = mds;
1697 req->r_attempts++; 1708 req->r_attempts++;
1698 if (req->r_inode) { 1709 if (req->r_inode) {
1699 struct ceph_cap *cap = 1710 struct ceph_cap *cap =
@@ -1780,6 +1791,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1780 goto finish; 1791 goto finish;
1781 } 1792 }
1782 1793
1794 put_request_session(req);
1795
1783 mds = __choose_mds(mdsc, req); 1796 mds = __choose_mds(mdsc, req);
1784 if (mds < 0 || 1797 if (mds < 0 ||
1785 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { 1798 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
@@ -1797,6 +1810,8 @@ static int __do_request(struct ceph_mds_client *mdsc,
1797 goto finish; 1810 goto finish;
1798 } 1811 }
1799 } 1812 }
1813 req->r_session = get_session(session);
1814
1800 dout("do_request mds%d session %p state %s\n", mds, session, 1815 dout("do_request mds%d session %p state %s\n", mds, session,
1801 session_state_name(session->s_state)); 1816 session_state_name(session->s_state));
1802 if (session->s_state != CEPH_MDS_SESSION_OPEN && 1817 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
@@ -1809,7 +1824,6 @@ static int __do_request(struct ceph_mds_client *mdsc,
1809 } 1824 }
1810 1825
1811 /* send request */ 1826 /* send request */
1812 req->r_session = get_session(session);
1813 req->r_resend_mds = -1; /* forget any previous mds hint */ 1827 req->r_resend_mds = -1; /* forget any previous mds hint */
1814 1828
1815 if (req->r_request_started == 0) /* note request start time */ 1829 if (req->r_request_started == 0) /* note request start time */
@@ -1863,7 +1877,6 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
1863 if (req->r_session && 1877 if (req->r_session &&
1864 req->r_session->s_mds == mds) { 1878 req->r_session->s_mds == mds) {
1865 dout(" kicking tid %llu\n", req->r_tid); 1879 dout(" kicking tid %llu\n", req->r_tid);
1866 put_request_session(req);
1867 __do_request(mdsc, req); 1880 __do_request(mdsc, req);
1868 } 1881 }
1869 } 1882 }
@@ -2056,8 +2069,11 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2056 goto out; 2069 goto out;
2057 } else { 2070 } else {
2058 struct ceph_inode_info *ci = ceph_inode(req->r_inode); 2071 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
2059 struct ceph_cap *cap = 2072 struct ceph_cap *cap = NULL;
2060 ceph_get_cap_for_mds(ci, req->r_mds);; 2073
2074 if (req->r_session)
2075 cap = ceph_get_cap_for_mds(ci,
2076 req->r_session->s_mds);
2061 2077
2062 dout("already using auth"); 2078 dout("already using auth");
2063 if ((!cap || cap != ci->i_auth_cap) || 2079 if ((!cap || cap != ci->i_auth_cap) ||
@@ -2101,7 +2117,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2101 2117
2102 dout("handle_reply tid %lld result %d\n", tid, result); 2118 dout("handle_reply tid %lld result %d\n", tid, result);
2103 rinfo = &req->r_reply_info; 2119 rinfo = &req->r_reply_info;
2104 err = parse_reply_info(msg, rinfo); 2120 err = parse_reply_info(msg, rinfo, session->s_con.peer_features);
2105 mutex_unlock(&mdsc->mutex); 2121 mutex_unlock(&mdsc->mutex);
2106 2122
2107 mutex_lock(&session->s_mutex); 2123 mutex_lock(&session->s_mutex);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index aabe563b54db..4e3a9cc0bba6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -35,6 +35,7 @@ struct ceph_cap;
35 */ 35 */
36struct ceph_mds_reply_info_in { 36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in; 37 struct ceph_mds_reply_inode *in;
38 struct ceph_dir_layout dir_layout;
38 u32 symlink_len; 39 u32 symlink_len;
39 char *symlink; 40 char *symlink;
40 u32 xattr_len; 41 u32 xattr_len;
@@ -165,7 +166,6 @@ struct ceph_mds_request {
165 struct ceph_mds_client *r_mdsc; 166 struct ceph_mds_client *r_mdsc;
166 167
167 int r_op; /* mds op code */ 168 int r_op; /* mds op code */
168 int r_mds;
169 169
170 /* operation on what? */ 170 /* operation on what? */
171 struct inode *r_inode; /* arg1 */ 171 struct inode *r_inode; /* arg1 */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 08b460ae0539..bf6f0f34082a 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -428,7 +428,8 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
428 goto fail; 428 goto fail;
429 } 429 }
430 fsc->client->extra_mon_dispatch = extra_mon_dispatch; 430 fsc->client->extra_mon_dispatch = extra_mon_dispatch;
431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK; 431 fsc->client->supported_features |= CEPH_FEATURE_FLOCK |
432 CEPH_FEATURE_DIRLAYOUTHASH;
432 fsc->client->monc.want_mdsmap = 1; 433 fsc->client->monc.want_mdsmap = 1;
433 434
434 fsc->mount_options = fsopt; 435 fsc->mount_options = fsopt;
@@ -443,13 +444,17 @@ struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
443 goto fail_client; 444 goto fail_client;
444 445
445 err = -ENOMEM; 446 err = -ENOMEM;
446 fsc->wb_wq = create_workqueue("ceph-writeback"); 447 /*
448 * The number of concurrent works can be high but they don't need
449 * to be processed in parallel, limit concurrency.
450 */
451 fsc->wb_wq = alloc_workqueue("ceph-writeback", 0, 1);
447 if (fsc->wb_wq == NULL) 452 if (fsc->wb_wq == NULL)
448 goto fail_bdi; 453 goto fail_bdi;
449 fsc->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid"); 454 fsc->pg_inv_wq = alloc_workqueue("ceph-pg-invalid", 0, 1);
450 if (fsc->pg_inv_wq == NULL) 455 if (fsc->pg_inv_wq == NULL)
451 goto fail_wb_wq; 456 goto fail_wb_wq;
452 fsc->trunc_wq = create_singlethread_workqueue("ceph-trunc"); 457 fsc->trunc_wq = alloc_workqueue("ceph-trunc", 0, 1);
453 if (fsc->trunc_wq == NULL) 458 if (fsc->trunc_wq == NULL)
454 goto fail_pg_inv_wq; 459 goto fail_pg_inv_wq;
455 460
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 4553d8829edb..20b907d76ae2 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -239,6 +239,7 @@ struct ceph_inode_info {
239 unsigned i_ceph_flags; 239 unsigned i_ceph_flags;
240 unsigned long i_release_count; 240 unsigned long i_release_count;
241 241
242 struct ceph_dir_layout i_dir_layout;
242 struct ceph_file_layout i_layout; 243 struct ceph_file_layout i_layout;
243 char *i_symlink; 244 char *i_symlink;
244 245
@@ -768,6 +769,7 @@ extern void ceph_dentry_lru_add(struct dentry *dn);
768extern void ceph_dentry_lru_touch(struct dentry *dn); 769extern void ceph_dentry_lru_touch(struct dentry *dn);
769extern void ceph_dentry_lru_del(struct dentry *dn); 770extern void ceph_dentry_lru_del(struct dentry *dn);
770extern void ceph_invalidate_dentry_lease(struct dentry *dentry); 771extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
772extern unsigned ceph_dentry_hash(struct dentry *dn);
771 773
772/* 774/*
773 * our d_ops vary depending on whether the inode is live, 775 * our d_ops vary depending on whether the inode is live,
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index c3c74aef289d..09dcc0c2ffd5 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -43,6 +43,10 @@
43#define CEPH_FEATURE_NOSRCADDR (1<<1) 43#define CEPH_FEATURE_NOSRCADDR (1<<1)
44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2) 44#define CEPH_FEATURE_MONCLOCKCHECK (1<<2)
45#define CEPH_FEATURE_FLOCK (1<<3) 45#define CEPH_FEATURE_FLOCK (1<<3)
46#define CEPH_FEATURE_SUBSCRIBE2 (1<<4)
47#define CEPH_FEATURE_MONNAMES (1<<5)
48#define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
49#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
46 50
47 51
48/* 52/*
@@ -55,10 +59,10 @@ struct ceph_file_layout {
55 __le32 fl_stripe_count; /* over this many objects */ 59 __le32 fl_stripe_count; /* over this many objects */
56 __le32 fl_object_size; /* until objects are this big, then move to 60 __le32 fl_object_size; /* until objects are this big, then move to
57 new objects */ 61 new objects */
58 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */ 62 __le32 fl_cas_hash; /* UNUSED. 0 = none; 1 = sha256 */
59 63
60 /* pg -> disk layout */ 64 /* pg -> disk layout */
61 __le32 fl_object_stripe_unit; /* for per-object parity, if any */ 65 __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
62 66
63 /* object -> pg layout */ 67 /* object -> pg layout */
64 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */ 68 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
@@ -69,6 +73,12 @@ struct ceph_file_layout {
69 73
70int ceph_file_layout_is_valid(const struct ceph_file_layout *layout); 74int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
71 75
76struct ceph_dir_layout {
77 __u8 dl_dir_hash; /* see ceph_hash.h for ids */
78 __u8 dl_unused1;
79 __u16 dl_unused2;
80 __u32 dl_unused3;
81} __attribute__ ((packed));
72 82
73/* crypto algorithms */ 83/* crypto algorithms */
74#define CEPH_CRYPTO_NONE 0x0 84#define CEPH_CRYPTO_NONE 0x0
@@ -457,7 +467,7 @@ struct ceph_mds_reply_inode {
457 struct ceph_timespec rctime; 467 struct ceph_timespec rctime;
458 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */ 468 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
459} __attribute__ ((packed)); 469} __attribute__ ((packed));
460/* followed by frag array, then symlink string, then xattr blob */ 470/* followed by frag array, symlink string, dir layout, xattr blob */
461 471
462/* reply_lease follows dname, and reply_inode */ 472/* reply_lease follows dname, and reply_inode */
463struct ceph_mds_reply_lease { 473struct ceph_mds_reply_lease {
diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h
index a108b425fee2..c3011beac30d 100644
--- a/include/linux/ceph/messenger.h
+++ b/include/linux/ceph/messenger.h
@@ -110,17 +110,12 @@ struct ceph_msg_pos {
110 110
111/* 111/*
112 * ceph_connection state bit flags 112 * ceph_connection state bit flags
113 *
114 * QUEUED and BUSY are used together to ensure that only a single
115 * thread is currently opening, reading or writing data to the socket.
116 */ 113 */
117#define LOSSYTX 0 /* we can close channel or drop messages on errors */ 114#define LOSSYTX 0 /* we can close channel or drop messages on errors */
118#define CONNECTING 1 115#define CONNECTING 1
119#define NEGOTIATING 2 116#define NEGOTIATING 2
120#define KEEPALIVE_PENDING 3 117#define KEEPALIVE_PENDING 3
121#define WRITE_PENDING 4 /* we have data ready to send */ 118#define WRITE_PENDING 4 /* we have data ready to send */
122#define QUEUED 5 /* there is work queued on this connection */
123#define BUSY 6 /* work is being done */
124#define STANDBY 8 /* no outgoing messages, socket closed. we keep 119#define STANDBY 8 /* no outgoing messages, socket closed. we keep
125 * the ceph_connection around to maintain shared 120 * the ceph_connection around to maintain shared
126 * state with the peer. */ 121 * state with the peer. */
diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c
index 815ef8826796..0a1b53bce76d 100644
--- a/net/ceph/ceph_hash.c
+++ b/net/ceph/ceph_hash.c
@@ -1,5 +1,6 @@
1 1
2#include <linux/ceph/types.h> 2#include <linux/ceph/types.h>
3#include <linux/module.h>
3 4
4/* 5/*
5 * Robert Jenkin's hash function. 6 * Robert Jenkin's hash function.
@@ -104,6 +105,7 @@ unsigned ceph_str_hash(int type, const char *s, unsigned len)
104 return -1; 105 return -1;
105 } 106 }
106} 107}
108EXPORT_SYMBOL(ceph_str_hash);
107 109
108const char *ceph_str_hash_name(int type) 110const char *ceph_str_hash_name(int type)
109{ 111{
@@ -116,3 +118,4 @@ const char *ceph_str_hash_name(int type)
116 return "unknown"; 118 return "unknown";
117 } 119 }
118} 120}
121EXPORT_SYMBOL(ceph_str_hash_name);
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b6ff4a1519ab..dff633d62e5b 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -96,7 +96,7 @@ struct workqueue_struct *ceph_msgr_wq;
96 96
97int ceph_msgr_init(void) 97int ceph_msgr_init(void)
98{ 98{
99 ceph_msgr_wq = create_workqueue("ceph-msgr"); 99 ceph_msgr_wq = alloc_workqueue("ceph-msgr", WQ_NON_REENTRANT, 0);
100 if (!ceph_msgr_wq) { 100 if (!ceph_msgr_wq) {
101 pr_err("msgr_init failed to create workqueue\n"); 101 pr_err("msgr_init failed to create workqueue\n");
102 return -ENOMEM; 102 return -ENOMEM;
@@ -1920,20 +1920,6 @@ bad_tag:
1920/* 1920/*
1921 * Atomically queue work on a connection. Bump @con reference to 1921 * Atomically queue work on a connection. Bump @con reference to
1922 * avoid races with connection teardown. 1922 * avoid races with connection teardown.
1923 *
1924 * There is some trickery going on with QUEUED and BUSY because we
1925 * only want a _single_ thread operating on each connection at any
1926 * point in time, but we want to use all available CPUs.
1927 *
1928 * The worker thread only proceeds if it can atomically set BUSY. It
1929 * clears QUEUED and does it's thing. When it thinks it's done, it
1930 * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
1931 * (tries again to set BUSY).
1932 *
1933 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1934 * try to queue work. If that fails (work is already queued, or BUSY)
1935 * we give up (work also already being done or is queued) but leave QUEUED
1936 * set so that the worker thread will loop if necessary.
1937 */ 1923 */
1938static void queue_con(struct ceph_connection *con) 1924static void queue_con(struct ceph_connection *con)
1939{ 1925{
@@ -1948,11 +1934,7 @@ static void queue_con(struct ceph_connection *con)
1948 return; 1934 return;
1949 } 1935 }
1950 1936
1951 set_bit(QUEUED, &con->state); 1937 if (!queue_delayed_work(ceph_msgr_wq, &con->work, 0)) {
1952 if (test_bit(BUSY, &con->state)) {
1953 dout("queue_con %p - already BUSY\n", con);
1954 con->ops->put(con);
1955 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1956 dout("queue_con %p - already queued\n", con); 1938 dout("queue_con %p - already queued\n", con);
1957 con->ops->put(con); 1939 con->ops->put(con);
1958 } else { 1940 } else {
@@ -1967,15 +1949,6 @@ static void con_work(struct work_struct *work)
1967{ 1949{
1968 struct ceph_connection *con = container_of(work, struct ceph_connection, 1950 struct ceph_connection *con = container_of(work, struct ceph_connection,
1969 work.work); 1951 work.work);
1970 int backoff = 0;
1971
1972more:
1973 if (test_and_set_bit(BUSY, &con->state) != 0) {
1974 dout("con_work %p BUSY already set\n", con);
1975 goto out;
1976 }
1977 dout("con_work %p start, clearing QUEUED\n", con);
1978 clear_bit(QUEUED, &con->state);
1979 1952
1980 mutex_lock(&con->mutex); 1953 mutex_lock(&con->mutex);
1981 1954
@@ -1994,28 +1967,13 @@ more:
1994 try_read(con) < 0 || 1967 try_read(con) < 0 ||
1995 try_write(con) < 0) { 1968 try_write(con) < 0) {
1996 mutex_unlock(&con->mutex); 1969 mutex_unlock(&con->mutex);
1997 backoff = 1;
1998 ceph_fault(con); /* error/fault path */ 1970 ceph_fault(con); /* error/fault path */
1999 goto done_unlocked; 1971 goto done_unlocked;
2000 } 1972 }
2001 1973
2002done: 1974done:
2003 mutex_unlock(&con->mutex); 1975 mutex_unlock(&con->mutex);
2004
2005done_unlocked: 1976done_unlocked:
2006 clear_bit(BUSY, &con->state);
2007 dout("con->state=%lu\n", con->state);
2008 if (test_bit(QUEUED, &con->state)) {
2009 if (!backoff || test_bit(OPENING, &con->state)) {
2010 dout("con_work %p QUEUED reset, looping\n", con);
2011 goto more;
2012 }
2013 dout("con_work %p QUEUED reset, but just faulted\n", con);
2014 clear_bit(QUEUED, &con->state);
2015 }
2016 dout("con_work %p done\n", con);
2017
2018out:
2019 con->ops->put(con); 1977 con->ops->put(con);
2020} 1978}
2021 1979
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index d73f3f6efa36..71603ac3dff5 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -605,8 +605,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end)
605 goto bad; 605 goto bad;
606 } 606 }
607 err = __decode_pool(p, end, pi); 607 err = __decode_pool(p, end, pi);
608 if (err < 0) 608 if (err < 0) {
609 kfree(pi);
609 goto bad; 610 goto bad;
611 }
610 __insert_pg_pool(&map->pg_pools, pi); 612 __insert_pg_pool(&map->pg_pools, pi);
611 } 613 }
612 614