Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 1
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/afs/security.c | 5
-rw-r--r--  fs/btrfs/btrfs_inode.h | 5
-rw-r--r--  fs/btrfs/compression.c | 2
-rw-r--r--  fs/btrfs/ctree.h | 13
-rw-r--r--  fs/btrfs/disk-io.c | 15
-rw-r--r--  fs/btrfs/export.c | 4
-rw-r--r--  fs/btrfs/extent-tree.c | 11
-rw-r--r--  fs/btrfs/extent_io.c | 79
-rw-r--r--  fs/btrfs/extent_io.h | 10
-rw-r--r--  fs/btrfs/file.c | 23
-rw-r--r--  fs/btrfs/inode.c | 139
-rw-r--r--  fs/btrfs/ioctl.c | 706
-rw-r--r--  fs/btrfs/ioctl.h | 111
-rw-r--r--  fs/btrfs/ordered-data.c | 41
-rw-r--r--  fs/btrfs/ordered-data.h | 7
-rw-r--r--  fs/btrfs/relocation.c | 4
-rw-r--r--  fs/btrfs/super.c | 238
-rw-r--r--  fs/btrfs/transaction.c | 5
-rw-r--r--  fs/btrfs/tree-log.c | 2
-rw-r--r--  fs/btrfs/volumes.c | 39
-rw-r--r--  fs/ceph/Kconfig | 27
-rw-r--r--  fs/ceph/Makefile | 39
-rw-r--r--  fs/ceph/README | 20
-rw-r--r--  fs/ceph/addr.c | 1188
-rw-r--r--  fs/ceph/armor.c | 99
-rw-r--r--  fs/ceph/auth.c | 257
-rw-r--r--  fs/ceph/auth.h | 84
-rw-r--r--  fs/ceph/auth_none.c | 121
-rw-r--r--  fs/ceph/auth_none.h | 28
-rw-r--r--  fs/ceph/auth_x.c | 656
-rw-r--r--  fs/ceph/auth_x.h | 49
-rw-r--r--  fs/ceph/auth_x_protocol.h | 90
-rw-r--r--  fs/ceph/buffer.c | 78
-rw-r--r--  fs/ceph/buffer.h | 39
-rw-r--r--  fs/ceph/caps.c | 2927
-rw-r--r--  fs/ceph/ceph_debug.h | 37
-rw-r--r--  fs/ceph/ceph_frag.c | 21
-rw-r--r--  fs/ceph/ceph_frag.h | 109
-rw-r--r--  fs/ceph/ceph_fs.c | 74
-rw-r--r--  fs/ceph/ceph_fs.h | 650
-rw-r--r--  fs/ceph/ceph_hash.c | 118
-rw-r--r--  fs/ceph/ceph_hash.h | 13
-rw-r--r--  fs/ceph/ceph_strings.c | 176
-rw-r--r--  fs/ceph/crush/crush.c | 151
-rw-r--r--  fs/ceph/crush/crush.h | 180
-rw-r--r--  fs/ceph/crush/hash.c | 149
-rw-r--r--  fs/ceph/crush/hash.h | 17
-rw-r--r--  fs/ceph/crush/mapper.c | 596
-rw-r--r--  fs/ceph/crush/mapper.h | 20
-rw-r--r--  fs/ceph/crypto.c | 408
-rw-r--r--  fs/ceph/crypto.h | 48
-rw-r--r--  fs/ceph/debugfs.c | 483
-rw-r--r--  fs/ceph/decode.h | 194
-rw-r--r--  fs/ceph/dir.c | 1220
-rw-r--r--  fs/ceph/export.c | 223
-rw-r--r--  fs/ceph/file.c | 937
-rw-r--r--  fs/ceph/inode.c | 1750
-rw-r--r--  fs/ceph/ioctl.c | 160
-rw-r--r--  fs/ceph/ioctl.h | 40
-rw-r--r--  fs/ceph/mds_client.c | 3021
-rw-r--r--  fs/ceph/mds_client.h | 335
-rw-r--r--  fs/ceph/mdsmap.c | 174
-rw-r--r--  fs/ceph/mdsmap.h | 54
-rw-r--r--  fs/ceph/messenger.c | 2240
-rw-r--r--  fs/ceph/messenger.h | 254
-rw-r--r--  fs/ceph/mon_client.c | 834
-rw-r--r--  fs/ceph/mon_client.h | 119
-rw-r--r--  fs/ceph/msgpool.c | 186
-rw-r--r--  fs/ceph/msgpool.h | 27
-rw-r--r--  fs/ceph/msgr.h | 158
-rw-r--r--  fs/ceph/osd_client.c | 1537
-rw-r--r--  fs/ceph/osd_client.h | 166
-rw-r--r--  fs/ceph/osdmap.c | 1019
-rw-r--r--  fs/ceph/osdmap.h | 125
-rw-r--r--  fs/ceph/pagelist.c | 54
-rw-r--r--  fs/ceph/pagelist.h | 54
-rw-r--r--  fs/ceph/rados.h | 374
-rw-r--r--  fs/ceph/snap.c | 904
-rw-r--r--  fs/ceph/super.c | 1030
-rw-r--r--  fs/ceph/super.h | 901
-rw-r--r--  fs/ceph/types.h | 29
-rw-r--r--  fs/ceph/xattr.c | 844
-rw-r--r--  fs/cifs/cifsfs.c | 3
-rw-r--r--  fs/cifs/cifsfs.h | 3
-rw-r--r--  fs/cifs/cifsglob.h | 1
-rw-r--r--  fs/cifs/cifsproto.h | 6
-rw-r--r--  fs/cifs/cifssmb.c | 135
-rw-r--r--  fs/cifs/dir.c | 2
-rw-r--r--  fs/cifs/file.c | 7
-rw-r--r--  fs/cifs/inode.c | 297
-rw-r--r--  fs/jffs2/readinode.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 1
-rw-r--r--  fs/nfs/delegation.h | 6
-rw-r--r--  fs/nfs/dir.c | 2
-rw-r--r--  fs/nfs/inode.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 1
-rw-r--r--  fs/nfs/pagelist.c | 23
-rw-r--r--  fs/nfs/super.c | 25
-rw-r--r--  fs/ntfs/super.c | 25
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c | 13
-rw-r--r--  fs/xfs/linux-2.6/xfs_buf.c | 81
103 files changed, 29549 insertions, 462 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 7405f071be67..5f85b5947613 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,6 +235,7 @@ config NFS_COMMON
 
 source "net/sunrpc/Kconfig"
 source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
 source "fs/cifs/Kconfig"
 source "fs/ncpfs/Kconfig"
 source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index c3633aa46911..97f340f14ba2 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -125,3 +125,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)		+= gfs2/
 obj-$(CONFIG_EXOFS_FS)		+= exofs/
+obj-$(CONFIG_CEPH_FS)		+= ceph/
diff --git a/fs/afs/security.c b/fs/afs/security.c
index 3ef504370034..bb4ed144d0e4 100644
--- a/fs/afs/security.c
+++ b/fs/afs/security.c
@@ -189,8 +189,9 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
 	if (!permits)
 		goto out_unlock;
 
-	memcpy(permits->permits, xpermits->permits,
-	       count * sizeof(struct afs_permit));
+	if (xpermits)
+		memcpy(permits->permits, xpermits->permits,
+		       count * sizeof(struct afs_permit));
 
 	_debug("key %x access %x",
 	       key_serial(key), vnode->status.caller_access);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 3f1f50d9d916..7a4dee199832 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -153,6 +153,11 @@ struct btrfs_inode {
 	unsigned ordered_data_close:1;
 	unsigned dummy_inode:1;
 
+	/*
+	 * always compress this one file
+	 */
+	unsigned force_compress:1;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index a11a32058b50..28b92a7218ab 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -478,7 +478,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 			goto next;
 		}
 
-		page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS);
+		page = alloc_page(mapping_gfp_mask(mapping) & ~__GFP_FS);
 		if (!page)
 			break;
 
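The fix above matters because GFP_NOFS is defined by the absence of __GFP_FS, not by a flag of its own: OR-ing GFP_NOFS into mapping_gfp_mask() leaves __GFP_FS set whenever the mapping mask already had it, so reclaim triggered by this allocation could still recurse into the filesystem. Masking with ~__GFP_FS actually clears it. A minimal userspace sketch of the flag arithmetic, assuming the 2.6.33-era flag values (an assumption; only the relative bit layout matters):

	#include <stdio.h>

	#define __GFP_WAIT 0x10u
	#define __GFP_IO   0x40u
	#define __GFP_FS   0x80u
	#define GFP_NOFS   (__GFP_WAIT | __GFP_IO)
	#define GFP_KERNEL (__GFP_WAIT | __GFP_IO | __GFP_FS)

	int main(void)
	{
		unsigned mask = GFP_KERNEL;	/* a typical mapping_gfp_mask() */

		/* old expression: __GFP_FS survives the OR */
		printf("mask | GFP_NOFS   = %#x\n", mask | GFP_NOFS);
		/* new expression: __GFP_FS is cleared, as intended */
		printf("mask & ~__GFP_FS = %#x\n", mask & ~__GFP_FS);
		return 0;
	}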
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 8b5cfdd4bfc1..0af2e3868573 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -373,11 +373,13 @@ struct btrfs_super_block {
  * ones specified below then we will fail to mount
  */
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
+#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(2ULL << 0)
 
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
 #define BTRFS_FEATURE_INCOMPAT_SUPP		\
-	BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF
+	(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF |	\
+	 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)
 
 /*
  * A leaf is full of items. offset and size tell us where to find
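DEFAULT_SUBVOL becomes the second incompat bit ((2ULL << 0) is the same value as (1ULL << 1)) and is OR-ed into BTRFS_FEATURE_INCOMPAT_SUPP so kernels carrying this change will still mount such filesystems. A hedged sketch of the mount-time check these masks feed, modeled on the existing test in open_ctree(); the surrounding variable names are illustrative:

	u64 features;

	/* any incompat bit set on disk that we do not support is fatal */
	features = btrfs_super_incompat_flags(disk_super) &
		~BTRFS_FEATURE_INCOMPAT_SUPP;
	if (features) {
		printk(KERN_ERR "BTRFS: couldn't mount because of "
		       "unsupported optional features (%Lx).\n",
		       (unsigned long long)features);
		err = -EINVAL;
		goto fail;
	}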
@@ -1182,7 +1184,6 @@ struct btrfs_root {
 #define BTRFS_INODE_NOATIME		(1 << 9)
 #define BTRFS_INODE_DIRSYNC		(1 << 10)
 
-
 /* some macros to generate set/get funcs for the struct fields. This
  * assumes there is a lefoo_to_cpu for every type, so lets make a simple
  * one for u8:
@@ -1842,7 +1843,7 @@ BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block,
 BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block,
			 compat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block,
-			 compat_flags, 64);
+			 compat_ro_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
			 incompat_flags, 64);
 BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
@@ -2310,7 +2311,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end);
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
@@ -2335,7 +2337,7 @@ int btrfs_init_cachep(void);
 void btrfs_destroy_cachep(void);
 long btrfs_ioctl_trans_end(struct file *file);
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root);
+			 struct btrfs_root *root, int *was_new);
 int btrfs_commit_write(struct file *file, struct page *page,
		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2386,7 +2388,6 @@ void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
 ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
 
 /* super.c */
-u64 btrfs_parse_size(char *str);
 int btrfs_parse_options(struct btrfs_root *root, char *options);
 int btrfs_sync_fs(struct super_block *sb, int wait);
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0427183e3e05..11d0ad30e203 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -263,13 +263,15 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 static int verify_parent_transid(struct extent_io_tree *io_tree,
				 struct extent_buffer *eb, u64 parent_transid)
 {
+	struct extent_state *cached_state = NULL;
 	int ret;
 
 	if (!parent_transid || btrfs_header_generation(eb) == parent_transid)
 		return 0;
 
-	lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS);
-	if (extent_buffer_uptodate(io_tree, eb) &&
+	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
+			 0, &cached_state, GFP_NOFS);
+	if (extent_buffer_uptodate(io_tree, eb, cached_state) &&
	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
 		goto out;
@@ -282,10 +284,10 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
		       (unsigned long long)btrfs_header_generation(eb));
 	}
 	ret = 1;
-	clear_extent_buffer_uptodate(io_tree, eb);
+	clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
 out:
-	unlock_extent(io_tree, eb->start, eb->start + eb->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, eb->start, eb->start + eb->len - 1,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -2497,7 +2499,8 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid)
 	int ret;
 	struct inode *btree_inode = buf->first_page->mapping->host;
 
-	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf);
+	ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf,
+				     NULL);
 	if (!ret)
 		return ret;
 
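These disk-io.c hunks show the pattern this series threads through the whole extent-state API: lock_extent_bits() hands back a referenced pointer to the extent_state it locked, the caller feeds that pointer into later lookups (extent_buffer_uptodate(), clear_extent_buffer_uptodate()) so they can skip the rbtree search, and unlock_extent_cached() both unlocks the range and drops the reference. The caller-side shape, condensed from the hunk above:

	struct extent_state *cached_state = NULL;

	lock_extent_bits(io_tree, start, end, 0, &cached_state, GFP_NOFS);
	/* ... work on [start, end] using cached_state as a search hint ... */
	unlock_extent_cached(io_tree, start, end, &cached_state, GFP_NOFS);

Callers with no state to offer simply pass NULL, as btrfs_buffer_uptodate() does above.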
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ba5c3fd5ab8c..951ef09b82f4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -95,7 +95,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
 
-	inode = btrfs_iget(sb, &key, root);
+	inode = btrfs_iget(sb, &key, root, NULL);
 	if (IS_ERR(inode)) {
 		err = PTR_ERR(inode);
 		goto fail;
@@ -223,7 +223,7 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
 
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.offset = 0;
-	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root));
+	dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
 	if (!IS_ERR(dentry))
 		dentry->d_op = &btrfs_dentry_operations;
 	return dentry;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 559f72489b3b..1727b26fb194 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -6561,6 +6561,7 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_file_extent_item *fi;
+	struct extent_state *cached_state = NULL;
 	u64 num_bytes;
 	u64 skip_objectid = 0;
 	u32 nritems;
@@ -6589,12 +6590,14 @@ static noinline int invalidate_extent_cache(struct btrfs_root *root,
 		}
 		num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
 
-		lock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			    key.offset + num_bytes - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
+				 key.offset + num_bytes - 1, 0, &cached_state,
+				 GFP_NOFS);
 		btrfs_drop_extent_cache(inode, key.offset,
					key.offset + num_bytes - 1, 1);
-		unlock_extent(&BTRFS_I(inode)->io_tree, key.offset,
-			      key.offset + num_bytes - 1, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
+				     key.offset + num_bytes - 1, &cached_state,
+				     GFP_NOFS);
 		cond_resched();
 	}
 	iput(inode);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7073cbb1b2d4..c99121ac5d6b 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -513,7 +513,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 	u64 last_end;
 	int err;
 	int set = 0;
+	int clear = 0;
 
+	if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
+		clear = 1;
 again:
 	if (!prealloc && (mask & __GFP_WAIT)) {
 		prealloc = alloc_extent_state(mask);
@@ -524,14 +527,20 @@ again:
 	spin_lock(&tree->lock);
 	if (cached_state) {
 		cached = *cached_state;
-		*cached_state = NULL;
-		cached_state = NULL;
+
+		if (clear) {
+			*cached_state = NULL;
+			cached_state = NULL;
+		}
+
 		if (cached && cached->tree && cached->start == start) {
-			atomic_dec(&cached->refs);
+			if (clear)
+				atomic_dec(&cached->refs);
 			state = cached;
 			goto hit_next;
 		}
-		free_extent_state(cached);
+		if (clear)
+			free_extent_state(cached);
 	}
 	/*
	 * this search will find the extents that end after
@@ -946,11 +955,11 @@ int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask)
+			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
-			      0, NULL, NULL, mask);
+			      0, NULL, cached_state, mask);
 }
 
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
@@ -984,10 +993,11 @@ int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 }
 
 static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
-				 u64 end, gfp_t mask)
+				 u64 end, struct extent_state **cached_state,
+				 gfp_t mask)
 {
 	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				NULL, mask);
+				cached_state, mask);
 }
 
 int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1171,7 +1181,8 @@ out:
  * 1 is returned if we find something, 0 if nothing was in the tree
  */
 static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
-					u64 *start, u64 *end, u64 max_bytes)
+					u64 *start, u64 *end, u64 max_bytes,
+					struct extent_state **cached_state)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1203,8 +1214,11 @@ static noinline u64 find_delalloc_range(struct extent_io_tree *tree,
			*end = state->end;
			goto out;
 		}
-		if (!found)
+		if (!found) {
			*start = state->start;
+			*cached_state = state;
+			atomic_inc(&state->refs);
+		}
 		found++;
 		*end = state->end;
 		cur_start = state->end + 1;
@@ -1336,10 +1350,11 @@ again:
 	delalloc_start = *start;
 	delalloc_end = 0;
 	found = find_delalloc_range(tree, &delalloc_start, &delalloc_end,
-				    max_bytes);
+				    max_bytes, &cached_state);
 	if (!found || delalloc_end <= *start) {
 		*start = delalloc_start;
 		*end = delalloc_end;
+		free_extent_state(cached_state);
 		return found;
 	}
 
@@ -1722,7 +1737,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	}
 
 	if (!uptodate) {
-		clear_extent_uptodate(tree, start, end, GFP_NOFS);
+		clear_extent_uptodate(tree, start, end, NULL, GFP_NOFS);
 		ClearPageUptodate(page);
 		SetPageError(page);
 	}
@@ -1750,7 +1765,8 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 static void end_bio_extent_readpage(struct bio *bio, int err)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
-	struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+	struct bio_vec *bvec = bio->bi_io_vec;
 	struct extent_io_tree *tree;
 	u64 start;
 	u64 end;
@@ -1773,7 +1789,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		else
			whole_page = 0;
 
-		if (--bvec >= bio->bi_io_vec)
+		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);
 
 		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
@@ -1818,7 +1834,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
			}
			check_page_locked(tree, page);
 		}
-	} while (bvec >= bio->bi_io_vec);
+	} while (bvec <= bvec_end);
 
 	bio_put(bio);
 }
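The three readpage hunks reverse the completion walk: the handler previously started at the last bio_vec and decremented toward bi_io_vec, so pages completed in descending file order; it now walks forward from bi_io_vec to bvec_end, completing pages in ascending offset order so the cached extent state from one page can carry over to the next. The resulting loop skeleton, condensed from the hunks above:

	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
	struct bio_vec *bvec = bio->bi_io_vec;

	do {
		struct page *page = bvec->bv_page;

		/* advance first, then prefetch the next page's flags */
		if (++bvec <= bvec_end)
			prefetchw(&bvec->bv_page->flags);

		/* ... per-page end_io work on 'page' ... */
	} while (bvec <= bvec_end);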
@@ -2704,6 +2720,7 @@ int extent_readpages(struct extent_io_tree *tree,
 int extent_invalidatepage(struct extent_io_tree *tree,
			  struct page *page, unsigned long offset)
 {
+	struct extent_state *cached_state = NULL;
 	u64 start = ((u64)page->index << PAGE_CACHE_SHIFT);
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 	size_t blocksize = page->mapping->host->i_sb->s_blocksize;
@@ -2712,12 +2729,12 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent(tree, start, end, GFP_NOFS);
+	lock_extent_bits(tree, start, end, 0, &cached_state, GFP_NOFS);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
			 EXTENT_DO_ACCOUNTING,
-			 1, 1, NULL, GFP_NOFS);
+			 1, 1, &cached_state, GFP_NOFS);
 	return 0;
 }
 
@@ -2920,16 +2937,17 @@ sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
		     get_extent_t *get_extent)
 {
 	struct inode *inode = mapping->host;
+	struct extent_state *cached_state = NULL;
 	u64 start = iblock << inode->i_blkbits;
 	sector_t sector = 0;
 	size_t blksize = (1 << inode->i_blkbits);
 	struct extent_map *em;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
+			 0, &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, start, blksize, 0);
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
+			     start + blksize - 1, &cached_state, GFP_NOFS);
 	if (!em || IS_ERR(em))
 		return 0;
 
@@ -2951,6 +2969,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	u32 flags = 0;
 	u64 disko = 0;
 	struct extent_map *em = NULL;
+	struct extent_state *cached_state = NULL;
 	int end = 0;
 	u64 em_start = 0, em_len = 0;
 	unsigned long emflags;
@@ -2959,8 +2978,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (len == 0)
 		return -EINVAL;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		    GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
+			 &cached_state, GFP_NOFS);
 	em = get_extent(inode, NULL, 0, off, max - off, 0);
 	if (!em)
 		goto out;
@@ -3023,8 +3042,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 out_free:
 	free_extent_map(em);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, start, start + len,
-		      GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len,
+			     &cached_state, GFP_NOFS);
 	return ret;
 }
 
@@ -3264,7 +3283,8 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 }
 
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb)
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state)
 {
 	unsigned long i;
 	struct page *page;
@@ -3274,7 +3294,7 @@ int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
 	clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
 
 	clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
-			      GFP_NOFS);
+			      cached_state, GFP_NOFS);
 	for (i = 0; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
 		if (page)
@@ -3334,7 +3354,8 @@ int extent_range_uptodate(struct extent_io_tree *tree,
 }
 
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb)
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state)
 {
 	int ret = 0;
 	unsigned long num_pages;
@@ -3346,7 +3367,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 		return 1;
 
 	ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1,
-			     EXTENT_UPTODATE, 1, NULL);
+			     EXTENT_UPTODATE, 1, cached_state);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 36de250a7b2b..bbab4813646f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -163,6 +163,8 @@ int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
		     int bits, struct extent_state **cached, gfp_t mask);
 int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask);
+int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
+			 struct extent_state **cached, gfp_t mask);
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
@@ -196,7 +198,7 @@ int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
 int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			gfp_t mask);
+			struct extent_state **cached_state, gfp_t mask);
 int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
		       gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -281,9 +283,11 @@ int test_extent_buffer_dirty(struct extent_io_tree *tree,
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
			       struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
-				 struct extent_buffer *eb);
+				 struct extent_buffer *eb,
+				 struct extent_state **cached_state);
 int extent_buffer_uptodate(struct extent_io_tree *tree,
-			   struct extent_buffer *eb);
+			   struct extent_buffer *eb,
+			   struct extent_state *cached_state);
 int map_extent_buffer(struct extent_buffer *eb, unsigned long offset,
		      unsigned long min_len, char **token, char **map,
		      unsigned long *map_start,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 6ed434ac037f..ee3323c7fc1c 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -123,7 +123,8 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
 
 	end_of_last_block = start_pos + num_bytes - 1;
-	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
+	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
+					NULL);
 	if (err)
 		return err;
 
@@ -753,6 +754,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
			 loff_t pos, unsigned long first_index,
			 unsigned long last_index, size_t write_bytes)
 {
+	struct extent_state *cached_state = NULL;
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
@@ -781,16 +783,18 @@ again:
 	}
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(inode)->io_tree,
-			    start_pos, last_pos - 1, GFP_NOFS);
+		lock_extent_bits(&BTRFS_I(inode)->io_tree,
+				 start_pos, last_pos - 1, 0, &cached_state,
+				 GFP_NOFS);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
							    last_pos - 1);
 		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
-			unlock_extent(&BTRFS_I(inode)->io_tree,
-				      start_pos, last_pos - 1, GFP_NOFS);
+			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+					     start_pos, last_pos - 1,
+					     &cached_state, GFP_NOFS);
			for (i = 0; i < num_pages; i++) {
				unlock_page(pages[i]);
				page_cache_release(pages[i]);
@@ -802,12 +806,13 @@ again:
 		if (ordered)
			btrfs_put_ordered_extent(ordered);
 
-		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
-				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
-				  EXTENT_DO_ACCOUNTING,
-				  GFP_NOFS);
-		unlock_extent(&BTRFS_I(inode)->io_tree,
-			      start_pos, last_pos - 1, GFP_NOFS);
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos,
+				 last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
+				 GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree,
+				     start_pos, last_pos - 1, &cached_state,
+				     GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
 		clear_page_dirty_for_io(pages[i]);
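prepare_pages() also illustrates the standard btrfs ordering dance, now carrying a cached state through it: lock the byte range, look for a running ordered extent that overlaps it, and if one is found drop everything, wait it out, and retry from the top. A condensed sketch of that shape (page locking and release elided; the wait call is an assumption based on the surrounding code, not shown in the hunk):

	again:
		lock_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
				 last_pos - 1, 0, &cached_state, GFP_NOFS);
		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos - 1);
		if (ordered &&
		    ordered->file_offset + ordered->len > start_pos &&
		    ordered->file_offset < last_pos) {
			btrfs_put_ordered_extent(ordered);
			unlock_extent_cached(&BTRFS_I(inode)->io_tree,
					     start_pos, last_pos - 1,
					     &cached_state, GFP_NOFS);
			btrfs_wait_ordered_range(inode, start_pos,
						 last_pos - start_pos);
			goto again;
		}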
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c41db6d45ab6..02bb099845fd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -379,7 +379,8 @@ again:
	 * change at any time if we discover bad compression ratios.
	 */
 	if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
-	    btrfs_test_opt(root, COMPRESS)) {
+	    (btrfs_test_opt(root, COMPRESS) ||
+	     (BTRFS_I(inode)->force_compress))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
 
@@ -483,8 +484,10 @@ again:
 		nr_pages_ret = 0;
 
 		/* flag the file so we don't compress in the future */
-		if (!btrfs_test_opt(root, FORCE_COMPRESS))
+		if (!btrfs_test_opt(root, FORCE_COMPRESS) &&
+		    !(BTRFS_I(inode)->force_compress)) {
			BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+		}
 	}
 	if (will_compress) {
 		*num_added += 1;
@@ -570,8 +573,8 @@ retry:
 		unsigned long nr_written = 0;
 
 		lock_extent(io_tree, async_extent->start,
-				    async_extent->start +
-				    async_extent->ram_size - 1, GFP_NOFS);
+			    async_extent->start +
+			    async_extent->ram_size - 1, GFP_NOFS);
 
 		/* allocate blocks */
 		ret = cow_file_range(inode, async_cow->locked_page,
@@ -1211,7 +1214,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 	else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)
 		ret = run_delalloc_nocow(inode, locked_page, start, end,
					 page_started, 0, nr_written);
-	else if (!btrfs_test_opt(root, COMPRESS))
+	else if (!btrfs_test_opt(root, COMPRESS) &&
+		 !(BTRFS_I(inode)->force_compress))
 		ret = cow_file_range(inode, locked_page, start, end,
				     page_started, nr_written, 1);
 	else
@@ -1508,12 +1512,13 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end)
+int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
+			      struct extent_state **cached_state)
 {
 	if ((end & (PAGE_CACHE_SIZE - 1)) == 0)
 		WARN_ON(1);
 	return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end,
-				   GFP_NOFS);
+				   cached_state, GFP_NOFS);
 }
 
 /* see btrfs_writepage_start_hook for details on why this is required */
@@ -1526,6 +1531,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
 {
 	struct btrfs_writepage_fixup *fixup;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	struct page *page;
 	struct inode *inode;
 	u64 page_start;
@@ -1544,7 +1550,8 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+			 &cached_state, GFP_NOFS);
 
 	/* already ordered? We're done */
 	if (PagePrivate2(page))
@@ -1552,17 +1559,18 @@ again:
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, page_start,
-			      page_end, GFP_NOFS);
+		unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start,
+				     page_end, &cached_state, GFP_NOFS);
 		unlock_page(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
 		goto again;
 	}
 
-	btrfs_set_extent_delalloc(inode, page_start, page_end);
+	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
-	unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			     &cached_state, GFP_NOFS);
 out_page:
 	unlock_page(page);
 	page_cache_release(page);
@@ -1691,14 +1699,14 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct extent_state *cached_state = NULL;
 	int compressed = 0;
 	int ret;
 
-	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
+					     end - start + 1);
 	if (!ret)
 		return 0;
-
-	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
 	BUG_ON(!ordered_extent);
 
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
@@ -1713,9 +1721,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		goto out;
 	}
 
-	lock_extent(io_tree, ordered_extent->file_offset,
-		    ordered_extent->file_offset + ordered_extent->len - 1,
-		    GFP_NOFS);
+	lock_extent_bits(io_tree, ordered_extent->file_offset,
+			 ordered_extent->file_offset + ordered_extent->len - 1,
+			 0, &cached_state, GFP_NOFS);
 
 	trans = btrfs_join_transaction(root, 1);
 
@@ -1742,9 +1750,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
					   ordered_extent->len);
 		BUG_ON(ret);
 	}
-	unlock_extent(io_tree, ordered_extent->file_offset,
-		      ordered_extent->file_offset + ordered_extent->len - 1,
-		      GFP_NOFS);
+	unlock_extent_cached(io_tree, ordered_extent->file_offset,
+			     ordered_extent->file_offset +
+			     ordered_extent->len - 1, &cached_state, GFP_NOFS);
+
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
			  &ordered_extent->list);
 
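The first btrfs_finish_ordered_io() hunk changes the contract of btrfs_dec_test_ordered_pending(): it now returns the ordered extent it found through a new out parameter, so the old separate btrfs_lookup_ordered_extent() call, a second tree search and refcount round-trip on the same range, goes away. The matching change lives in fs/btrfs/ordered-data.c, which appears in the diffstat but is not quoted here. The new calling convention, as used above:

	struct btrfs_ordered_extent *ordered_extent = NULL;

	ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
					     end - start + 1);
	if (!ret)
		return 0;		/* I/O on the range still pending */
	BUG_ON(!ordered_extent);	/* nonzero ret implies it was found */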
@@ -2153,7 +2162,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-		inode = btrfs_iget(root->fs_info->sb, &found_key, root);
+		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
 		if (IS_ERR(inode))
			break;
 
@@ -3081,6 +3090,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct btrfs_ordered_extent *ordered;
+	struct extent_state *cached_state = NULL;
 	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
@@ -3127,12 +3137,14 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
+			 GFP_NOFS);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		unlock_page(page);
 		page_cache_release(page);
 		btrfs_start_ordered_extent(inode, ordered, 1);
@@ -3140,13 +3152,15 @@ again:
 		goto again;
 	}
 
-	clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
-			  EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
-			  GFP_NOFS);
+	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
+			 0, 0, &cached_state, GFP_NOFS);
 
-	ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
+	ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
+					&cached_state);
 	if (ret) {
-		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_extent_cached(io_tree, page_start, page_end,
+				     &cached_state, GFP_NOFS);
 		goto out_unlock;
 	}
 
@@ -3159,7 +3173,8 @@ again:
 	}
 	ClearPageChecked(page);
 	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	unlock_extent_cached(io_tree, page_start, page_end, &cached_state,
+			     GFP_NOFS);
 
 out_unlock:
 	if (ret)
@@ -3177,6 +3192,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map *em;
+	struct extent_state *cached_state = NULL;
 	u64 mask = root->sectorsize - 1;
 	u64 hole_start = (inode->i_size + mask) & ~mask;
 	u64 block_end = (size + mask) & ~mask;
@@ -3192,11 +3208,13 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		struct btrfs_ordered_extent *ordered;
 		btrfs_wait_ordered_range(inode, hole_start,
					 block_end - hole_start);
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+				 &cached_state, GFP_NOFS);
 		ordered = btrfs_lookup_ordered_extent(inode, hole_start);
 		if (!ordered)
			break;
-		unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+		unlock_extent_cached(io_tree, hole_start, block_end - 1,
+				     &cached_state, GFP_NOFS);
 		btrfs_put_ordered_extent(ordered);
 	}
 
@@ -3241,7 +3259,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
 		break;
 	}
 
-	unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
+	unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
+			     GFP_NOFS);
 	return err;
 }
 
@@ -3639,6 +3658,7 @@ static noinline void init_btrfs_i(struct inode *inode)
 	bi->index_cnt = (u64)-1;
 	bi->last_unlink_trans = 0;
 	bi->ordered_data_close = 0;
+	bi->force_compress = 0;
 	extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
 	extent_io_tree_init(&BTRFS_I(inode)->io_tree,
			    inode->i_mapping, GFP_NOFS);
@@ -3687,7 +3707,7 @@ static struct inode *btrfs_iget_locked(struct super_block *s,
  * Returns in *is_new if the inode was read from disk
  */
 struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
-			 struct btrfs_root *root)
+			 struct btrfs_root *root, int *new)
 {
 	struct inode *inode;
 
@@ -3702,6 +3722,8 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
 
 		inode_tree_add(inode);
 		unlock_new_inode(inode);
+		if (new)
+			*new = 1;
 	}
 
 	return inode;
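btrfs_iget() grows an optional out parameter reporting whether the inode had to be read from disk rather than found in the inode cache; every caller shown in this diff passes NULL. A hedged usage sketch for a caller that does care (variable names illustrative):

	int was_new = 0;
	struct inode *inode;

	inode = btrfs_iget(sb, &key, root, &was_new);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	if (was_new) {
		/* freshly read from disk, not an icache hit */
	}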
@@ -3754,7 +3776,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3754 return NULL; 3776 return NULL;
3755 3777
3756 if (location.type == BTRFS_INODE_ITEM_KEY) { 3778 if (location.type == BTRFS_INODE_ITEM_KEY) {
3757 inode = btrfs_iget(dir->i_sb, &location, root); 3779 inode = btrfs_iget(dir->i_sb, &location, root, NULL);
3758 return inode; 3780 return inode;
3759 } 3781 }
3760 3782
@@ -3769,7 +3791,7 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
3769 else 3791 else
3770 inode = new_simple_dir(dir->i_sb, &location, sub_root); 3792 inode = new_simple_dir(dir->i_sb, &location, sub_root);
3771 } else { 3793 } else {
3772 inode = btrfs_iget(dir->i_sb, &location, sub_root); 3794 inode = btrfs_iget(dir->i_sb, &location, sub_root, NULL);
3773 } 3795 }
3774 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 3796 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
3775 3797
@@ -4501,7 +4523,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4501 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); 4523 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4502 if (err) { 4524 if (err) {
4503 err = -ENOSPC; 4525 err = -ENOSPC;
4504 goto out_unlock; 4526 goto out_fail;
4505 } 4527 }
4506 4528
4507 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4529 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
@@ -4979,6 +5001,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4979{ 5001{
4980 struct extent_io_tree *tree; 5002 struct extent_io_tree *tree;
4981 struct btrfs_ordered_extent *ordered; 5003 struct btrfs_ordered_extent *ordered;
5004 struct extent_state *cached_state = NULL;
4982 u64 page_start = page_offset(page); 5005 u64 page_start = page_offset(page);
4983 u64 page_end = page_start + PAGE_CACHE_SIZE - 1; 5006 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
4984 5007
@@ -4997,7 +5020,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
4997 btrfs_releasepage(page, GFP_NOFS); 5020 btrfs_releasepage(page, GFP_NOFS);
4998 return; 5021 return;
4999 } 5022 }
5000 lock_extent(tree, page_start, page_end, GFP_NOFS); 5023 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5024 GFP_NOFS);
5001 ordered = btrfs_lookup_ordered_extent(page->mapping->host, 5025 ordered = btrfs_lookup_ordered_extent(page->mapping->host,
5002 page_offset(page)); 5026 page_offset(page));
5003 if (ordered) { 5027 if (ordered) {
@@ -5008,7 +5032,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5008 clear_extent_bit(tree, page_start, page_end, 5032 clear_extent_bit(tree, page_start, page_end,
5009 EXTENT_DIRTY | EXTENT_DELALLOC | 5033 EXTENT_DIRTY | EXTENT_DELALLOC |
5010 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0, 5034 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING, 1, 0,
5011 NULL, GFP_NOFS); 5035 &cached_state, GFP_NOFS);
5012 /* 5036 /*
5013 * whoever cleared the private bit is responsible 5037 * whoever cleared the private bit is responsible
5014 * for the finish_ordered_io 5038 * for the finish_ordered_io
@@ -5018,11 +5042,13 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
5018 page_start, page_end); 5042 page_start, page_end);
5019 } 5043 }
5020 btrfs_put_ordered_extent(ordered); 5044 btrfs_put_ordered_extent(ordered);
5021 lock_extent(tree, page_start, page_end, GFP_NOFS); 5045 cached_state = NULL;
5046 lock_extent_bits(tree, page_start, page_end, 0, &cached_state,
5047 GFP_NOFS);
5022 } 5048 }
5023 clear_extent_bit(tree, page_start, page_end, 5049 clear_extent_bit(tree, page_start, page_end,
5024 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | 5050 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
5025 EXTENT_DO_ACCOUNTING, 1, 1, NULL, GFP_NOFS); 5051 EXTENT_DO_ACCOUNTING, 1, 1, &cached_state, GFP_NOFS);
5026 __btrfs_releasepage(page, GFP_NOFS); 5052 __btrfs_releasepage(page, GFP_NOFS);
5027 5053
5028 ClearPageChecked(page); 5054 ClearPageChecked(page);
@@ -5055,6 +5081,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5055 struct btrfs_root *root = BTRFS_I(inode)->root; 5081 struct btrfs_root *root = BTRFS_I(inode)->root;
5056 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 5082 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5057 struct btrfs_ordered_extent *ordered; 5083 struct btrfs_ordered_extent *ordered;
5084 struct extent_state *cached_state = NULL;
5058 char *kaddr; 5085 char *kaddr;
5059 unsigned long zero_start; 5086 unsigned long zero_start;
5060 loff_t size; 5087 loff_t size;
@@ -5093,7 +5120,8 @@ again:
5093 } 5120 }
5094 wait_on_page_writeback(page); 5121 wait_on_page_writeback(page);
5095 5122
5096 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 5123 lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state,
5124 GFP_NOFS);
5097 set_page_extent_mapped(page); 5125 set_page_extent_mapped(page);
5098 5126
5099 /* 5127 /*
@@ -5102,7 +5130,8 @@ again:
5102 */ 5130 */
5103 ordered = btrfs_lookup_ordered_extent(inode, page_start); 5131 ordered = btrfs_lookup_ordered_extent(inode, page_start);
5104 if (ordered) { 5132 if (ordered) {
5105 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5133 unlock_extent_cached(io_tree, page_start, page_end,
5134 &cached_state, GFP_NOFS);
5106 unlock_page(page); 5135 unlock_page(page);
5107 btrfs_start_ordered_extent(inode, ordered, 1); 5136 btrfs_start_ordered_extent(inode, ordered, 1);
5108 btrfs_put_ordered_extent(ordered); 5137 btrfs_put_ordered_extent(ordered);
@@ -5116,13 +5145,15 @@ again:
5116 * is probably a better way to do this, but for now keep consistent with 5145 * is probably a better way to do this, but for now keep consistent with
5117 * prepare_pages in the normal write path. 5146 * prepare_pages in the normal write path.
5118 */ 5147 */
5119 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 5148 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
5120 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING, 5149 EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING,
5121 GFP_NOFS); 5150 0, 0, &cached_state, GFP_NOFS);
5122 5151
5123 ret = btrfs_set_extent_delalloc(inode, page_start, page_end); 5152 ret = btrfs_set_extent_delalloc(inode, page_start, page_end,
5153 &cached_state);
5124 if (ret) { 5154 if (ret) {
5125 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5155 unlock_extent_cached(io_tree, page_start, page_end,
5156 &cached_state, GFP_NOFS);
5126 ret = VM_FAULT_SIGBUS; 5157 ret = VM_FAULT_SIGBUS;
5127 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 5158 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5128 goto out_unlock; 5159 goto out_unlock;
@@ -5148,7 +5179,7 @@ again:
5148 BTRFS_I(inode)->last_trans = root->fs_info->generation; 5179 BTRFS_I(inode)->last_trans = root->fs_info->generation;
5149 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; 5180 BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
5150 5181
5151 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 5182 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5152 5183
5153out_unlock: 5184out_unlock:
5154 btrfs_unreserve_metadata_for_delalloc(root, inode, 1); 5185 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
@@ -5827,6 +5858,7 @@ stop_trans:
5827static long btrfs_fallocate(struct inode *inode, int mode, 5858static long btrfs_fallocate(struct inode *inode, int mode,
5828 loff_t offset, loff_t len) 5859 loff_t offset, loff_t len)
5829{ 5860{
5861 struct extent_state *cached_state = NULL;
5830 u64 cur_offset; 5862 u64 cur_offset;
5831 u64 last_byte; 5863 u64 last_byte;
5832 u64 alloc_start; 5864 u64 alloc_start;
@@ -5865,16 +5897,17 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5865 /* the extent lock is ordered inside the running 5897 /* the extent lock is ordered inside the running
5866 * transaction 5898 * transaction
5867 */ 5899 */
5868 lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5900 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
5869 GFP_NOFS); 5901 locked_end, 0, &cached_state, GFP_NOFS);
5870 ordered = btrfs_lookup_first_ordered_extent(inode, 5902 ordered = btrfs_lookup_first_ordered_extent(inode,
5871 alloc_end - 1); 5903 alloc_end - 1);
5872 if (ordered && 5904 if (ordered &&
5873 ordered->file_offset + ordered->len > alloc_start && 5905 ordered->file_offset + ordered->len > alloc_start &&
5874 ordered->file_offset < alloc_end) { 5906 ordered->file_offset < alloc_end) {
5875 btrfs_put_ordered_extent(ordered); 5907 btrfs_put_ordered_extent(ordered);
5876 unlock_extent(&BTRFS_I(inode)->io_tree, 5908 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
5877 alloc_start, locked_end, GFP_NOFS); 5909 alloc_start, locked_end,
5910 &cached_state, GFP_NOFS);
5878 /* 5911 /*
5879 * we can't wait on the range with the transaction 5912 * we can't wait on the range with the transaction
5880 * running or with the extent lock held 5913 * running or with the extent lock held
@@ -5916,8 +5949,8 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5916 break; 5949 break;
5917 } 5950 }
5918 } 5951 }
5919 unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 5952 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5920 GFP_NOFS); 5953 &cached_state, GFP_NOFS);
5921 5954
5922 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 5955 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
5923 alloc_end - alloc_start); 5956 alloc_end - alloc_start);
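A recurring change in this commit is the switch from lock_extent()/unlock_extent() to lock_extent_bits()/unlock_extent_cached(): the locked extent_state is handed back to the caller so the unlock can reuse it instead of searching the io tree a second time. A minimal sketch of the pattern (the helpers are the real btrfs ones; the surrounding variables are illustrative):

	struct extent_state *cached_state = NULL;

	lock_extent_bits(io_tree, start, end, 0, &cached_state, GFP_NOFS);
	/* ... work on [start, end] while the range is locked ... */
	unlock_extent_cached(io_tree, start, end, &cached_state, GFP_NOFS);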
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 645a17927a8f..2845c6ceecd2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -48,6 +48,7 @@
48#include "print-tree.h" 48#include "print-tree.h"
49#include "volumes.h" 49#include "volumes.h"
50#include "locking.h" 50#include "locking.h"
51#include "ctree.h"
51 52
52/* Mask out flags that are inappropriate for the given type of inode. */ 53/* Mask out flags that are inappropriate for the given type of inode. */
53static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -474,7 +475,79 @@ out_unlock:
474 return error; 475 return error;
475} 476}
476 477
477static int btrfs_defrag_file(struct file *file) 478static int should_defrag_range(struct inode *inode, u64 start, u64 len,
479 int thresh, u64 *last_len, u64 *skip,
480 u64 *defrag_end)
481{
482 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
483 struct extent_map *em = NULL;
484 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
485 int ret = 1;
486
487
488 if (thresh == 0)
489 thresh = 256 * 1024;
490
491 /*
492	 * make sure that once we start defragging an extent, we keep on
493 * defragging it
494 */
495 if (start < *defrag_end)
496 return 1;
497
498 *skip = 0;
499
500 /*
501 * hopefully we have this extent in the tree already, try without
502 * the full extent lock
503 */
504 read_lock(&em_tree->lock);
505 em = lookup_extent_mapping(em_tree, start, len);
506 read_unlock(&em_tree->lock);
507
508 if (!em) {
509 /* get the big lock and read metadata off disk */
510 lock_extent(io_tree, start, start + len - 1, GFP_NOFS);
511 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
512 unlock_extent(io_tree, start, start + len - 1, GFP_NOFS);
513
514 if (!em)
515 return 0;
516 }
517
518 /* this will cover holes, and inline extents */
519 if (em->block_start >= EXTENT_MAP_LAST_BYTE)
520 ret = 0;
521
522 /*
523 * we hit a real extent, if it is big don't bother defragging it again
524 */
525 if ((*last_len == 0 || *last_len >= thresh) && em->len >= thresh)
526 ret = 0;
527
528 /*
529 * last_len ends up being a counter of how many bytes we've defragged.
530 * every time we choose not to defrag an extent, we reset *last_len
531 * so that the next tiny extent will force a defrag.
532 *
533 * The end result of this is that tiny extents before a single big
534 * extent will force at least part of that big extent to be defragged.
535 */
536 if (ret) {
537 *last_len += len;
538 *defrag_end = extent_map_end(em);
539 } else {
540 *last_len = 0;
541 *skip = extent_map_end(em);
542 *defrag_end = 0;
543 }
544
545 free_extent_map(em);
546 return ret;
547}
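The interplay of last_len, skip and defrag_end above is easier to see in isolation. The following standalone sketch (not kernel code; the extent lookup is reduced to a hard-coded list of (start, len) pairs) reproduces only the threshold decision and prints which extents would be rewritten. Note how the two tiny extents force the first big extent to be defragged as well, exactly as the comment in should_defrag_range() describes:

	#include <stdio.h>
	#include <stdint.h>

	#define THRESH (256 * 1024)

	/* mirrors the ret/last_len/defrag_end bookkeeping above */
	static int should_defrag(uint64_t start, uint64_t len,
				 uint64_t *last_len, uint64_t *defrag_end)
	{
		/* once we start defragging an extent, keep going to its end */
		if (start < *defrag_end)
			return 1;

		/* a big extent after nothing-but-big extents is left alone */
		if ((*last_len == 0 || *last_len >= THRESH) && len >= THRESH) {
			*last_len = 0;
			*defrag_end = 0;
			return 0;
		}

		*last_len += len;
		*defrag_end = start + len;
		return 1;
	}

	int main(void)
	{
		/* hypothetical layout: two tiny extents, then two big ones */
		uint64_t ext[4][2] = {
			{ 0, 4096 }, { 4096, 4096 },
			{ 8192, 1 << 20 }, { 8192 + (1 << 20), 1 << 20 },
		};
		uint64_t last_len = 0, defrag_end = 0;
		int i;

		for (i = 0; i < 4; i++)
			printf("extent %d: %s\n", i,
			       should_defrag(ext[i][0], ext[i][1],
					     &last_len, &defrag_end) ?
			       "defrag" : "skip");
		return 0;
	}

This prints defrag, defrag, defrag, skip: the third (big) extent is rewritten because the small extents before it left last_len below the threshold.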
548
549static int btrfs_defrag_file(struct file *file,
550 struct btrfs_ioctl_defrag_range_args *range)
478{ 551{
479 struct inode *inode = fdentry(file)->d_inode; 552 struct inode *inode = fdentry(file)->d_inode;
480 struct btrfs_root *root = BTRFS_I(inode)->root; 553 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -486,37 +559,96 @@ static int btrfs_defrag_file(struct file *file)
486 unsigned long total_read = 0; 559 unsigned long total_read = 0;
487 u64 page_start; 560 u64 page_start;
488 u64 page_end; 561 u64 page_end;
562 u64 last_len = 0;
563 u64 skip = 0;
564 u64 defrag_end = 0;
489 unsigned long i; 565 unsigned long i;
490 int ret; 566 int ret;
491 567
492 ret = btrfs_check_data_free_space(root, inode, inode->i_size); 568 if (inode->i_size == 0)
493 if (ret) 569 return 0;
494 return -ENOSPC; 570
571 if (range->start + range->len > range->start) {
572 last_index = min_t(u64, inode->i_size - 1,
573 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
574 } else {
575 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
576 }
577
578 i = range->start >> PAGE_CACHE_SHIFT;
579 while (i <= last_index) {
580 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
581 PAGE_CACHE_SIZE,
582 range->extent_thresh,
583 &last_len, &skip,
584 &defrag_end)) {
585 unsigned long next;
586 /*
587	 * the should_defrag function tells us how much to skip;
588	 * bump our counter by the suggested amount
589 */
590 next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
591 i = max(i + 1, next);
592 continue;
593 }
495 594
496 mutex_lock(&inode->i_mutex);
497 last_index = inode->i_size >> PAGE_CACHE_SHIFT;
498 for (i = 0; i <= last_index; i++) {
499 if (total_read % ra_pages == 0) { 595 if (total_read % ra_pages == 0) {
500 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, 596 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
501 min(last_index, i + ra_pages - 1)); 597 min(last_index, i + ra_pages - 1));
502 } 598 }
503 total_read++; 599 total_read++;
600 mutex_lock(&inode->i_mutex);
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1;
603
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
605 if (ret) {
606 ret = -ENOSPC;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
504again: 617again:
618 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
620 ret = 0;
621 goto err_reservations;
622 }
623
505 page = grab_cache_page(inode->i_mapping, i); 624 page = grab_cache_page(inode->i_mapping, i);
506 if (!page) 625 if (!page)
507 goto out_unlock; 626 goto err_reservations;
627
508 if (!PageUptodate(page)) { 628 if (!PageUptodate(page)) {
509 btrfs_readpage(NULL, page); 629 btrfs_readpage(NULL, page);
510 lock_page(page); 630 lock_page(page);
511 if (!PageUptodate(page)) { 631 if (!PageUptodate(page)) {
512 unlock_page(page); 632 unlock_page(page);
513 page_cache_release(page); 633 page_cache_release(page);
514 goto out_unlock; 634 goto err_reservations;
515 } 635 }
516 } 636 }
517 637
638 if (page->mapping != inode->i_mapping) {
639 unlock_page(page);
640 page_cache_release(page);
641 goto again;
642 }
643
518 wait_on_page_writeback(page); 644 wait_on_page_writeback(page);
519 645
646 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode,
648 PAGE_CACHE_SIZE);
649 goto loop_unlock;
650 }
651
520 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 652 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
521 page_end = page_start + PAGE_CACHE_SIZE - 1; 653 page_end = page_start + PAGE_CACHE_SIZE - 1;
522 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 654 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
@@ -537,18 +669,54 @@ again:
537 * page if it is dirtied again later 669 * page if it is dirtied again later
538 */ 670 */
539 clear_page_dirty_for_io(page); 671 clear_page_dirty_for_io(page);
672 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
673 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
674 EXTENT_DO_ACCOUNTING, GFP_NOFS);
540 675
541 btrfs_set_extent_delalloc(inode, page_start, page_end); 676 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
677 ClearPageChecked(page);
542 set_page_dirty(page); 678 set_page_dirty(page);
543 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 679 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
680
681loop_unlock:
544 unlock_page(page); 682 unlock_page(page);
545 page_cache_release(page); 683 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex);
685
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
546 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++;
689 }
690
691 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
692 filemap_flush(inode->i_mapping);
693
694 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
695 /* the filemap_flush will queue IO into the worker threads, but
696 * we have to make sure the IO is actually started and that
697 * ordered extents get created before we return
698 */
699 atomic_inc(&root->fs_info->async_submit_draining);
700 while (atomic_read(&root->fs_info->nr_async_submits) ||
701 atomic_read(&root->fs_info->async_delalloc_pages)) {
702 wait_event(root->fs_info->async_submit_wait,
703 (atomic_read(&root->fs_info->nr_async_submits) == 0 &&
704 atomic_read(&root->fs_info->async_delalloc_pages) == 0));
705 }
706 atomic_dec(&root->fs_info->async_submit_draining);
707
708 mutex_lock(&inode->i_mutex);
709 BTRFS_I(inode)->force_compress = 0;
710 mutex_unlock(&inode->i_mutex);
547 } 711 }
548 712
549out_unlock:
550 mutex_unlock(&inode->i_mutex);
551 return 0; 713 return 0;
714
715err_reservations:
716 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret;
552} 720}
553 721
554static noinline int btrfs_ioctl_resize(struct btrfs_root *root, 722static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
@@ -608,7 +776,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
608 mod = 1; 776 mod = 1;
609 sizestr++; 777 sizestr++;
610 } 778 }
611 new_size = btrfs_parse_size(sizestr); 779 new_size = memparse(sizestr, NULL);
612 if (new_size == 0) { 780 if (new_size == 0) {
613 ret = -EINVAL; 781 ret = -EINVAL;
614 goto out_unlock; 782 goto out_unlock;
@@ -743,6 +911,327 @@ out:
743 return ret; 911 return ret;
744} 912}
745 913
914static noinline int key_in_sk(struct btrfs_key *key,
915 struct btrfs_ioctl_search_key *sk)
916{
917 struct btrfs_key test;
918 int ret;
919
920 test.objectid = sk->min_objectid;
921 test.type = sk->min_type;
922 test.offset = sk->min_offset;
923
924 ret = btrfs_comp_cpu_keys(key, &test);
925 if (ret < 0)
926 return 0;
927
928 test.objectid = sk->max_objectid;
929 test.type = sk->max_type;
930 test.offset = sk->max_offset;
931
932 ret = btrfs_comp_cpu_keys(key, &test);
933 if (ret > 0)
934 return 0;
935 return 1;
936}
937
938static noinline int copy_to_sk(struct btrfs_root *root,
939 struct btrfs_path *path,
940 struct btrfs_key *key,
941 struct btrfs_ioctl_search_key *sk,
942 char *buf,
943 unsigned long *sk_offset,
944 int *num_found)
945{
946 u64 found_transid;
947 struct extent_buffer *leaf;
948 struct btrfs_ioctl_search_header sh;
949 unsigned long item_off;
950 unsigned long item_len;
951 int nritems;
952 int i;
953 int slot;
954 int found = 0;
955 int ret = 0;
956
957 leaf = path->nodes[0];
958 slot = path->slots[0];
959 nritems = btrfs_header_nritems(leaf);
960
961 if (btrfs_header_generation(leaf) > sk->max_transid) {
962 i = nritems;
963 goto advance_key;
964 }
965 found_transid = btrfs_header_generation(leaf);
966
967 for (i = slot; i < nritems; i++) {
968 item_off = btrfs_item_ptr_offset(leaf, i);
969 item_len = btrfs_item_size_nr(leaf, i);
970
971 if (item_len > BTRFS_SEARCH_ARGS_BUFSIZE)
972 item_len = 0;
973
974 if (sizeof(sh) + item_len + *sk_offset >
975 BTRFS_SEARCH_ARGS_BUFSIZE) {
976 ret = 1;
977 goto overflow;
978 }
979
980 btrfs_item_key_to_cpu(leaf, key, i);
981 if (!key_in_sk(key, sk))
982 continue;
983
984 sh.objectid = key->objectid;
985 sh.offset = key->offset;
986 sh.type = key->type;
987 sh.len = item_len;
988 sh.transid = found_transid;
989
990 /* copy search result header */
991 memcpy(buf + *sk_offset, &sh, sizeof(sh));
992 *sk_offset += sizeof(sh);
993
994 if (item_len) {
995 char *p = buf + *sk_offset;
996 /* copy the item */
997 read_extent_buffer(leaf, p,
998 item_off, item_len);
999 *sk_offset += item_len;
1000 }
1001 found++;
1002
1003 if (*num_found >= sk->nr_items)
1004 break;
1005 }
1006advance_key:
1007 ret = 0;
1008 if (key->offset < (u64)-1 && key->offset < sk->max_offset)
1009 key->offset++;
1010 else if (key->type < (u8)-1 && key->type < sk->max_type) {
1011 key->offset = 0;
1012 key->type++;
1013 } else if (key->objectid < (u64)-1 && key->objectid < sk->max_objectid) {
1014 key->offset = 0;
1015 key->type = 0;
1016 key->objectid++;
1017 } else
1018 ret = 1;
1019overflow:
1020 *num_found += found;
1021 return ret;
1022}
1023
1024static noinline int search_ioctl(struct inode *inode,
1025 struct btrfs_ioctl_search_args *args)
1026{
1027 struct btrfs_root *root;
1028 struct btrfs_key key;
1029 struct btrfs_key max_key;
1030 struct btrfs_path *path;
1031 struct btrfs_ioctl_search_key *sk = &args->key;
1032 struct btrfs_fs_info *info = BTRFS_I(inode)->root->fs_info;
1033 int ret;
1034 int num_found = 0;
1035 unsigned long sk_offset = 0;
1036
1037 path = btrfs_alloc_path();
1038 if (!path)
1039 return -ENOMEM;
1040
1041 if (sk->tree_id == 0) {
1042 /* search the root of the inode that was passed */
1043 root = BTRFS_I(inode)->root;
1044 } else {
1045 key.objectid = sk->tree_id;
1046 key.type = BTRFS_ROOT_ITEM_KEY;
1047 key.offset = (u64)-1;
1048 root = btrfs_read_fs_root_no_name(info, &key);
1049 if (IS_ERR(root)) {
1050 printk(KERN_ERR "could not find root %llu\n",
1051 sk->tree_id);
1052 btrfs_free_path(path);
1053 return -ENOENT;
1054 }
1055 }
1056
1057 key.objectid = sk->min_objectid;
1058 key.type = sk->min_type;
1059 key.offset = sk->min_offset;
1060
1061 max_key.objectid = sk->max_objectid;
1062 max_key.type = sk->max_type;
1063 max_key.offset = sk->max_offset;
1064
1065 path->keep_locks = 1;
1066
1067	while (1) {
1068 ret = btrfs_search_forward(root, &key, &max_key, path, 0,
1069 sk->min_transid);
1070 if (ret != 0) {
1071 if (ret > 0)
1072 ret = 0;
1073 goto err;
1074 }
1075 ret = copy_to_sk(root, path, &key, sk, args->buf,
1076 &sk_offset, &num_found);
1077 btrfs_release_path(root, path);
1078 if (ret || num_found >= sk->nr_items)
1079 break;
1080
1081 }
1082 ret = 0;
1083err:
1084 sk->nr_items = num_found;
1085 btrfs_free_path(path);
1086 return ret;
1087}
1088
1089static noinline int btrfs_ioctl_tree_search(struct file *file,
1090 void __user *argp)
1091{
1092 struct btrfs_ioctl_search_args *args;
1093 struct inode *inode;
1094 int ret;
1095
1096 if (!capable(CAP_SYS_ADMIN))
1097 return -EPERM;
1098
1099 args = kmalloc(sizeof(*args), GFP_KERNEL);
1100 if (!args)
1101 return -ENOMEM;
1102
1103 if (copy_from_user(args, argp, sizeof(*args))) {
1104 kfree(args);
1105 return -EFAULT;
1106 }
1107 inode = fdentry(file)->d_inode;
1108 ret = search_ioctl(inode, args);
1109 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1110 ret = -EFAULT;
1111 kfree(args);
1112 return ret;
1113}
1114
1115/*
1116 * Search INODE_REFs to identify the path name of the 'dirid' directory
1117 * in a 'tree_id' tree, and store that path name in 'name'.
1118 */
1119static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1120 u64 tree_id, u64 dirid, char *name)
1121{
1122 struct btrfs_root *root;
1123 struct btrfs_key key;
1124 char *ptr;
1125 int ret = -1;
1126 int slot;
1127 int len;
1128 int total_len = 0;
1129 struct btrfs_inode_ref *iref;
1130 struct extent_buffer *l;
1131 struct btrfs_path *path;
1132
1133 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1134		name[0] = '\0';
1135 return 0;
1136 }
1137
1138 path = btrfs_alloc_path();
1139 if (!path)
1140 return -ENOMEM;
1141
1142 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX];
1143
1144 key.objectid = tree_id;
1145 key.type = BTRFS_ROOT_ITEM_KEY;
1146 key.offset = (u64)-1;
1147 root = btrfs_read_fs_root_no_name(info, &key);
1148 if (IS_ERR(root)) {
1149 printk(KERN_ERR "could not find root %llu\n", tree_id);
1150 ret = -ENOENT;
1151 goto out;
1152 }
1153
1154 key.objectid = dirid;
1155 key.type = BTRFS_INODE_REF_KEY;
1156 key.offset = (u64)-1;
1157
1158	while (1) {
1159 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1160 if (ret < 0)
1161 goto out;
1162
1163 l = path->nodes[0];
1164 slot = path->slots[0];
1165 if (ret > 0 && slot > 0)
1166 slot--;
1167 btrfs_item_key_to_cpu(l, &key, slot);
1168
1169 if (ret > 0 && (key.objectid != dirid ||
1170 key.type != BTRFS_INODE_REF_KEY)) {
1171 ret = -ENOENT;
1172 goto out;
1173 }
1174
1175 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1176 len = btrfs_inode_ref_name_len(l, iref);
1177 ptr -= len + 1;
1178 total_len += len + 1;
1179 if (ptr < name)
1180 goto out;
1181
1182 *(ptr + len) = '/';
1183		read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1184
1185 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1186 break;
1187
1188 btrfs_release_path(root, path);
1189 key.objectid = key.offset;
1190 key.offset = (u64)-1;
1191 dirid = key.objectid;
1192
1193 }
1194 if (ptr < name)
1195 goto out;
1196 memcpy(name, ptr, total_len);
1197	name[total_len] = '\0';
1198 ret = 0;
1199out:
1200 btrfs_free_path(path);
1201 return ret;
1202}
1203
1204static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1205 void __user *argp)
1206{
1207 struct btrfs_ioctl_ino_lookup_args *args;
1208 struct inode *inode;
1209 int ret;
1210
1211 if (!capable(CAP_SYS_ADMIN))
1212 return -EPERM;
1213
1214	args = kmalloc(sizeof(*args), GFP_KERNEL);
	if (!args)
		return -ENOMEM;
1215	if (copy_from_user(args, argp, sizeof(*args))) {
1216 kfree(args);
1217 return -EFAULT;
1218 }
1219 inode = fdentry(file)->d_inode;
1220
1221 if (args->treeid == 0)
1222 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
1223
1224 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
1225 args->treeid, args->objectid,
1226 args->name);
1227
1228 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
1229 ret = -EFAULT;
1230
1231 kfree(args);
1232 return ret;
1233}
1234
746static noinline int btrfs_ioctl_snap_destroy(struct file *file, 1235static noinline int btrfs_ioctl_snap_destroy(struct file *file,
747 void __user *arg) 1236 void __user *arg)
748{ 1237{
@@ -849,10 +1338,11 @@ out:
849 return err; 1338 return err;
850} 1339}
851 1340
852static int btrfs_ioctl_defrag(struct file *file) 1341static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
853{ 1342{
854 struct inode *inode = fdentry(file)->d_inode; 1343 struct inode *inode = fdentry(file)->d_inode;
855 struct btrfs_root *root = BTRFS_I(inode)->root; 1344 struct btrfs_root *root = BTRFS_I(inode)->root;
1345 struct btrfs_ioctl_defrag_range_args *range;
856 int ret; 1346 int ret;
857 1347
858 ret = mnt_want_write(file->f_path.mnt); 1348 ret = mnt_want_write(file->f_path.mnt);
@@ -873,7 +1363,30 @@ static int btrfs_ioctl_defrag(struct file *file)
873 ret = -EINVAL; 1363 ret = -EINVAL;
874 goto out; 1364 goto out;
875 } 1365 }
876 btrfs_defrag_file(file); 1366
1367 range = kzalloc(sizeof(*range), GFP_KERNEL);
1368 if (!range) {
1369 ret = -ENOMEM;
1370 goto out;
1371 }
1372
1373 if (argp) {
1374 if (copy_from_user(range, argp,
1375 sizeof(*range))) {
1376 ret = -EFAULT;
1377				kfree(range);
				goto out;
1378			}
1379 /* compression requires us to start the IO */
1380 if ((range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
1381 range->flags |= BTRFS_DEFRAG_RANGE_START_IO;
1382 range->extent_thresh = (u32)-1;
1383 }
1384 } else {
1385 /* the rest are all set to zero by kzalloc */
1386 range->len = (u64)-1;
1387 }
1388 btrfs_defrag_file(file, range);
1389 kfree(range);
877 break; 1390 break;
878 } 1391 }
879out: 1392out:
@@ -1274,6 +1787,157 @@ out:
1274 return ret; 1787 return ret;
1275} 1788}
1276 1789
1790static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1791{
1792 struct inode *inode = fdentry(file)->d_inode;
1793 struct btrfs_root *root = BTRFS_I(inode)->root;
1794 struct btrfs_root *new_root;
1795 struct btrfs_dir_item *di;
1796 struct btrfs_trans_handle *trans;
1797 struct btrfs_path *path;
1798 struct btrfs_key location;
1799 struct btrfs_disk_key disk_key;
1800 struct btrfs_super_block *disk_super;
1801 u64 features;
1802 u64 objectid = 0;
1803 u64 dir_id;
1804
1805 if (!capable(CAP_SYS_ADMIN))
1806 return -EPERM;
1807
1808 if (copy_from_user(&objectid, argp, sizeof(objectid)))
1809 return -EFAULT;
1810
1811 if (!objectid)
1812 objectid = root->root_key.objectid;
1813
1814 location.objectid = objectid;
1815 location.type = BTRFS_ROOT_ITEM_KEY;
1816 location.offset = (u64)-1;
1817
1818 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
1819 if (IS_ERR(new_root))
1820 return PTR_ERR(new_root);
1821
1822 if (btrfs_root_refs(&new_root->root_item) == 0)
1823 return -ENOENT;
1824
1825 path = btrfs_alloc_path();
1826 if (!path)
1827 return -ENOMEM;
1828 path->leave_spinning = 1;
1829
1830 trans = btrfs_start_transaction(root, 1);
1831 if (!trans) {
1832 btrfs_free_path(path);
1833 return -ENOMEM;
1834 }
1835
1836 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1837 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1838 dir_id, "default", 7, 1);
1839 if (!di) {
1840 btrfs_free_path(path);
1841 btrfs_end_transaction(trans, root);
1842 printk(KERN_ERR "Umm, you don't have the default dir item, "
1843 "this isn't going to work\n");
1844 return -ENOENT;
1845 }
1846
1847 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
1848 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
1849 btrfs_mark_buffer_dirty(path->nodes[0]);
1850 btrfs_free_path(path);
1851
1852 disk_super = &root->fs_info->super_copy;
1853 features = btrfs_super_incompat_flags(disk_super);
1854 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
1855 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
1856 btrfs_set_super_incompat_flags(disk_super, features);
1857 }
1858 btrfs_end_transaction(trans, root);
1859
1860 return 0;
1861}
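Assuming the usual userspace pattern for these ioctls, setting the default subvolume might look like the sketch below; the subvolume id 257 and the mount point are made up, and "ioctl.h" stands in for this header:

	#include <stdio.h>
	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include "ioctl.h"	/* BTRFS_IOC_DEFAULT_SUBVOL */

	int main(void)
	{
		__u64 objectid = 257;	/* hypothetical subvolume id */
		int fd = open("/mnt/btrfs", O_RDONLY);

		if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFAULT_SUBVOL, &objectid))
			perror("BTRFS_IOC_DEFAULT_SUBVOL");
		return 0;
	}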
1862
1863long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1864{
1865 struct btrfs_ioctl_space_args space_args;
1866 struct btrfs_ioctl_space_info space;
1867 struct btrfs_ioctl_space_info *dest;
1868 struct btrfs_ioctl_space_info *dest_orig;
1869 struct btrfs_ioctl_space_info *user_dest;
1870 struct btrfs_space_info *info;
1871 int alloc_size;
1872 int ret = 0;
1873 int slot_count = 0;
1874
1875 if (copy_from_user(&space_args,
1876 (struct btrfs_ioctl_space_args __user *)arg,
1877 sizeof(space_args)))
1878 return -EFAULT;
1879
1880 /* first we count slots */
1881 rcu_read_lock();
1882 list_for_each_entry_rcu(info, &root->fs_info->space_info, list)
1883 slot_count++;
1884 rcu_read_unlock();
1885
1886 /* space_slots == 0 means they are asking for a count */
1887 if (space_args.space_slots == 0) {
1888 space_args.total_spaces = slot_count;
1889 goto out;
1890 }
1891 alloc_size = sizeof(*dest) * slot_count;
1892 /* we generally have at most 6 or so space infos, one for each raid
1893 * level. So, a whole page should be more than enough for everyone
1894 */
1895 if (alloc_size > PAGE_CACHE_SIZE)
1896 return -ENOMEM;
1897
1898 space_args.total_spaces = 0;
1899 dest = kmalloc(alloc_size, GFP_NOFS);
1900 if (!dest)
1901 return -ENOMEM;
1902 dest_orig = dest;
1903
1904 /* now we have a buffer to copy into */
1905 rcu_read_lock();
1906 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) {
1907 /* make sure we don't copy more than we allocated
1908 * in our buffer
1909 */
1910 if (slot_count == 0)
1911 break;
1912 slot_count--;
1913
1914 /* make sure userland has enough room in their buffer */
1915 if (space_args.total_spaces >= space_args.space_slots)
1916 break;
1917
1918 space.flags = info->flags;
1919 space.total_bytes = info->total_bytes;
1920 space.used_bytes = info->bytes_used;
1921 memcpy(dest, &space, sizeof(space));
1922 dest++;
1923 space_args.total_spaces++;
1924 }
1925 rcu_read_unlock();
1926
1927 user_dest = (struct btrfs_ioctl_space_info *)
1928 (arg + sizeof(struct btrfs_ioctl_space_args));
1929
1930 if (copy_to_user(user_dest, dest_orig, alloc_size))
1931 ret = -EFAULT;
1932
1933 kfree(dest_orig);
1934out:
1935 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
1936 ret = -EFAULT;
1937
1938 return ret;
1939}
1940
1277/* 1941/*
1278 * there are many ways the trans_start and trans_end ioctls can lead 1942 * there are many ways the trans_start and trans_end ioctls can lead
1279 * to deadlocks. They should only be used by applications that 1943 * to deadlocks. They should only be used by applications that
@@ -1320,8 +1984,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1320 return btrfs_ioctl_snap_create(file, argp, 1); 1984 return btrfs_ioctl_snap_create(file, argp, 1);
1321 case BTRFS_IOC_SNAP_DESTROY: 1985 case BTRFS_IOC_SNAP_DESTROY:
1322 return btrfs_ioctl_snap_destroy(file, argp); 1986 return btrfs_ioctl_snap_destroy(file, argp);
1987 case BTRFS_IOC_DEFAULT_SUBVOL:
1988 return btrfs_ioctl_default_subvol(file, argp);
1323 case BTRFS_IOC_DEFRAG: 1989 case BTRFS_IOC_DEFRAG:
1324 return btrfs_ioctl_defrag(file); 1990 return btrfs_ioctl_defrag(file, NULL);
1991 case BTRFS_IOC_DEFRAG_RANGE:
1992 return btrfs_ioctl_defrag(file, argp);
1325 case BTRFS_IOC_RESIZE: 1993 case BTRFS_IOC_RESIZE:
1326 return btrfs_ioctl_resize(root, argp); 1994 return btrfs_ioctl_resize(root, argp);
1327 case BTRFS_IOC_ADD_DEV: 1995 case BTRFS_IOC_ADD_DEV:
@@ -1338,6 +2006,12 @@ long btrfs_ioctl(struct file *file, unsigned int
1338 return btrfs_ioctl_trans_start(file); 2006 return btrfs_ioctl_trans_start(file);
1339 case BTRFS_IOC_TRANS_END: 2007 case BTRFS_IOC_TRANS_END:
1340 return btrfs_ioctl_trans_end(file); 2008 return btrfs_ioctl_trans_end(file);
2009 case BTRFS_IOC_TREE_SEARCH:
2010 return btrfs_ioctl_tree_search(file, argp);
2011 case BTRFS_IOC_INO_LOOKUP:
2012 return btrfs_ioctl_ino_lookup(file, argp);
2013 case BTRFS_IOC_SPACE_INFO:
2014 return btrfs_ioctl_space_info(root, argp);
1341 case BTRFS_IOC_SYNC: 2015 case BTRFS_IOC_SYNC:
1342 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2016 btrfs_sync_fs(file->f_dentry->d_sb, 1);
1343 return 0; 2017 return 0;
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index bc49914475eb..424694aa517f 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -30,12 +30,114 @@ struct btrfs_ioctl_vol_args {
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid;
36 __u64 objectid;
37 char name[BTRFS_INO_LOOKUP_PATH_MAX];
38};
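A hedged sketch of how userspace might drive this lookup, assuming fd is an open descriptor somewhere on the filesystem and dirid is the directory inode to resolve (treeid 0 means "the tree of the fd passed in", matching the handler below):

	struct btrfs_ioctl_ino_lookup_args args;

	memset(&args, 0, sizeof(args));
	args.treeid = 0;		/* use the fd's own tree */
	args.objectid = dirid;		/* directory inode to resolve */

	if (ioctl(fd, BTRFS_IOC_INO_LOOKUP, &args) == 0)
		printf("path: %s\n", args.name);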
39
40struct btrfs_ioctl_search_key {
41 /* which root are we searching. 0 is the tree of tree roots */
42 __u64 tree_id;
43
44 /* keys returned will be >= min and <= max */
45 __u64 min_objectid;
46 __u64 max_objectid;
47
48 /* keys returned will be >= min and <= max */
49 __u64 min_offset;
50 __u64 max_offset;
51
52 /* max and min transids to search for */
53 __u64 min_transid;
54 __u64 max_transid;
55
56 /* keys returned will be >= min and <= max */
57 __u32 min_type;
58 __u32 max_type;
59
60 /*
61 * how many items did userland ask for, and how many are we
62 * returning
63 */
64 __u32 nr_items;
65
66 /* align to 64 bits */
67 __u32 unused;
68
69 /* some extra for later */
70 __u64 unused1;
71 __u64 unused2;
72 __u64 unused3;
73 __u64 unused4;
74};
75
76struct btrfs_ioctl_search_header {
77 __u64 transid;
78 __u64 objectid;
79 __u64 offset;
80 __u32 type;
81 __u32 len;
82};
83
84#define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key))
85/*
86 * the buf is an array of search headers, where
87 * each header is followed by the actual item.
88 * the type field is expanded to 32 bits for alignment.
89 */
90struct btrfs_ioctl_search_args {
91 struct btrfs_ioctl_search_key key;
92 char buf[BTRFS_SEARCH_ARGS_BUFSIZE];
93};
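Given that layout, a userspace consumer walks the buffer header by header, bumping its offset by sizeof(header) + sh->len each time, and resumes the search just past the last returned key. A hypothetical sketch (error handling and the full key-rollover logic across type/objectid are trimmed for brevity):

	#include <stdio.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <linux/types.h>
	#include "ioctl.h"	/* this header */

	static void dump_tree(int fd)
	{
		struct btrfs_ioctl_search_args args;
		struct btrfs_ioctl_search_key *sk = &args.key;
		struct btrfs_ioctl_search_header *sh;
		unsigned long off;
		__u32 i;

		memset(&args, 0, sizeof(args));
		sk->tree_id = 0;		/* tree of the fd's inode */
		sk->max_objectid = (__u64)-1;
		sk->max_type = (__u32)-1;
		sk->max_offset = (__u64)-1;
		sk->max_transid = (__u64)-1;

		for (;;) {
			sk->nr_items = 4096;	/* kernel rewrites this */
			if (ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0 ||
			    sk->nr_items == 0)
				break;
			off = 0;
			for (i = 0; i < sk->nr_items; i++) {
				sh = (struct btrfs_ioctl_search_header *)
					(args.buf + off);
				printf("key %llu %u %llu, item len %u\n",
				       (unsigned long long)sh->objectid,
				       sh->type,
				       (unsigned long long)sh->offset,
				       sh->len);
				off += sizeof(*sh) + sh->len;
				sk->min_objectid = sh->objectid;
				sk->min_type = sh->type;
				sk->min_offset = sh->offset;
			}
			/* resume past the last key (offset bump only) */
			if (sk->min_offset == (__u64)-1)
				break;
			sk->min_offset++;
		}
	}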
94
33struct btrfs_ioctl_clone_range_args { 95struct btrfs_ioctl_clone_range_args {
34 __s64 src_fd; 96 __s64 src_fd;
35 __u64 src_offset, src_length; 97 __u64 src_offset, src_length;
36 __u64 dest_offset; 98 __u64 dest_offset;
37}; 99};
38 100
101/* flags for the defrag range ioctl */
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2
104
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default.
121 * Use 1 to say every single extent must be rewritten.
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
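For illustration, a minimal userspace call that recompresses a whole file could look like this sketch (fd is assumed to be an open descriptor on the file; the handler forces START_IO on when COMPRESS is set):

	struct btrfs_ioctl_defrag_range_args range;

	memset(&range, 0, sizeof(range));
	range.len = (__u64)-1;			/* whole file */
	range.flags = BTRFS_DEFRAG_RANGE_COMPRESS;

	if (ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range))
		perror("BTRFS_IOC_DEFRAG_RANGE");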
128
129struct btrfs_ioctl_space_info {
130 __u64 flags;
131 __u64 total_bytes;
132 __u64 used_bytes;
133};
134
135struct btrfs_ioctl_space_args {
136 __u64 space_slots;
137 __u64 total_spaces;
138 struct btrfs_ioctl_space_info spaces[0];
139};
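space_slots == 0 asks the kernel only for the number of slots, so the expected usage is a two-call protocol: one call to size the buffer, a second to fill it. A hypothetical sketch:

	static void print_spaces(int fd)
	{
		struct btrfs_ioctl_space_args count, *sargs;
		__u64 i;

		memset(&count, 0, sizeof(count));	/* slots == 0: count only */
		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, &count) < 0)
			return;

		sargs = malloc(sizeof(*sargs) + count.total_spaces *
			       sizeof(struct btrfs_ioctl_space_info));
		if (!sargs)
			return;
		sargs->space_slots = count.total_spaces;
		sargs->total_spaces = 0;

		if (ioctl(fd, BTRFS_IOC_SPACE_INFO, sargs) == 0)
			for (i = 0; i < sargs->total_spaces; i++)
				printf("flags 0x%llx total %llu used %llu\n",
				       (unsigned long long)sargs->spaces[i].flags,
				       (unsigned long long)sargs->spaces[i].total_bytes,
				       (unsigned long long)sargs->spaces[i].used_bytes);
		free(sargs);
	}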
140
39#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ 141#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
40 struct btrfs_ioctl_vol_args) 142 struct btrfs_ioctl_vol_args)
41#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ 143#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -67,4 +169,13 @@ struct btrfs_ioctl_clone_range_args {
67 struct btrfs_ioctl_vol_args) 169 struct btrfs_ioctl_vol_args)
68#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ 170#define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \
69 struct btrfs_ioctl_vol_args) 171 struct btrfs_ioctl_vol_args)
172#define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \
173 struct btrfs_ioctl_defrag_range_args)
174#define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \
175 struct btrfs_ioctl_search_args)
176#define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \
177 struct btrfs_ioctl_ino_lookup_args)
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args)
70#endif 181#endif
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 5c2a9e78a949..a8ffecd0b491 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -174,7 +174,6 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 if (!entry) 174 if (!entry)
175 return -ENOMEM; 175 return -ENOMEM;
176 176
177 mutex_lock(&tree->mutex);
178 entry->file_offset = file_offset; 177 entry->file_offset = file_offset;
179 entry->start = start; 178 entry->start = start;
180 entry->len = len; 179 entry->len = len;
@@ -190,16 +189,17 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
190 INIT_LIST_HEAD(&entry->list); 189 INIT_LIST_HEAD(&entry->list);
191 INIT_LIST_HEAD(&entry->root_extent_list); 190 INIT_LIST_HEAD(&entry->root_extent_list);
192 191
192 spin_lock(&tree->lock);
193 node = tree_insert(&tree->tree, file_offset, 193 node = tree_insert(&tree->tree, file_offset,
194 &entry->rb_node); 194 &entry->rb_node);
195 BUG_ON(node); 195 BUG_ON(node);
196 spin_unlock(&tree->lock);
196 197
197 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 198 spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
198 list_add_tail(&entry->root_extent_list, 199 list_add_tail(&entry->root_extent_list,
199 &BTRFS_I(inode)->root->fs_info->ordered_extents); 200 &BTRFS_I(inode)->root->fs_info->ordered_extents);
200 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); 201 spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
201 202
202 mutex_unlock(&tree->mutex);
203 BUG_ON(node); 203 BUG_ON(node);
204 return 0; 204 return 0;
205} 205}
@@ -216,9 +216,9 @@ int btrfs_add_ordered_sum(struct inode *inode,
216 struct btrfs_ordered_inode_tree *tree; 216 struct btrfs_ordered_inode_tree *tree;
217 217
218 tree = &BTRFS_I(inode)->ordered_tree; 218 tree = &BTRFS_I(inode)->ordered_tree;
219 mutex_lock(&tree->mutex); 219 spin_lock(&tree->lock);
220 list_add_tail(&sum->list, &entry->list); 220 list_add_tail(&sum->list, &entry->list);
221 mutex_unlock(&tree->mutex); 221 spin_unlock(&tree->lock);
222 return 0; 222 return 0;
223} 223}
224 224
@@ -232,15 +232,16 @@ int btrfs_add_ordered_sum(struct inode *inode,
232 * to make sure this function only returns 1 once for a given ordered extent. 232 * to make sure this function only returns 1 once for a given ordered extent.
233 */ 233 */
234int btrfs_dec_test_ordered_pending(struct inode *inode, 234int btrfs_dec_test_ordered_pending(struct inode *inode,
235 struct btrfs_ordered_extent **cached,
235 u64 file_offset, u64 io_size) 236 u64 file_offset, u64 io_size)
236{ 237{
237 struct btrfs_ordered_inode_tree *tree; 238 struct btrfs_ordered_inode_tree *tree;
238 struct rb_node *node; 239 struct rb_node *node;
239 struct btrfs_ordered_extent *entry; 240 struct btrfs_ordered_extent *entry = NULL;
240 int ret; 241 int ret;
241 242
242 tree = &BTRFS_I(inode)->ordered_tree; 243 tree = &BTRFS_I(inode)->ordered_tree;
243 mutex_lock(&tree->mutex); 244 spin_lock(&tree->lock);
244 node = tree_search(tree, file_offset); 245 node = tree_search(tree, file_offset);
245 if (!node) { 246 if (!node) {
246 ret = 1; 247 ret = 1;
@@ -264,7 +265,11 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
264 else 265 else
265 ret = 1; 266 ret = 1;
266out: 267out:
267 mutex_unlock(&tree->mutex); 268 if (!ret && cached && entry) {
269 *cached = entry;
270 atomic_inc(&entry->refs);
271 }
272 spin_unlock(&tree->lock);
268 return ret == 0; 273 return ret == 0;
269} 274}
270 275
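The new cached argument hands the ordered extent back to the caller, with a reference taken under tree->lock, once the pending count reaches zero, saving the caller a second tree lookup. The intended caller shape, as a sketch (finish_ordered_io() is a made-up stand-in for the real completion path):

	struct btrfs_ordered_extent *ordered = NULL;

	if (btrfs_dec_test_ordered_pending(inode, &ordered, start, len)) {
		finish_ordered_io(inode, ordered);	/* hypothetical */
		/* drop the reference taken for us under tree->lock */
		btrfs_put_ordered_extent(ordered);
	}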
@@ -291,7 +296,7 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
291 296
292/* 297/*
293 * remove an ordered extent from the tree. No references are dropped 298 * remove an ordered extent from the tree. No references are dropped
294 * and you must wake_up entry->wait. You must hold the tree mutex 299 * and you must wake_up entry->wait. You must hold the tree lock
295 * while you call this function. 300 * while you call this function.
296 */ 301 */
297static int __btrfs_remove_ordered_extent(struct inode *inode, 302static int __btrfs_remove_ordered_extent(struct inode *inode,
@@ -340,9 +345,9 @@ int btrfs_remove_ordered_extent(struct inode *inode,
340 int ret; 345 int ret;
341 346
342 tree = &BTRFS_I(inode)->ordered_tree; 347 tree = &BTRFS_I(inode)->ordered_tree;
343 mutex_lock(&tree->mutex); 348 spin_lock(&tree->lock);
344 ret = __btrfs_remove_ordered_extent(inode, entry); 349 ret = __btrfs_remove_ordered_extent(inode, entry);
345 mutex_unlock(&tree->mutex); 350 spin_unlock(&tree->lock);
346 wake_up(&entry->wait); 351 wake_up(&entry->wait);
347 352
348 return ret; 353 return ret;
@@ -567,7 +572,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
567 struct btrfs_ordered_extent *entry = NULL; 572 struct btrfs_ordered_extent *entry = NULL;
568 573
569 tree = &BTRFS_I(inode)->ordered_tree; 574 tree = &BTRFS_I(inode)->ordered_tree;
570 mutex_lock(&tree->mutex); 575 spin_lock(&tree->lock);
571 node = tree_search(tree, file_offset); 576 node = tree_search(tree, file_offset);
572 if (!node) 577 if (!node)
573 goto out; 578 goto out;
@@ -578,7 +583,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
578 if (entry) 583 if (entry)
579 atomic_inc(&entry->refs); 584 atomic_inc(&entry->refs);
580out: 585out:
581 mutex_unlock(&tree->mutex); 586 spin_unlock(&tree->lock);
582 return entry; 587 return entry;
583} 588}
584 589
@@ -594,7 +599,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
594 struct btrfs_ordered_extent *entry = NULL; 599 struct btrfs_ordered_extent *entry = NULL;
595 600
596 tree = &BTRFS_I(inode)->ordered_tree; 601 tree = &BTRFS_I(inode)->ordered_tree;
597 mutex_lock(&tree->mutex); 602 spin_lock(&tree->lock);
598 node = tree_search(tree, file_offset); 603 node = tree_search(tree, file_offset);
599 if (!node) 604 if (!node)
600 goto out; 605 goto out;
@@ -602,7 +607,7 @@ btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset)
602 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); 607 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
603 atomic_inc(&entry->refs); 608 atomic_inc(&entry->refs);
604out: 609out:
605 mutex_unlock(&tree->mutex); 610 spin_unlock(&tree->lock);
606 return entry; 611 return entry;
607} 612}
608 613
@@ -629,7 +634,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
629 else 634 else
630 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); 635 offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
631 636
632 mutex_lock(&tree->mutex); 637 spin_lock(&tree->lock);
633 disk_i_size = BTRFS_I(inode)->disk_i_size; 638 disk_i_size = BTRFS_I(inode)->disk_i_size;
634 639
635 /* truncate file */ 640 /* truncate file */
@@ -735,7 +740,7 @@ out:
735 */ 740 */
736 if (ordered) 741 if (ordered)
737 __btrfs_remove_ordered_extent(inode, ordered); 742 __btrfs_remove_ordered_extent(inode, ordered);
738 mutex_unlock(&tree->mutex); 743 spin_unlock(&tree->lock);
739 if (ordered) 744 if (ordered)
740 wake_up(&ordered->wait); 745 wake_up(&ordered->wait);
741 return ret; 746 return ret;
@@ -762,7 +767,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
762 if (!ordered) 767 if (!ordered)
763 return 1; 768 return 1;
764 769
765 mutex_lock(&tree->mutex); 770 spin_lock(&tree->lock);
766 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { 771 list_for_each_entry_reverse(ordered_sum, &ordered->list, list) {
767 if (disk_bytenr >= ordered_sum->bytenr) { 772 if (disk_bytenr >= ordered_sum->bytenr) {
768 num_sectors = ordered_sum->len / sectorsize; 773 num_sectors = ordered_sum->len / sectorsize;
@@ -777,7 +782,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr,
777 } 782 }
778 } 783 }
779out: 784out:
780 mutex_unlock(&tree->mutex); 785 spin_unlock(&tree->lock);
781 btrfs_put_ordered_extent(ordered); 786 btrfs_put_ordered_extent(ordered);
782 return ret; 787 return ret;
783} 788}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 9116c6d0c5a9..c82f76a9f040 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -21,7 +21,7 @@
21 21
22/* one of these per inode */ 22/* one of these per inode */
23struct btrfs_ordered_inode_tree { 23struct btrfs_ordered_inode_tree {
24 struct mutex mutex; 24 spinlock_t lock;
25 struct rb_root tree; 25 struct rb_root tree;
26 struct rb_node *last; 26 struct rb_node *last;
27}; 27};
@@ -128,7 +128,7 @@ static inline int btrfs_ordered_sum_size(struct btrfs_root *root,
128static inline void 128static inline void
129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) 129btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
130{ 130{
131 mutex_init(&t->mutex); 131 spin_lock_init(&t->lock);
132 t->tree = RB_ROOT; 132 t->tree = RB_ROOT;
133 t->last = NULL; 133 t->last = NULL;
134} 134}
@@ -137,7 +137,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
137int btrfs_remove_ordered_extent(struct inode *inode, 137int btrfs_remove_ordered_extent(struct inode *inode,
138 struct btrfs_ordered_extent *entry); 138 struct btrfs_ordered_extent *entry);
139int btrfs_dec_test_ordered_pending(struct inode *inode, 139int btrfs_dec_test_ordered_pending(struct inode *inode,
140 u64 file_offset, u64 io_size); 140 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size);
141int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
142 u64 start, u64 len, u64 disk_len, int type); 143
143int btrfs_add_ordered_sum(struct inode *inode, 144int btrfs_add_ordered_sum(struct inode *inode,
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 0109e5606bad..0b23942cbc0d 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2659,7 +2659,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2659 EXTENT_BOUNDARY, GFP_NOFS); 2659 EXTENT_BOUNDARY, GFP_NOFS);
2660 nr++; 2660 nr++;
2661 } 2661 }
2662 btrfs_set_extent_delalloc(inode, page_start, page_end); 2662 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2663 2663
2664 set_page_dirty(page); 2664 set_page_dirty(page);
2665 dirty_page++; 2665 dirty_page++;
@@ -3487,7 +3487,7 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3487 key.objectid = objectid; 3487 key.objectid = objectid;
3488 key.type = BTRFS_INODE_ITEM_KEY; 3488 key.type = BTRFS_INODE_ITEM_KEY;
3489 key.offset = 0; 3489 key.offset = 0;
3490 inode = btrfs_iget(root->fs_info->sb, &key, root); 3490 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode)); 3491 BUG_ON(IS_ERR(inode) || is_bad_inode(inode));
3492 BTRFS_I(inode)->index_cnt = group->key.objectid; 3492 BTRFS_I(inode)->index_cnt = group->key.objectid;
3493 3493
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f8b4521de907..9ac612e6ca60 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -63,10 +63,10 @@ static void btrfs_put_super(struct super_block *sb)
63} 63}
64 64
65enum { 65enum {
66 Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, 66 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
67 Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, 67 Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start,
68 Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, 68 Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool,
69 Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, 69 Opt_noacl, Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio,
70 Opt_flushoncommit, 70 Opt_flushoncommit,
71 Opt_discard, Opt_err, 71 Opt_discard, Opt_err,
72}; 72};
@@ -74,6 +74,7 @@ enum {
74static match_table_t tokens = { 74static match_table_t tokens = {
75 {Opt_degraded, "degraded"}, 75 {Opt_degraded, "degraded"},
76 {Opt_subvol, "subvol=%s"}, 76 {Opt_subvol, "subvol=%s"},
77 {Opt_subvolid, "subvolid=%d"},
77 {Opt_device, "device=%s"}, 78 {Opt_device, "device=%s"},
78 {Opt_nodatasum, "nodatasum"}, 79 {Opt_nodatasum, "nodatasum"},
79 {Opt_nodatacow, "nodatacow"}, 80 {Opt_nodatacow, "nodatacow"},
@@ -95,31 +96,6 @@ static match_table_t tokens = {
95 {Opt_err, NULL}, 96 {Opt_err, NULL},
96}; 97};
97 98
98u64 btrfs_parse_size(char *str)
99{
100 u64 res;
101 int mult = 1;
102 char *end;
103 char last;
104
105 res = simple_strtoul(str, &end, 10);
106
107 last = end[0];
108 if (isalpha(last)) {
109 last = tolower(last);
110 switch (last) {
111 case 'g':
112 mult *= 1024;
113 case 'm':
114 mult *= 1024;
115 case 'k':
116 mult *= 1024;
117 }
118 res = res * mult;
119 }
120 return res;
121}
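The helper above is dropped in favor of the generic memparse() from lib/cmdline.c, which parses a number with an optional k/m/g (and larger) suffix and is roughly equivalent for these mount options; for example:

	u64 size = memparse("256k", NULL);	/* 262144, as btrfs_parse_size("256k") */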
122
123/* 99/*
124 * Regular mount options parser. Everything that is needed only when 100 * Regular mount options parser. Everything that is needed only when
125 * reading in a new superblock is parsed here. 101 * reading in a new superblock is parsed here.
@@ -157,6 +133,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
157 btrfs_set_opt(info->mount_opt, DEGRADED); 133 btrfs_set_opt(info->mount_opt, DEGRADED);
158 break; 134 break;
159 case Opt_subvol: 135 case Opt_subvol:
136 case Opt_subvolid:
160 case Opt_device: 137 case Opt_device:
161 /* 138 /*
162 * These are parsed by btrfs_parse_early_options 139 * These are parsed by btrfs_parse_early_options
@@ -214,7 +191,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
214 case Opt_max_extent: 191 case Opt_max_extent:
215 num = match_strdup(&args[0]); 192 num = match_strdup(&args[0]);
216 if (num) { 193 if (num) {
217 info->max_extent = btrfs_parse_size(num); 194 info->max_extent = memparse(num, NULL);
218 kfree(num); 195 kfree(num);
219 196
220 info->max_extent = max_t(u64, 197 info->max_extent = max_t(u64,
@@ -226,7 +203,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
226 case Opt_max_inline: 203 case Opt_max_inline:
227 num = match_strdup(&args[0]); 204 num = match_strdup(&args[0]);
228 if (num) { 205 if (num) {
229 info->max_inline = btrfs_parse_size(num); 206 info->max_inline = memparse(num, NULL);
230 kfree(num); 207 kfree(num);
231 208
232 if (info->max_inline) { 209 if (info->max_inline) {
@@ -241,7 +218,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
241 case Opt_alloc_start: 218 case Opt_alloc_start:
242 num = match_strdup(&args[0]); 219 num = match_strdup(&args[0]);
243 if (num) { 220 if (num) {
244 info->alloc_start = btrfs_parse_size(num); 221 info->alloc_start = memparse(num, NULL);
245 kfree(num); 222 kfree(num);
246 printk(KERN_INFO 223 printk(KERN_INFO
247 "btrfs: allocations start at %llu\n", 224 "btrfs: allocations start at %llu\n",
@@ -292,12 +269,13 @@ out:
292 * only when we need to allocate a new super block. 269 * only when we need to allocate a new super block.
293 */ 270 */
294static int btrfs_parse_early_options(const char *options, fmode_t flags, 271static int btrfs_parse_early_options(const char *options, fmode_t flags,
295 void *holder, char **subvol_name, 272 void *holder, char **subvol_name, u64 *subvol_objectid,
296 struct btrfs_fs_devices **fs_devices) 273 struct btrfs_fs_devices **fs_devices)
297{ 274{
298 substring_t args[MAX_OPT_ARGS]; 275 substring_t args[MAX_OPT_ARGS];
299 char *opts, *p; 276 char *opts, *p;
300 int error = 0; 277 int error = 0;
278 int intarg;
301 279
302 if (!options) 280 if (!options)
303 goto out; 281 goto out;
@@ -320,6 +298,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
320 case Opt_subvol: 298 case Opt_subvol:
321 *subvol_name = match_strdup(&args[0]); 299 *subvol_name = match_strdup(&args[0]);
322 break; 300 break;
301 case Opt_subvolid:
302 intarg = 0;
303 error = match_int(&args[0], &intarg);
304 if (!error) {
305 /* we want the original fs_tree */
306 if (!intarg)
307 *subvol_objectid =
308 BTRFS_FS_TREE_OBJECTID;
309 else
310 *subvol_objectid = intarg;
311 }
312 break;
323 case Opt_device: 313 case Opt_device:
324 error = btrfs_scan_one_device(match_strdup(&args[0]), 314 error = btrfs_scan_one_device(match_strdup(&args[0]),
325 flags, holder, fs_devices); 315 flags, holder, fs_devices);
@@ -347,6 +337,110 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
347 return error; 337 return error;
348} 338}
349 339
340static struct dentry *get_default_root(struct super_block *sb,
341 u64 subvol_objectid)
342{
343 struct btrfs_root *root = sb->s_fs_info;
344 struct btrfs_root *new_root;
345 struct btrfs_dir_item *di;
346 struct btrfs_path *path;
347 struct btrfs_key location;
348 struct inode *inode;
349 struct dentry *dentry;
350 u64 dir_id;
351 int new = 0;
352
353 /*
354 * We have a specific subvol we want to mount, just setup location and
355 * go look up the root.
356 */
357 if (subvol_objectid) {
358 location.objectid = subvol_objectid;
359 location.type = BTRFS_ROOT_ITEM_KEY;
360 location.offset = (u64)-1;
361 goto find_root;
362 }
363
364 path = btrfs_alloc_path();
365 if (!path)
366 return ERR_PTR(-ENOMEM);
367 path->leave_spinning = 1;
368
369 /*
370 * Find the "default" dir item which points to the root item that we
371 * will mount by default if we haven't been given a specific subvolume
372 * to mount.
373 */
374 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
375 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
376 if (!di) {
377 /*
378 * Ok the default dir item isn't there. This is weird since
379 * it's always been there, but don't freak out, just try and
380	 * mount the root-most subvolume.
381 */
382 btrfs_free_path(path);
383 dir_id = BTRFS_FIRST_FREE_OBJECTID;
384 new_root = root->fs_info->fs_root;
385 goto setup_root;
386 }
387
388 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
389 btrfs_free_path(path);
390
391find_root:
392 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
393 if (IS_ERR(new_root))
394 return ERR_PTR(PTR_ERR(new_root));
395
396 if (btrfs_root_refs(&new_root->root_item) == 0)
397 return ERR_PTR(-ENOENT);
398
399 dir_id = btrfs_root_dirid(&new_root->root_item);
400setup_root:
401 location.objectid = dir_id;
402 location.type = BTRFS_INODE_ITEM_KEY;
403 location.offset = 0;
404
405 inode = btrfs_iget(sb, &location, new_root, &new);
406 if (!inode)
407 return ERR_PTR(-ENOMEM);
408
409 /*
410	 * If we're just mounting the root-most subvol, put the inode and return
411 * a reference to the dentry. We will have already gotten a reference
412 * to the inode in btrfs_fill_super so we're good to go.
413 */
414 if (!new && sb->s_root->d_inode == inode) {
415 iput(inode);
416 return dget(sb->s_root);
417 }
418
419 if (new) {
420 const struct qstr name = { .name = "/", .len = 1 };
421
422 /*
423 * New inode, we need to make the dentry a sibling of s_root so
424 * everything gets cleaned up properly on unmount.
425 */
426 dentry = d_alloc(sb->s_root, &name);
427 if (!dentry) {
428 iput(inode);
429 return ERR_PTR(-ENOMEM);
430 }
431 d_splice_alias(inode, dentry);
432 } else {
433 /*
434 * We found the inode in cache, just find a dentry for it and
435 * put the reference to the inode we just got.
436 */
437 dentry = d_find_alias(inode);
438 iput(inode);
439 }
440
441 return dentry;
442}
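With the subvolid option wired up above, a specific subvolume can be mounted directly, without a path lookup under the default root. A hypothetical userspace sketch (the device, mount point, and subvolume id are made up):

	#include <sys/mount.h>

	int mount_subvol(void)
	{
		return mount("/dev/sdb", "/mnt", "btrfs", 0, "subvolid=257");
	}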
443
350static int btrfs_fill_super(struct super_block *sb, 444static int btrfs_fill_super(struct super_block *sb,
351 struct btrfs_fs_devices *fs_devices, 445 struct btrfs_fs_devices *fs_devices,
352 void *data, int silent) 446 void *data, int silent)
@@ -380,7 +474,7 @@ static int btrfs_fill_super(struct super_block *sb,
380 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 474 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
381 key.type = BTRFS_INODE_ITEM_KEY; 475 key.type = BTRFS_INODE_ITEM_KEY;
382 key.offset = 0; 476 key.offset = 0;
383 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root); 477 inode = btrfs_iget(sb, &key, tree_root->fs_info->fs_root, NULL);
384 if (IS_ERR(inode)) { 478 if (IS_ERR(inode)) {
385 err = PTR_ERR(inode); 479 err = PTR_ERR(inode);
386 goto fail_close; 480 goto fail_close;
@@ -392,12 +486,6 @@ static int btrfs_fill_super(struct super_block *sb,
392 err = -ENOMEM; 486 err = -ENOMEM;
393 goto fail_close; 487 goto fail_close;
394 } 488 }
395#if 0
396 /* this does the super kobj at the same time */
397 err = btrfs_sysfs_add_super(tree_root->fs_info);
398 if (err)
399 goto fail_close;
400#endif
401 489
402 sb->s_root = root_dentry; 490 sb->s_root = root_dentry;
403 491
@@ -489,19 +577,22 @@ static int btrfs_test_super(struct super_block *s, void *data)
489static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 577static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
490 const char *dev_name, void *data, struct vfsmount *mnt) 578 const char *dev_name, void *data, struct vfsmount *mnt)
491{ 579{
492 char *subvol_name = NULL;
493 struct block_device *bdev = NULL; 580 struct block_device *bdev = NULL;
494 struct super_block *s; 581 struct super_block *s;
495 struct dentry *root; 582 struct dentry *root;
496 struct btrfs_fs_devices *fs_devices = NULL; 583 struct btrfs_fs_devices *fs_devices = NULL;
497 fmode_t mode = FMODE_READ; 584 fmode_t mode = FMODE_READ;
585 char *subvol_name = NULL;
586 u64 subvol_objectid = 0;
498 int error = 0; 587 int error = 0;
588 int found = 0;
499 589
500 if (!(flags & MS_RDONLY)) 590 if (!(flags & MS_RDONLY))
501 mode |= FMODE_WRITE; 591 mode |= FMODE_WRITE;
502 592
503 error = btrfs_parse_early_options(data, mode, fs_type, 593 error = btrfs_parse_early_options(data, mode, fs_type,
504 &subvol_name, &fs_devices); 594 &subvol_name, &subvol_objectid,
595 &fs_devices);
505 if (error) 596 if (error)
506 return error; 597 return error;
507 598
@@ -530,6 +621,7 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
530 goto error_close_devices; 621 goto error_close_devices;
531 } 622 }
532 623
624 found = 1;
533 btrfs_close_devices(fs_devices); 625 btrfs_close_devices(fs_devices);
534 } else { 626 } else {
535 char b[BDEVNAME_SIZE]; 627 char b[BDEVNAME_SIZE];
@@ -547,25 +639,35 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
547 s->s_flags |= MS_ACTIVE; 639 s->s_flags |= MS_ACTIVE;
548 } 640 }
549 641
550 if (!strcmp(subvol_name, ".")) 642 root = get_default_root(s, subvol_objectid);
551 root = dget(s->s_root); 643 if (IS_ERR(root)) {
552 else { 644 error = PTR_ERR(root);
553 mutex_lock(&s->s_root->d_inode->i_mutex); 645 deactivate_locked_super(s);
554 root = lookup_one_len(subvol_name, s->s_root, 646 goto error;
647 }
648 /* if they gave us a subvolume name bind mount into that */
649 if (strcmp(subvol_name, ".")) {
650 struct dentry *new_root;
651 mutex_lock(&root->d_inode->i_mutex);
652 new_root = lookup_one_len(subvol_name, root,
555 strlen(subvol_name)); 653 strlen(subvol_name));
556 mutex_unlock(&s->s_root->d_inode->i_mutex); 654 mutex_unlock(&root->d_inode->i_mutex);
557 655
558 if (IS_ERR(root)) { 656 if (IS_ERR(new_root)) {
559 deactivate_locked_super(s); 657 deactivate_locked_super(s);
560 error = PTR_ERR(root); 658 error = PTR_ERR(new_root);
561 goto error_free_subvol_name; 659 dput(root);
660 goto error_close_devices;
562 } 661 }
563 if (!root->d_inode) { 662 if (!new_root->d_inode) {
564 dput(root); 663 dput(root);
664 dput(new_root);
565 deactivate_locked_super(s); 665 deactivate_locked_super(s);
566 error = -ENXIO; 666 error = -ENXIO;
567 goto error_free_subvol_name; 667 goto error_close_devices;
568 } 668 }
669 dput(root);
670 root = new_root;
569 } 671 }
570 672
571 mnt->mnt_sb = s; 673 mnt->mnt_sb = s;
@@ -580,6 +682,7 @@ error_close_devices:
580 btrfs_close_devices(fs_devices); 682 btrfs_close_devices(fs_devices);
581error_free_subvol_name: 683error_free_subvol_name:
582 kfree(subvol_name); 684 kfree(subvol_name);
685error:
583 return error; 686 return error;
584} 687}
585 688
@@ -624,14 +727,37 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
624{ 727{
625 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 728 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
626 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 729 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
730 struct list_head *head = &root->fs_info->space_info;
731 struct btrfs_space_info *found;
732 u64 total_used = 0;
733 u64 data_used = 0;
627 int bits = dentry->d_sb->s_blocksize_bits; 734 int bits = dentry->d_sb->s_blocksize_bits;
628 __be32 *fsid = (__be32 *)root->fs_info->fsid; 735 __be32 *fsid = (__be32 *)root->fs_info->fsid;
629 736
737 rcu_read_lock();
738 list_for_each_entry_rcu(found, head, list) {
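		/*
		 * mirrored profiles (DUP/RAID1/RAID10) keep two raw copies
		 * of every byte used, so their usage is counted here once
		 * more on top of the unconditional accounting below
		 */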
739 if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
740 BTRFS_BLOCK_GROUP_RAID10|
741 BTRFS_BLOCK_GROUP_RAID1)) {
742 total_used += found->bytes_used;
743 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
744 data_used += found->bytes_used;
745 else
746 data_used += found->total_bytes;
747 }
748
749 total_used += found->bytes_used;
750 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
751 data_used += found->bytes_used;
752 else
753 data_used += found->total_bytes;
754 }
755 rcu_read_unlock();
756
630 buf->f_namelen = BTRFS_NAME_LEN; 757 buf->f_namelen = BTRFS_NAME_LEN;
631 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 758 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
632 buf->f_bfree = buf->f_blocks - 759 buf->f_bfree = buf->f_blocks - (total_used >> bits);
633 (btrfs_super_bytes_used(disk_super) >> bits); 760 buf->f_bavail = buf->f_blocks - (data_used >> bits);
634 buf->f_bavail = buf->f_bfree;
635 buf->f_bsize = dentry->d_sb->s_blocksize; 761 buf->f_bsize = dentry->d_sb->s_blocksize;
636 buf->f_type = BTRFS_SUPER_MAGIC; 762 buf->f_type = BTRFS_SUPER_MAGIC;
637 763
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2a36e236a492..2d654c1c794d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -997,13 +997,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
997 997
998 mutex_unlock(&root->fs_info->trans_mutex); 998 mutex_unlock(&root->fs_info->trans_mutex);
999 999
1000 if (flush_on_commit) { 1000 if (flush_on_commit || snap_pending) {
1001 btrfs_start_delalloc_inodes(root, 1); 1001 btrfs_start_delalloc_inodes(root, 1);
1002 ret = btrfs_wait_ordered_extents(root, 0, 1); 1002 ret = btrfs_wait_ordered_extents(root, 0, 1);
1003 BUG_ON(ret); 1003 BUG_ON(ret);
1004 } else if (snap_pending) {
1005 ret = btrfs_wait_ordered_extents(root, 0, 1);
1006 BUG_ON(ret);
1007 } 1004 }
1008 1005
1009 /* 1006 /*
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4a9434b622ec..1255fcc8ade5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -445,7 +445,7 @@ static noinline struct inode *read_one_inode(struct btrfs_root *root,
445 key.objectid = objectid; 445 key.objectid = objectid;
446 key.type = BTRFS_INODE_ITEM_KEY; 446 key.type = BTRFS_INODE_ITEM_KEY;
447 key.offset = 0; 447 key.offset = 0;
448 inode = btrfs_iget(root->fs_info->sb, &key, root); 448 inode = btrfs_iget(root->fs_info->sb, &key, root, NULL);
449 if (IS_ERR(inode)) { 449 if (IS_ERR(inode)) {
450 inode = NULL; 450 inode = NULL;
451 } else if (is_bad_inode(inode)) { 451 } else if (is_bad_inode(inode)) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 41ecbb2347f2..9df8e3f1ccab 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -256,13 +256,13 @@ loop_lock:
256 wake_up(&fs_info->async_submit_wait); 256 wake_up(&fs_info->async_submit_wait);
257 257
258 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 258 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
259 submit_bio(cur->bi_rw, cur);
260 num_run++;
261 batch_run++;
262 259
263 if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) 260 if (bio_rw_flagged(cur, BIO_RW_SYNCIO))
264 num_sync_run++; 261 num_sync_run++;
265 262
263 submit_bio(cur->bi_rw, cur);
264 num_run++;
265 batch_run++;
266 if (need_resched()) { 266 if (need_resched()) {
267 if (num_sync_run) { 267 if (num_sync_run) {
268 blk_run_backing_dev(bdi, NULL); 268 blk_run_backing_dev(bdi, NULL);
@@ -325,16 +325,6 @@ loop_lock:
325 num_sync_run = 0; 325 num_sync_run = 0;
326 blk_run_backing_dev(bdi, NULL); 326 blk_run_backing_dev(bdi, NULL);
327 } 327 }
328
329 cond_resched();
330 if (again)
331 goto loop;
332
333 spin_lock(&device->io_lock);
334 if (device->pending_bios.head || device->pending_sync_bios.head)
335 goto loop_lock;
336 spin_unlock(&device->io_lock);
337
338 /* 328 /*
339 * IO has already been through a long path to get here. Checksumming, 329 * IO has already been through a long path to get here. Checksumming,
340 * async helper threads, perhaps compression. We've done a pretty 330 * async helper threads, perhaps compression. We've done a pretty
@@ -346,6 +336,16 @@ loop_lock:
346 * cared about found its way down here. 336 * cared about found its way down here.
347 */ 337 */
348 blk_run_backing_dev(bdi, NULL); 338 blk_run_backing_dev(bdi, NULL);
339
340 cond_resched();
341 if (again)
342 goto loop;
343
344 spin_lock(&device->io_lock);
345 if (device->pending_bios.head || device->pending_sync_bios.head)
346 goto loop_lock;
347 spin_unlock(&device->io_lock);
348
349done: 349done:
350 return 0; 350 return 0;
351} 351}
@@ -365,6 +365,7 @@ static noinline int device_list_add(const char *path,
365 struct btrfs_device *device; 365 struct btrfs_device *device;
366 struct btrfs_fs_devices *fs_devices; 366 struct btrfs_fs_devices *fs_devices;
367 u64 found_transid = btrfs_super_generation(disk_super); 367 u64 found_transid = btrfs_super_generation(disk_super);
368 char *name;
368 369
369 fs_devices = find_fsid(disk_super->fsid); 370 fs_devices = find_fsid(disk_super->fsid);
370 if (!fs_devices) { 371 if (!fs_devices) {
@@ -411,6 +412,12 @@ static noinline int device_list_add(const char *path,
411 412
412 device->fs_devices = fs_devices; 413 device->fs_devices = fs_devices;
413 fs_devices->num_devices++; 414 fs_devices->num_devices++;
415 } else if (strcmp(device->name, path)) {
416 name = kstrdup(path, GFP_NOFS);
417 if (!name)
418 return -ENOMEM;
419 kfree(device->name);
420 device->name = name;
414 } 421 }
415 422
416 if (found_transid > fs_devices->latest_trans) { 423 if (found_transid > fs_devices->latest_trans) {
@@ -592,7 +599,7 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
592 goto error_close; 599 goto error_close;
593 600
594 disk_super = (struct btrfs_super_block *)bh->b_data; 601 disk_super = (struct btrfs_super_block *)bh->b_data;
595 devid = le64_to_cpu(disk_super->dev_item.devid); 602 devid = btrfs_stack_device_id(&disk_super->dev_item);
596 if (devid != device->devid) 603 if (devid != device->devid)
597 goto error_brelse; 604 goto error_brelse;
598 605
@@ -694,7 +701,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
694 goto error_close; 701 goto error_close;
695 } 702 }
696 disk_super = (struct btrfs_super_block *)bh->b_data; 703 disk_super = (struct btrfs_super_block *)bh->b_data;
697 devid = le64_to_cpu(disk_super->dev_item.devid); 704 devid = btrfs_stack_device_id(&disk_super->dev_item);
698 transid = btrfs_super_generation(disk_super); 705 transid = btrfs_super_generation(disk_super);
699 if (disk_super->label[0]) 706 if (disk_super->label[0])
700 printk(KERN_INFO "device label %s ", disk_super->label); 707 printk(KERN_INFO "device label %s ", disk_super->label);
@@ -1187,7 +1194,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1187 goto error_close; 1194 goto error_close;
1188 } 1195 }
1189 disk_super = (struct btrfs_super_block *)bh->b_data; 1196 disk_super = (struct btrfs_super_block *)bh->b_data;
1190 devid = le64_to_cpu(disk_super->dev_item.devid); 1197 devid = btrfs_stack_device_id(&disk_super->dev_item);
1191 dev_uuid = disk_super->dev_item.uuid; 1198 dev_uuid = disk_super->dev_item.uuid;
1192 device = btrfs_find_device(root, devid, dev_uuid, 1199 device = btrfs_find_device(root, devid, dev_uuid,
1193 disk_super->fsid); 1200 disk_super->fsid);
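[Editorial aside, not part of the patch: the device_list_add() hunk above follows the usual allocate-then-swap rename pattern, so an allocation failure leaves the old device->name intact. A minimal userspace analogue; update_name is an invented helper:]

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int update_name(char **namep, const char *path)
{
	char *name = strdup(path);	/* kstrdup(path, GFP_NOFS) in the patch */
	if (!name)
		return -1;		/* -ENOMEM in the kernel */
	free(*namep);			/* kfree(device->name) */
	*namep = name;
	return 0;
}

int main(void)
{
	char *name = strdup("/dev/sdb1");
	update_name(&name, "/dev/sdc1");	/* device rescanned at a new path */
	printf("device name now %s\n", name);
	free(name);
	return 0;
}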
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 000000000000..04b8280582a9
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
1config CEPH_FS
2 tristate "Ceph distributed file system (EXPERIMENTAL)"
3 depends on INET && EXPERIMENTAL
4 select LIBCRC32C
5	select CRYPTO_AES
6 help
7 Choose Y or M here to include support for mounting the
8 experimental Ceph distributed file system. Ceph is an extremely
9 scalable file system designed to provide high performance,
10 reliable access to petabytes of storage.
11
12 More information at http://ceph.newdream.net/.
13
14 If unsure, say N.
15
16config CEPH_FS_PRETTYDEBUG
17 bool "Include file:line in ceph debug output"
18 depends on CEPH_FS
19 default n
20 help
21 If you say Y here, debug output will include a filename and
22	  line to aid debugging. This increases kernel size and slows
23 execution slightly when debug call sites are enabled (e.g.,
24 via CONFIG_DYNAMIC_DEBUG).
25
26 If unsure, say N.
27
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 000000000000..6a660e610be8
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
1#
2# Makefile for CEPH filesystem.
3#
4
5ifneq ($(KERNELRELEASE),)
6
7obj-$(CONFIG_CEPH_FS) += ceph.o
8
9ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
10 export.o caps.o snap.o xattr.o \
11 messenger.o msgpool.o buffer.o pagelist.o \
12 mds_client.o mdsmap.o \
13 mon_client.o \
14 osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
15 debugfs.o \
16 auth.o auth_none.o \
17 crypto.o armor.o \
18 auth_x.o \
19 ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
20
21else
22#Otherwise we were called directly from the command
23# line; invoke the kernel build system.
24
25KERNELDIR ?= /lib/modules/$(shell uname -r)/build
26PWD := $(shell pwd)
27
28default: all
29
30all:
31 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
32
33modules_install:
34 $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
35
36clean:
37 $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
38
39endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 000000000000..18352fab37c0
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
1#
2# The following files are shared by (and manually synchronized
3# between) the Ceph userland and kernel client.
4#
5# userland kernel
6src/include/ceph_fs.h fs/ceph/ceph_fs.h
7src/include/ceph_fs.cc fs/ceph/ceph_fs.c
8src/include/msgr.h fs/ceph/msgr.h
9src/include/rados.h fs/ceph/rados.h
10src/include/ceph_strings.cc fs/ceph/ceph_strings.c
11src/include/ceph_frag.h fs/ceph/ceph_frag.h
12src/include/ceph_frag.cc fs/ceph/ceph_frag.c
13src/include/ceph_hash.h fs/ceph/ceph_hash.h
14src/include/ceph_hash.cc fs/ceph/ceph_hash.c
15src/crush/crush.c fs/ceph/crush/crush.c
16src/crush/crush.h fs/ceph/crush/crush.h
17src/crush/mapper.c fs/ceph/crush/mapper.c
18src/crush/mapper.h fs/ceph/crush/mapper.h
19src/crush/hash.h fs/ceph/crush/hash.h
20src/crush/hash.c fs/ceph/crush/hash.c
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 000000000000..23bb0ceabe31
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1188 @@
1#include "ceph_debug.h"
2
3#include <linux/backing-dev.h>
4#include <linux/fs.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/writeback.h> /* generic_writepages */
8#include <linux/pagevec.h>
9#include <linux/task_io_accounting_ops.h>
10
11#include "super.h"
12#include "osd_client.h"
13
14/*
15 * Ceph address space ops.
16 *
17 * There are a few funny things going on here.
18 *
19 * The page->private field is used to reference a struct
20 * ceph_snap_context for _every_ dirty page. This indicates which
21 * snapshot the page was logically dirtied in, and thus which snap
22 * context needs to be associated with the osd write during writeback.
23 *
24 * Similarly, struct ceph_inode_info maintains a set of counters to
25 * count dirty pages on the inode. In the absence of snapshots,
26 * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
27 *
28 * When a snapshot is taken (that is, when the client receives
29 * notification that a snapshot was taken), each inode with caps and
30 * with dirty pages (dirty pages implies there is a cap) gets a new
31 * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
32 * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
33 * moved to capsnap->dirty. (Unless a sync write is currently in
34 * progress. In that case, the capsnap is said to be "pending", new
35 * writes cannot start, and the capsnap isn't "finalized" until the
36 * write completes (or fails) and a final size/mtime for the inode for
37 * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
38 *
39 * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
40 * we look for the first capsnap in i_cap_snaps and write out pages in
41 * that snap context _only_. Then we move on to the next capsnap,
42 * eventually reaching the "live" or "head" context (i.e., pages that
43 * are not yet snapped) and are writing the most recently dirtied
44 * pages.
45 *
46 * Invalidate and so forth must take care to ensure the dirty page
47 * accounting is preserved.
48 */
49
50#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
51#define CONGESTION_OFF_THRESH(congestion_kb) \
52 (CONGESTION_ON_THRESH(congestion_kb) - \
53 (CONGESTION_ON_THRESH(congestion_kb) >> 2))
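/*
 * [Editorial note, not in the original:] a worked example of the
 * hysteresis above. Assuming 4K pages (PAGE_SHIFT == 12) and
 * congestion_kb == 8192, CONGESTION_ON_THRESH is 8192 >> 2 = 2048
 * pages and CONGESTION_OFF_THRESH is 2048 - (2048 >> 2) = 1536 pages,
 * so the bdi is flagged congested at 2048 dirty ceph pages and the
 * flag is only cleared once writeback drains to 75% of that level.
 */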
54
55
56
57/*
58 * Dirty a page. Optimistically adjust accounting, on the assumption
59 * that we won't race with invalidate. If we do, readjust.
60 */
61static int ceph_set_page_dirty(struct page *page)
62{
63 struct address_space *mapping = page->mapping;
64 struct inode *inode;
65 struct ceph_inode_info *ci;
66 int undo = 0;
67 struct ceph_snap_context *snapc;
68
69 if (unlikely(!mapping))
70 return !TestSetPageDirty(page);
71
72 if (TestSetPageDirty(page)) {
73 dout("%p set_page_dirty %p idx %lu -- already dirty\n",
74 mapping->host, page, page->index);
75 return 0;
76 }
77
78 inode = mapping->host;
79 ci = ceph_inode(inode);
80
81 /*
82 * Note that we're grabbing a snapc ref here without holding
83 * any locks!
84 */
85 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
86
87 /* dirty the head */
88 spin_lock(&inode->i_lock);
89 if (ci->i_wrbuffer_ref_head == 0)
90 ci->i_head_snapc = ceph_get_snap_context(snapc);
91 ++ci->i_wrbuffer_ref_head;
92 if (ci->i_wrbuffer_ref == 0)
93 igrab(inode);
94 ++ci->i_wrbuffer_ref;
95 dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
96 "snapc %p seq %lld (%d snaps)\n",
97 mapping->host, page, page->index,
98 ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
99 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
100 snapc, snapc->seq, snapc->num_snaps);
101 spin_unlock(&inode->i_lock);
102
103 /* now adjust page */
104 spin_lock_irq(&mapping->tree_lock);
105 if (page->mapping) { /* Race with truncate? */
106 WARN_ON_ONCE(!PageUptodate(page));
107
108 if (mapping_cap_account_dirty(mapping)) {
109 __inc_zone_page_state(page, NR_FILE_DIRTY);
110 __inc_bdi_stat(mapping->backing_dev_info,
111 BDI_RECLAIMABLE);
112 task_io_account_write(PAGE_CACHE_SIZE);
113 }
114 radix_tree_tag_set(&mapping->page_tree,
115 page_index(page), PAGECACHE_TAG_DIRTY);
116
117 /*
118 * Reference snap context in page->private. Also set
119 * PagePrivate so that we get invalidatepage callback.
120 */
121 page->private = (unsigned long)snapc;
122 SetPagePrivate(page);
123 } else {
124 dout("ANON set_page_dirty %p (raced truncate?)\n", page);
125 undo = 1;
126 }
127
128 spin_unlock_irq(&mapping->tree_lock);
129
130 if (undo)
131 /* whoops, we failed to dirty the page */
132 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
133
134 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
135
136 BUG_ON(!PageDirty(page));
137 return 1;
138}
139
140/*
141 * If we are truncating the full page (i.e. offset == 0), adjust the
142 * dirty page counters appropriately. Only called if there is private
143 * data on the page.
144 */
145static void ceph_invalidatepage(struct page *page, unsigned long offset)
146{
147 struct inode *inode;
148 struct ceph_inode_info *ci;
149 struct ceph_snap_context *snapc = (void *)page->private;
150
151 BUG_ON(!PageLocked(page));
152 BUG_ON(!page->private);
153 BUG_ON(!PagePrivate(page));
154 BUG_ON(!page->mapping);
155
156 inode = page->mapping->host;
157
158 /*
159 * We can get non-dirty pages here due to races between
160 * set_page_dirty and truncate_complete_page; just spit out a
161 * warning, in case we end up with accounting problems later.
162 */
163 if (!PageDirty(page))
164 pr_err("%p invalidatepage %p page not dirty\n", inode, page);
165
166 if (offset == 0)
167 ClearPageChecked(page);
168
169 ci = ceph_inode(inode);
170 if (offset == 0) {
171 dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
172 inode, page, page->index, offset);
173 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
174 ceph_put_snap_context(snapc);
175 page->private = 0;
176 ClearPagePrivate(page);
177 } else {
178 dout("%p invalidatepage %p idx %lu partial dirty page\n",
179 inode, page, page->index);
180 }
181}
182
183/* just a sanity check */
184static int ceph_releasepage(struct page *page, gfp_t g)
185{
186 struct inode *inode = page->mapping ? page->mapping->host : NULL;
187 dout("%p releasepage %p idx %lu\n", inode, page, page->index);
188 WARN_ON(PageDirty(page));
189 WARN_ON(page->private);
190 WARN_ON(PagePrivate(page));
191 return 0;
192}
193
194/*
195 * read a single page, without unlocking it.
196 */
197static int readpage_nounlock(struct file *filp, struct page *page)
198{
199 struct inode *inode = filp->f_dentry->d_inode;
200 struct ceph_inode_info *ci = ceph_inode(inode);
201 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
202 int err = 0;
203 u64 len = PAGE_CACHE_SIZE;
204
205 dout("readpage inode %p file %p page %p index %lu\n",
206 inode, filp, page, page->index);
207 err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
208 page->index << PAGE_CACHE_SHIFT, &len,
209 ci->i_truncate_seq, ci->i_truncate_size,
210 &page, 1);
211 if (err == -ENOENT)
212 err = 0;
213 if (err < 0) {
214 SetPageError(page);
215 goto out;
216 } else if (err < PAGE_CACHE_SIZE) {
217 /* zero fill remainder of page */
218 zero_user_segment(page, err, PAGE_CACHE_SIZE);
219 }
220 SetPageUptodate(page);
221
222out:
223 return err < 0 ? err : 0;
224}
225
226static int ceph_readpage(struct file *filp, struct page *page)
227{
228 int r = readpage_nounlock(filp, page);
229 unlock_page(page);
230 return r;
231}
232
233/*
234 * Build a vector of contiguous pages from the provided page list.
235 */
236static struct page **page_vector_from_list(struct list_head *page_list,
237 unsigned *nr_pages)
238{
239 struct page **pages;
240 struct page *page;
241 int next_index, contig_pages = 0;
242
243 /* build page vector */
244 pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
245 if (!pages)
246 return ERR_PTR(-ENOMEM);
247
248 BUG_ON(list_empty(page_list));
249 next_index = list_entry(page_list->prev, struct page, lru)->index;
250 list_for_each_entry_reverse(page, page_list, lru) {
251 if (page->index == next_index) {
252 dout("readpages page %d %p\n", contig_pages, page);
253 pages[contig_pages] = page;
254 contig_pages++;
255 next_index++;
256 } else {
257 break;
258 }
259 }
260 *nr_pages = contig_pages;
261 return pages;
262}
263
264/*
265 * Read multiple pages. Leave pages we don't read + unlock in page_list;
266 * the caller (VM) cleans them up.
267 */
268static int ceph_readpages(struct file *file, struct address_space *mapping,
269 struct list_head *page_list, unsigned nr_pages)
270{
271 struct inode *inode = file->f_dentry->d_inode;
272 struct ceph_inode_info *ci = ceph_inode(inode);
273 struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
274 int rc = 0;
275 struct page **pages;
276 struct pagevec pvec;
277 loff_t offset;
278 u64 len;
279
280 dout("readpages %p file %p nr_pages %d\n",
281 inode, file, nr_pages);
282
283 pages = page_vector_from_list(page_list, &nr_pages);
284 if (IS_ERR(pages))
285 return PTR_ERR(pages);
286
287 /* guess read extent */
288 offset = pages[0]->index << PAGE_CACHE_SHIFT;
289 len = nr_pages << PAGE_CACHE_SHIFT;
290 rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
291 offset, &len,
292 ci->i_truncate_seq, ci->i_truncate_size,
293 pages, nr_pages);
294 if (rc == -ENOENT)
295 rc = 0;
296 if (rc < 0)
297 goto out;
298
299 /* set uptodate and add to lru in pagevec-sized chunks */
300 pagevec_init(&pvec, 0);
301 for (; !list_empty(page_list) && len > 0;
302 rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
303 struct page *page =
304 list_entry(page_list->prev, struct page, lru);
305
306 list_del(&page->lru);
307
308 if (rc < (int)PAGE_CACHE_SIZE) {
309 /* zero (remainder of) page */
310 int s = rc < 0 ? 0 : rc;
311 zero_user_segment(page, s, PAGE_CACHE_SIZE);
312 }
313
314 if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
315 page_cache_release(page);
316 dout("readpages %p add_to_page_cache failed %p\n",
317 inode, page);
318 continue;
319 }
320 dout("readpages %p adding %p idx %lu\n", inode, page,
321 page->index);
322 flush_dcache_page(page);
323 SetPageUptodate(page);
324 unlock_page(page);
325 if (pagevec_add(&pvec, page) == 0)
326 pagevec_lru_add_file(&pvec); /* add to lru */
327 }
328 pagevec_lru_add_file(&pvec);
329 rc = 0;
330
331out:
332 kfree(pages);
333 return rc;
334}
335
336/*
337 * Get ref for the oldest snapc for an inode with dirty data... that is, the
338 * only snap context we are allowed to write back.
339 *
340 * Caller holds i_lock.
341 */
342static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
343 u64 *snap_size)
344{
345 struct ceph_inode_info *ci = ceph_inode(inode);
346 struct ceph_snap_context *snapc = NULL;
347 struct ceph_cap_snap *capsnap = NULL;
348
349 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
350 dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
351 capsnap->context, capsnap->dirty_pages);
352 if (capsnap->dirty_pages) {
353 snapc = ceph_get_snap_context(capsnap->context);
354 if (snap_size)
355 *snap_size = capsnap->size;
356 break;
357 }
358 }
359 if (!snapc && ci->i_snap_realm) {
360 snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
361 dout(" head snapc %p has %d dirty pages\n",
362 snapc, ci->i_wrbuffer_ref_head);
363 }
364 return snapc;
365}
366
367static struct ceph_snap_context *get_oldest_context(struct inode *inode,
368 u64 *snap_size)
369{
370 struct ceph_snap_context *snapc = NULL;
371
372 spin_lock(&inode->i_lock);
373 snapc = __get_oldest_context(inode, snap_size);
374 spin_unlock(&inode->i_lock);
375 return snapc;
376}
377
378/*
379 * Write a single page, but leave the page locked.
380 *
381 * If we get a write error, set the page error bit, but still adjust the
382 * dirty page accounting (i.e., page is no longer dirty).
383 */
384static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
385{
386 struct inode *inode;
387 struct ceph_inode_info *ci;
388 struct ceph_client *client;
389 struct ceph_osd_client *osdc;
390 loff_t page_off = page->index << PAGE_CACHE_SHIFT;
391 int len = PAGE_CACHE_SIZE;
392 loff_t i_size;
393 int err = 0;
394 struct ceph_snap_context *snapc;
395 u64 snap_size = 0;
396 long writeback_stat;
397
398 dout("writepage %p idx %lu\n", page, page->index);
399
400 if (!page->mapping || !page->mapping->host) {
401 dout("writepage %p - no mapping\n", page);
402 return -EFAULT;
403 }
404 inode = page->mapping->host;
405 ci = ceph_inode(inode);
406 client = ceph_inode_to_client(inode);
407 osdc = &client->osdc;
408
409 /* verify this is a writeable snap context */
410 snapc = (void *)page->private;
411 if (snapc == NULL) {
412 dout("writepage %p page %p not dirty?\n", inode, page);
413 goto out;
414 }
415 if (snapc != get_oldest_context(inode, &snap_size)) {
416 dout("writepage %p page %p snapc %p not writeable - noop\n",
417 inode, page, (void *)page->private);
418 /* we should only noop if called by kswapd */
419 WARN_ON((current->flags & PF_MEMALLOC) == 0);
420 goto out;
421 }
422
423 /* is this a partial page at end of file? */
424 if (snap_size)
425 i_size = snap_size;
426 else
427 i_size = i_size_read(inode);
428 if (i_size < page_off + len)
429 len = i_size - page_off;
430
431 dout("writepage %p page %p index %lu on %llu~%u\n",
432 inode, page, page->index, page_off, len);
433
434 writeback_stat = atomic_long_inc_return(&client->writeback_count);
435 if (writeback_stat >
436 CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
437 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
438
439 set_page_writeback(page);
440 err = ceph_osdc_writepages(osdc, ceph_vino(inode),
441 &ci->i_layout, snapc,
442 page_off, len,
443 ci->i_truncate_seq, ci->i_truncate_size,
444 &inode->i_mtime,
445 &page, 1, 0, 0, true);
446 if (err < 0) {
447 dout("writepage setting page/mapping error %d %p\n", err, page);
448 SetPageError(page);
449 mapping_set_error(&inode->i_data, err);
450 if (wbc)
451 wbc->pages_skipped++;
452 } else {
453 dout("writepage cleaned page %p\n", page);
454 err = 0; /* vfs expects us to return 0 */
455 }
456 page->private = 0;
457 ClearPagePrivate(page);
458 end_page_writeback(page);
459 ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
460 ceph_put_snap_context(snapc);
461out:
462 return err;
463}
464
465static int ceph_writepage(struct page *page, struct writeback_control *wbc)
466{
467 int err;
468 struct inode *inode = page->mapping->host;
469 BUG_ON(!inode);
470 igrab(inode);
471 err = writepage_nounlock(page, wbc);
472 unlock_page(page);
473 iput(inode);
474 return err;
475}
476
477
478/*
479 * lame release_pages helper. release_pages() isn't exported to
480 * modules.
481 */
482static void ceph_release_pages(struct page **pages, int num)
483{
484 struct pagevec pvec;
485 int i;
486
487 pagevec_init(&pvec, 0);
488 for (i = 0; i < num; i++) {
489 if (pagevec_add(&pvec, pages[i]) == 0)
490 pagevec_release(&pvec);
491 }
492 pagevec_release(&pvec);
493}
494
495
496/*
497 * async writeback completion handler.
498 *
499 * If we get an error, set the mapping error bit, but not the individual
500 * page error bits.
501 */
502static void writepages_finish(struct ceph_osd_request *req,
503 struct ceph_msg *msg)
504{
505 struct inode *inode = req->r_inode;
506 struct ceph_osd_reply_head *replyhead;
507 struct ceph_osd_op *op;
508 struct ceph_inode_info *ci = ceph_inode(inode);
509 unsigned wrote;
510 struct page *page;
511 int i;
512 struct ceph_snap_context *snapc = req->r_snapc;
513 struct address_space *mapping = inode->i_mapping;
514 struct writeback_control *wbc = req->r_wbc;
515 __s32 rc = -EIO;
516 u64 bytes = 0;
517 struct ceph_client *client = ceph_inode_to_client(inode);
518 long writeback_stat;
519 unsigned issued = __ceph_caps_issued(ci, NULL);
520
521 /* parse reply */
522 replyhead = msg->front.iov_base;
523 WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
524 op = (void *)(replyhead + 1);
525 rc = le32_to_cpu(replyhead->result);
526 bytes = le64_to_cpu(op->extent.length);
527
528 if (rc >= 0) {
529 /*
530 * Assume we wrote the pages we originally sent. The
531 * osd might reply with fewer pages if our writeback
532 * raced with a truncation and was adjusted at the osd,
533 * so don't believe the reply.
534 */
535 wrote = req->r_num_pages;
536 } else {
537 wrote = 0;
538 mapping_set_error(mapping, rc);
539 }
540 dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
541 inode, rc, bytes, wrote);
542
543 /* clean all pages */
544 for (i = 0; i < req->r_num_pages; i++) {
545 page = req->r_pages[i];
546 BUG_ON(!page);
547 WARN_ON(!PageUptodate(page));
548
549 writeback_stat =
550 atomic_long_dec_return(&client->writeback_count);
551 if (writeback_stat <
552 CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
553 clear_bdi_congested(&client->backing_dev_info,
554 BLK_RW_ASYNC);
555
556 if (i >= wrote) {
557 dout("inode %p skipping page %p\n", inode, page);
558 wbc->pages_skipped++;
559 }
560 page->private = 0;
561 ClearPagePrivate(page);
562 ceph_put_snap_context(snapc);
563 dout("unlocking %d %p\n", i, page);
564 end_page_writeback(page);
565
566 /*
567 * We lost the cache cap, need to truncate the page before
568 * it is unlocked, otherwise we'd truncate it later in the
569 * page truncation thread, possibly losing some data that
570 * raced its way in
571 */
572 if ((issued & CEPH_CAP_FILE_CACHE) == 0)
573 generic_error_remove_page(inode->i_mapping, page);
574
575 unlock_page(page);
576 }
577 dout("%p wrote+cleaned %d pages\n", inode, wrote);
578 ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
579
580 ceph_release_pages(req->r_pages, req->r_num_pages);
581 if (req->r_pages_from_pool)
582 mempool_free(req->r_pages,
583 ceph_client(inode->i_sb)->wb_pagevec_pool);
584 else
585 kfree(req->r_pages);
586 ceph_osdc_put_request(req);
587}
588
589/*
590 * allocate a page vec, either directly, or if necessary, via the
591 * mempool. we avoid the mempool if we can because req->r_num_pages
592 * may be less than the maximum write size.
593 */
594static void alloc_page_vec(struct ceph_client *client,
595 struct ceph_osd_request *req)
596{
597 req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
598 GFP_NOFS);
599 if (!req->r_pages) {
600 req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
601 req->r_pages_from_pool = 1;
602 WARN_ON(!req->r_pages);
603 }
604}
605
606/*
607 * initiate async writeback
608 */
609static int ceph_writepages_start(struct address_space *mapping,
610 struct writeback_control *wbc)
611{
612 struct inode *inode = mapping->host;
613 struct backing_dev_info *bdi = mapping->backing_dev_info;
614 struct ceph_inode_info *ci = ceph_inode(inode);
615 struct ceph_client *client;
616 pgoff_t index, start, end;
617 int range_whole = 0;
618 int should_loop = 1;
619 pgoff_t max_pages = 0, max_pages_ever = 0;
620 struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
621 struct pagevec pvec;
622 int done = 0;
623 int rc = 0;
624 unsigned wsize = 1 << inode->i_blkbits;
625 struct ceph_osd_request *req = NULL;
626 int do_sync;
627 u64 snap_size = 0;
628
629 /*
630 * Include a 'sync' in the OSD request if this is a data
631 * integrity write (e.g., O_SYNC write or fsync()), or if our
632 * cap is being revoked.
633 */
634 do_sync = wbc->sync_mode == WB_SYNC_ALL;
635 if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
636 do_sync = 1;
637 dout("writepages_start %p dosync=%d (mode=%s)\n",
638 inode, do_sync,
639 wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
640 (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
641
642 client = ceph_inode_to_client(inode);
643 if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
644 pr_warning("writepage_start %p on forced umount\n", inode);
645 return -EIO; /* we're in a forced umount, don't write! */
646 }
647 if (client->mount_args->wsize && client->mount_args->wsize < wsize)
648 wsize = client->mount_args->wsize;
649 if (wsize < PAGE_CACHE_SIZE)
650 wsize = PAGE_CACHE_SIZE;
651 max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
652
653 pagevec_init(&pvec, 0);
654
655 /* ?? */
656 if (wbc->nonblocking && bdi_write_congested(bdi)) {
657 dout(" writepages congested\n");
658 wbc->encountered_congestion = 1;
659 goto out_final;
660 }
661
662 /* where to start/end? */
663 if (wbc->range_cyclic) {
664 start = mapping->writeback_index; /* Start from prev offset */
665 end = -1;
666 dout(" cyclic, start at %lu\n", start);
667 } else {
668 start = wbc->range_start >> PAGE_CACHE_SHIFT;
669 end = wbc->range_end >> PAGE_CACHE_SHIFT;
670 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
671 range_whole = 1;
672 should_loop = 0;
673 dout(" not cyclic, %lu to %lu\n", start, end);
674 }
675 index = start;
676
677retry:
678 /* find oldest snap context with dirty data */
679 ceph_put_snap_context(snapc);
680 snapc = get_oldest_context(inode, &snap_size);
681 if (!snapc) {
682 /* hmm, why does writepages get called when there
683 is no dirty data? */
684 dout(" no snap context with dirty data?\n");
685 goto out;
686 }
687 dout(" oldest snapc is %p seq %lld (%d snaps)\n",
688 snapc, snapc->seq, snapc->num_snaps);
689 if (last_snapc && snapc != last_snapc) {
690 /* if we switched to a newer snapc, restart our scan at the
691 * start of the original file range. */
692 dout(" snapc differs from last pass, restarting at %lu\n",
693 index);
694 index = start;
695 }
696 last_snapc = snapc;
697
698 while (!done && index <= end) {
699 unsigned i;
700 int first;
701 pgoff_t next;
702 int pvec_pages, locked_pages;
703 struct page *page;
704 int want;
705 u64 offset, len;
706 struct ceph_osd_request_head *reqhead;
707 struct ceph_osd_op *op;
708 long writeback_stat;
709
710 next = 0;
711 locked_pages = 0;
712 max_pages = max_pages_ever;
713
714get_more_pages:
715 first = -1;
716 want = min(end - index,
717 min((pgoff_t)PAGEVEC_SIZE,
718 max_pages - (pgoff_t)locked_pages) - 1)
719 + 1;
720 pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
721 PAGECACHE_TAG_DIRTY,
722 want);
723 dout("pagevec_lookup_tag got %d\n", pvec_pages);
724 if (!pvec_pages && !locked_pages)
725 break;
726 for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
727 page = pvec.pages[i];
728 dout("? %p idx %lu\n", page, page->index);
729 if (locked_pages == 0)
730 lock_page(page); /* first page */
731 else if (!trylock_page(page))
732 break;
733
734 /* only dirty pages, or our accounting breaks */
735 if (unlikely(!PageDirty(page)) ||
736 unlikely(page->mapping != mapping)) {
737 dout("!dirty or !mapping %p\n", page);
738 unlock_page(page);
739 break;
740 }
741 if (!wbc->range_cyclic && page->index > end) {
742 dout("end of range %p\n", page);
743 done = 1;
744 unlock_page(page);
745 break;
746 }
747 if (next && (page->index != next)) {
748 dout("not consecutive %p\n", page);
749 unlock_page(page);
750 break;
751 }
752 if (wbc->sync_mode != WB_SYNC_NONE) {
753 dout("waiting on writeback %p\n", page);
754 wait_on_page_writeback(page);
755 }
756 if ((snap_size && page_offset(page) > snap_size) ||
757 (!snap_size &&
758 page_offset(page) > i_size_read(inode))) {
759 dout("%p page eof %llu\n", page, snap_size ?
760 snap_size : i_size_read(inode));
761 done = 1;
762 unlock_page(page);
763 break;
764 }
765 if (PageWriteback(page)) {
766 dout("%p under writeback\n", page);
767 unlock_page(page);
768 break;
769 }
770
771 /* only if matching snap context */
772 if (snapc != (void *)page->private) {
773 dout("page snapc %p != oldest %p\n",
774 (void *)page->private, snapc);
775 unlock_page(page);
776 if (!locked_pages)
777 continue; /* keep looking for snap */
778 break;
779 }
780
781 if (!clear_page_dirty_for_io(page)) {
782 dout("%p !clear_page_dirty_for_io\n", page);
783 unlock_page(page);
784 break;
785 }
786
787 /* ok */
788 if (locked_pages == 0) {
789 /* prepare async write request */
790 offset = page->index << PAGE_CACHE_SHIFT;
791 len = wsize;
792 req = ceph_osdc_new_request(&client->osdc,
793 &ci->i_layout,
794 ceph_vino(inode),
795 offset, &len,
796 CEPH_OSD_OP_WRITE,
797 CEPH_OSD_FLAG_WRITE |
798 CEPH_OSD_FLAG_ONDISK,
799 snapc, do_sync,
800 ci->i_truncate_seq,
801 ci->i_truncate_size,
802 &inode->i_mtime, true, 1);
803 max_pages = req->r_num_pages;
804
805 alloc_page_vec(client, req);
806 req->r_callback = writepages_finish;
807 req->r_inode = inode;
808 req->r_wbc = wbc;
809 }
810
811 /* note position of first page in pvec */
812 if (first < 0)
813 first = i;
814 dout("%p will write page %p idx %lu\n",
815 inode, page, page->index);
816
817 writeback_stat = atomic_long_inc_return(&client->writeback_count);
818 if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
819 set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
820 }
821
822 set_page_writeback(page);
823 req->r_pages[locked_pages] = page;
824 locked_pages++;
825 next = page->index + 1;
826 }
827
828 /* did we get anything? */
829 if (!locked_pages)
830 goto release_pvec_pages;
831 if (i) {
832 int j;
833 BUG_ON(!locked_pages || first < 0);
834
835 if (pvec_pages && i == pvec_pages &&
836 locked_pages < max_pages) {
837 dout("reached end pvec, trying for more\n");
838 pagevec_reinit(&pvec);
839 goto get_more_pages;
840 }
841
842 /* shift unused pages over in the pvec... we
843 * will need to release them below. */
844 for (j = i; j < pvec_pages; j++) {
845 dout(" pvec leftover page %p\n",
846 pvec.pages[j]);
847 pvec.pages[j-i+first] = pvec.pages[j];
848 }
849 pvec.nr -= i-first;
850 }
851
852 /* submit the write */
853 offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
854 len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
855 (u64)locked_pages << PAGE_CACHE_SHIFT);
856 dout("writepages got %d pages at %llu~%llu\n",
857 locked_pages, offset, len);
858
859 /* revise final length, page count */
860 req->r_num_pages = locked_pages;
861 reqhead = req->r_request->front.iov_base;
862 op = (void *)(reqhead + 1);
863 op->extent.length = cpu_to_le64(len);
864 op->payload_len = cpu_to_le32(len);
865 req->r_request->hdr.data_len = cpu_to_le32(len);
866
867 ceph_osdc_start_request(&client->osdc, req, true);
868 req = NULL;
869
870 /* continue? */
871 index = next;
872 wbc->nr_to_write -= locked_pages;
873 if (wbc->nr_to_write <= 0)
874 done = 1;
875
876release_pvec_pages:
877 dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
878 pvec.nr ? pvec.pages[0] : NULL);
879 pagevec_release(&pvec);
880
881 if (locked_pages && !done)
882 goto retry;
883 }
884
885 if (should_loop && !done) {
886 /* more to do; loop back to beginning of file */
887 dout("writepages looping back to beginning of file\n");
888 should_loop = 0;
889 index = 0;
890 goto retry;
891 }
892
893 if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
894 mapping->writeback_index = index;
895
896out:
897 if (req)
898 ceph_osdc_put_request(req);
899 if (rc > 0)
900 rc = 0; /* vfs expects us to return 0 */
901 ceph_put_snap_context(snapc);
902 dout("writepages done, rc = %d\n", rc);
903out_final:
904 return rc;
905}
906
907
908
909/*
910 * See if a given @snapc is writeable (i.e., it is the oldest snap context with dirty data) or already written (strictly older than the oldest remaining dirty context).
911 */
912static int context_is_writeable_or_written(struct inode *inode,
913 struct ceph_snap_context *snapc)
914{
915 struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
916 return !oldest || snapc->seq <= oldest->seq;
917}
918
919/*
920 * We are only allowed to write into/dirty the page if the page is
921 * clean, or already dirty within the same snap context.
922 */
923static int ceph_update_writeable_page(struct file *file,
924 loff_t pos, unsigned len,
925 struct page *page)
926{
927 struct inode *inode = file->f_dentry->d_inode;
928 struct ceph_inode_info *ci = ceph_inode(inode);
929 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
930 loff_t page_off = pos & PAGE_CACHE_MASK;
931 int pos_in_page = pos & ~PAGE_CACHE_MASK;
932 int end_in_page = pos_in_page + len;
933 loff_t i_size;
934 struct ceph_snap_context *snapc;
935 int r;
936
937retry_locked:
938 /* writepages currently holds page lock, but if we change that later, */
939 wait_on_page_writeback(page);
940
941 /* check snap context */
942 BUG_ON(!ci->i_snap_realm);
943 down_read(&mdsc->snap_rwsem);
944 BUG_ON(!ci->i_snap_realm->cached_context);
945 if (page->private &&
946 (void *)page->private != ci->i_snap_realm->cached_context) {
947 /*
948 * this page is already dirty in another (older) snap
949 * context! is it writeable now?
950 */
951 snapc = get_oldest_context(inode, NULL);
952 up_read(&mdsc->snap_rwsem);
953
954 if (snapc != (void *)page->private) {
955 dout(" page %p snapc %p not current or oldest\n",
956 page, (void *)page->private);
957 /*
958 * queue for writeback, and wait for snapc to
959 * be writeable or written
960 */
961 snapc = ceph_get_snap_context((void *)page->private);
962 unlock_page(page);
963 ceph_queue_writeback(inode);
964 wait_event_interruptible(ci->i_cap_wq,
965 context_is_writeable_or_written(inode, snapc));
966 ceph_put_snap_context(snapc);
967 return -EAGAIN;
968 }
969
970 /* yay, writeable, do it now (without dropping page lock) */
971 dout(" page %p snapc %p not current, but oldest\n",
972 page, snapc);
973 if (!clear_page_dirty_for_io(page))
974 goto retry_locked;
975 r = writepage_nounlock(page, NULL);
976 if (r < 0)
977 goto fail_nosnap;
978 goto retry_locked;
979 }
980
981 if (PageUptodate(page)) {
982 dout(" page %p already uptodate\n", page);
983 return 0;
984 }
985
986 /* full page? */
987 if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
988 return 0;
989
990 /* past end of file? */
991 i_size = inode->i_size; /* caller holds i_mutex */
992
993 if (i_size + len > inode->i_sb->s_maxbytes) {
994 /* file is too big */
995 r = -EINVAL;
996 goto fail;
997 }
998
999 if (page_off >= i_size ||
1000 (pos_in_page == 0 && (pos+len) >= i_size &&
1001 end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
1002 dout(" zeroing %p 0 - %d and %d - %d\n",
1003 page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
1004 zero_user_segments(page,
1005 0, pos_in_page,
1006 end_in_page, PAGE_CACHE_SIZE);
1007 return 0;
1008 }
1009
1010 /* we need to read it. */
1011 up_read(&mdsc->snap_rwsem);
1012 r = readpage_nounlock(file, page);
1013 if (r < 0)
1014 goto fail_nosnap;
1015 goto retry_locked;
1016
1017fail:
1018 up_read(&mdsc->snap_rwsem);
1019fail_nosnap:
1020 unlock_page(page);
1021 return r;
1022}
1023
1024/*
1025 * We are only allowed to write into/dirty the page if the page is
1026 * clean, or already dirty within the same snap context.
1027 */
1028static int ceph_write_begin(struct file *file, struct address_space *mapping,
1029 loff_t pos, unsigned len, unsigned flags,
1030 struct page **pagep, void **fsdata)
1031{
1032 struct inode *inode = file->f_dentry->d_inode;
1033 struct page *page;
1034 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
1035 int r;
1036
1037 do {
1038 /* get a page*/
1039 page = grab_cache_page_write_begin(mapping, index, 0);
1040 if (!page)
1041 return -ENOMEM;
1042 *pagep = page;
1043
1044 dout("write_begin file %p inode %p page %p %d~%d\n", file,
1045 inode, page, (int)pos, (int)len);
1046
1047 r = ceph_update_writeable_page(file, pos, len, page);
1048 } while (r == -EAGAIN);
1049
1050 return r;
1051}
1052
1053/*
1054 * we don't do anything in here that simple_write_end doesn't do
1055 * except adjust dirty page accounting and drop read lock on
1056 * mdsc->snap_rwsem.
1057 */
1058static int ceph_write_end(struct file *file, struct address_space *mapping,
1059 loff_t pos, unsigned len, unsigned copied,
1060 struct page *page, void *fsdata)
1061{
1062 struct inode *inode = file->f_dentry->d_inode;
1063 struct ceph_client *client = ceph_inode_to_client(inode);
1064 struct ceph_mds_client *mdsc = &client->mdsc;
1065 unsigned from = pos & (PAGE_CACHE_SIZE - 1);
1066 int check_cap = 0;
1067
1068 dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
1069 inode, page, (int)pos, (int)copied, (int)len);
1070
1071 /* zero the stale part of the page if we did a short copy */
1072 if (copied < len)
1073 zero_user_segment(page, from+copied, len);
1074
1075 /* did file size increase? */
1076	/* (no need for i_size_read(); the caller holds i_mutex) */
1077 if (pos+copied > inode->i_size)
1078 check_cap = ceph_inode_set_size(inode, pos+copied);
1079
1080 if (!PageUptodate(page))
1081 SetPageUptodate(page);
1082
1083 set_page_dirty(page);
1084
1085 unlock_page(page);
1086 up_read(&mdsc->snap_rwsem);
1087 page_cache_release(page);
1088
1089 if (check_cap)
1090 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
1091
1092 return copied;
1093}
1094
1095/*
1096 * we set .direct_IO to indicate direct io is supported, but since we
1097 * intercept O_DIRECT reads and writes early, this function should
1098 * never get called.
1099 */
1100static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
1101 const struct iovec *iov,
1102 loff_t pos, unsigned long nr_segs)
1103{
1104 WARN_ON(1);
1105 return -EINVAL;
1106}
1107
1108const struct address_space_operations ceph_aops = {
1109 .readpage = ceph_readpage,
1110 .readpages = ceph_readpages,
1111 .writepage = ceph_writepage,
1112 .writepages = ceph_writepages_start,
1113 .write_begin = ceph_write_begin,
1114 .write_end = ceph_write_end,
1115 .set_page_dirty = ceph_set_page_dirty,
1116 .invalidatepage = ceph_invalidatepage,
1117 .releasepage = ceph_releasepage,
1118 .direct_IO = ceph_direct_io,
1119};
1120
1121
1122/*
1123 * vm ops
1124 */
1125
1126/*
1127 * Reuse write_begin here for simplicity.
1128 */
1129static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
1130{
1131 struct inode *inode = vma->vm_file->f_dentry->d_inode;
1132 struct page *page = vmf->page;
1133 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1134 loff_t off = page->index << PAGE_CACHE_SHIFT;
1135 loff_t size, len;
1136 int ret;
1137
1138 size = i_size_read(inode);
1139 if (off + PAGE_CACHE_SIZE <= size)
1140 len = PAGE_CACHE_SIZE;
1141 else
1142 len = size & ~PAGE_CACHE_MASK;
1143
1144 dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
1145 off, len, page, page->index);
1146
1147 lock_page(page);
1148
1149 ret = VM_FAULT_NOPAGE;
1150 if ((off > size) ||
1151 (page->mapping != inode->i_mapping))
1152 goto out;
1153
1154 ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
1155 if (ret == 0) {
1156 /* success. we'll keep the page locked. */
1157 set_page_dirty(page);
1158 up_read(&mdsc->snap_rwsem);
1159 ret = VM_FAULT_LOCKED;
1160 } else {
1161 if (ret == -ENOMEM)
1162 ret = VM_FAULT_OOM;
1163 else
1164 ret = VM_FAULT_SIGBUS;
1165 }
1166out:
1167 dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
1168 if (ret != VM_FAULT_LOCKED)
1169 unlock_page(page);
1170 return ret;
1171}
1172
1173static struct vm_operations_struct ceph_vmops = {
1174 .fault = filemap_fault,
1175 .page_mkwrite = ceph_page_mkwrite,
1176};
1177
1178int ceph_mmap(struct file *file, struct vm_area_struct *vma)
1179{
1180 struct address_space *mapping = file->f_mapping;
1181
1182 if (!mapping->a_ops->readpage)
1183 return -ENOEXEC;
1184 file_accessed(file);
1185 vma->vm_ops = &ceph_vmops;
1186 vma->vm_flags |= VM_CAN_NONLINEAR;
1187 return 0;
1188}
diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 000000000000..67b2c030924b
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
1
2#include <linux/errno.h>
3
4/*
5 * base64 encode/decode.
6 */
7
8const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
9
10static int encode_bits(int c)
11{
12 return pem_key[c];
13}
14
15static int decode_bits(char c)
16{
17 if (c >= 'A' && c <= 'Z')
18 return c - 'A';
19 if (c >= 'a' && c <= 'z')
20 return c - 'a' + 26;
21 if (c >= '0' && c <= '9')
22 return c - '0' + 52;
23 if (c == '+')
24 return 62;
25 if (c == '/')
26 return 63;
27 if (c == '=')
28 return 0; /* just non-negative, please */
29 return -EINVAL;
30}
31
32int ceph_armor(char *dst, const char *src, const char *end)
33{
34 int olen = 0;
35 int line = 0;
36
37 while (src < end) {
38 unsigned char a, b, c;
39
40 a = *src++;
41 *dst++ = encode_bits(a >> 2);
42 if (src < end) {
43 b = *src++;
44 *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
45 if (src < end) {
46 c = *src++;
47 *dst++ = encode_bits(((b & 15) << 2) |
48 (c >> 6));
49 *dst++ = encode_bits(c & 63);
50 } else {
51 *dst++ = encode_bits((b & 15) << 2);
52 *dst++ = '=';
53 }
54 } else {
55 *dst++ = encode_bits(((a & 3) << 4));
56 *dst++ = '=';
57 *dst++ = '=';
58 }
59 olen += 4;
60 line += 4;
61 if (line == 64) {
62 line = 0;
63 *(dst++) = '\n';
64 olen++;
65 }
66 }
67 return olen;
68}
69
70int ceph_unarmor(char *dst, const char *src, const char *end)
71{
72 int olen = 0;
73
74 while (src < end) {
75 int a, b, c, d;
76
77 if (src < end && src[0] == '\n')
78 src++;
79 if (src + 4 > end)
80 return -EINVAL;
81 a = decode_bits(src[0]);
82 b = decode_bits(src[1]);
83 c = decode_bits(src[2]);
84 d = decode_bits(src[3]);
85 if (a < 0 || b < 0 || c < 0 || d < 0)
86 return -EINVAL;
87
88 *dst++ = (a << 2) | (b >> 4);
89 if (src[2] == '=')
90 return olen + 1;
91 *dst++ = ((b & 15) << 4) | (c >> 2);
92 if (src[3] == '=')
93 return olen + 2;
94 *dst++ = ((c & 3) << 6) | d;
95 olen += 3;
96 src += 4;
97 }
98 return olen;
99}
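[Editorial aside, not part of the patch: a userspace round-trip check for the two routines above. It assumes armor.c is compiled and linked alongside, and that the output buffers are sized for the 4/3 base64 expansion plus the encoder's 64-column newlines.]

#include <stdio.h>
#include <string.h>

int ceph_armor(char *dst, const char *src, const char *end);
int ceph_unarmor(char *dst, const char *src, const char *end);

int main(void)
{
	const char msg[] = "ceph base64 self-test";
	char enc[128], dec[128];
	int elen, dlen;

	elen = ceph_armor(enc, msg, msg + strlen(msg));
	dlen = ceph_unarmor(dec, enc, enc + elen);
	if (dlen != (int)strlen(msg) || memcmp(dec, msg, dlen) != 0) {
		fprintf(stderr, "round-trip failed\n");
		return 1;
	}
	printf("ok: %d armored bytes -> %d decoded bytes\n", elen, dlen);
	return 0;
}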
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
new file mode 100644
index 000000000000..abb204fea6c7
--- /dev/null
+++ b/fs/ceph/auth.c
@@ -0,0 +1,257 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/err.h>
5
6#include "types.h"
7#include "auth_none.h"
8#include "auth_x.h"
9#include "decode.h"
10#include "super.h"
11
12#include "messenger.h"
13
14/*
15 * get protocol handler
16 */
17static u32 supported_protocols[] = {
18 CEPH_AUTH_NONE,
19 CEPH_AUTH_CEPHX
20};
21
22int ceph_auth_init_protocol(struct ceph_auth_client *ac, int protocol)
23{
24 switch (protocol) {
25 case CEPH_AUTH_NONE:
26 return ceph_auth_none_init(ac);
27 case CEPH_AUTH_CEPHX:
28 return ceph_x_init(ac);
29 default:
30 return -ENOENT;
31 }
32}
33
34/*
35 * setup, teardown.
36 */
37struct ceph_auth_client *ceph_auth_init(const char *name, const char *secret)
38{
39 struct ceph_auth_client *ac;
40 int ret;
41
42 dout("auth_init name '%s' secret '%s'\n", name, secret);
43
44 ret = -ENOMEM;
45 ac = kzalloc(sizeof(*ac), GFP_NOFS);
46 if (!ac)
47 goto out;
48
49 ac->negotiating = true;
50 if (name)
51 ac->name = name;
52 else
53 ac->name = CEPH_AUTH_NAME_DEFAULT;
54 dout("auth_init name %s secret %s\n", ac->name, secret);
55 ac->secret = secret;
56 return ac;
57
58out:
59 return ERR_PTR(ret);
60}
61
62void ceph_auth_destroy(struct ceph_auth_client *ac)
63{
64 dout("auth_destroy %p\n", ac);
65 if (ac->ops)
66 ac->ops->destroy(ac);
67 kfree(ac);
68}
69
70/*
71 * Reset occurs when reconnecting to the monitor.
72 */
73void ceph_auth_reset(struct ceph_auth_client *ac)
74{
75 dout("auth_reset %p\n", ac);
76 if (ac->ops && !ac->negotiating)
77 ac->ops->reset(ac);
78 ac->negotiating = true;
79}
80
81int ceph_entity_name_encode(const char *name, void **p, void *end)
82{
83 int len = strlen(name);
84
85 if (*p + 2*sizeof(u32) + len > end)
86 return -ERANGE;
87 ceph_encode_32(p, CEPH_ENTITY_TYPE_CLIENT);
88 ceph_encode_32(p, len);
89 ceph_encode_copy(p, name, len);
90 return 0;
91}
92
93/*
94 * Initiate protocol negotiation with monitor. Include entity name
95 * and list supported protocols.
96 */
97int ceph_auth_build_hello(struct ceph_auth_client *ac, void *buf, size_t len)
98{
99 struct ceph_mon_request_header *monhdr = buf;
100 void *p = monhdr + 1, *end = buf + len, *lenp;
101 int i, num;
102 int ret;
103
104 dout("auth_build_hello\n");
105 monhdr->have_version = 0;
106 monhdr->session_mon = cpu_to_le16(-1);
107 monhdr->session_mon_tid = 0;
108
109 ceph_encode_32(&p, 0); /* no protocol, yet */
110
111 lenp = p;
112 p += sizeof(u32);
113
114 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
115 ceph_encode_8(&p, 1);
116 num = ARRAY_SIZE(supported_protocols);
117 ceph_encode_32(&p, num);
118 ceph_decode_need(&p, end, num * sizeof(u32), bad);
119 for (i = 0; i < num; i++)
120 ceph_encode_32(&p, supported_protocols[i]);
121
122 ret = ceph_entity_name_encode(ac->name, &p, end);
123 if (ret < 0)
124 return ret;
125 ceph_decode_need(&p, end, sizeof(u64), bad);
126 ceph_encode_64(&p, ac->global_id);
127
128 ceph_encode_32(&lenp, p - lenp - sizeof(u32));
129 return p - buf;
130
131bad:
132 return -ERANGE;
133}
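/*
 * [Editorial note, not in the original:] the hello payload built above
 * is laid out as: ceph_mon_request_header, u32 protocol (0 while
 * negotiating), u32 payload length (backfilled through lenp), u8
 * struct_v (1), u32 protocol count, the supported protocol ids, the
 * entity name (u32 entity type, u32 length, bytes), and a trailing
 * u64 global_id. The ceph_decode_need() calls double as bounds checks
 * on the output buffer here.
 */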
134
135int ceph_build_auth_request(struct ceph_auth_client *ac,
136 void *msg_buf, size_t msg_len)
137{
138 struct ceph_mon_request_header *monhdr = msg_buf;
139 void *p = monhdr + 1;
140 void *end = msg_buf + msg_len;
141 int ret;
142
143 monhdr->have_version = 0;
144 monhdr->session_mon = cpu_to_le16(-1);
145 monhdr->session_mon_tid = 0;
146
147 ceph_encode_32(&p, ac->protocol);
148
149 ret = ac->ops->build_request(ac, p + sizeof(u32), end);
150 if (ret < 0) {
151 pr_err("error %d building request\n", ret);
152 return ret;
153 }
154 dout(" built request %d bytes\n", ret);
155 ceph_encode_32(&p, ret);
156 return p + ret - msg_buf;
157}
158
159/*
160 * Handle auth message from monitor.
161 */
162int ceph_handle_auth_reply(struct ceph_auth_client *ac,
163 void *buf, size_t len,
164 void *reply_buf, size_t reply_len)
165{
166 void *p = buf;
167 void *end = buf + len;
168 int protocol;
169 s32 result;
170 u64 global_id;
171 void *payload, *payload_end;
172 int payload_len;
173 char *result_msg;
174 int result_msg_len;
175 int ret = -EINVAL;
176
177 dout("handle_auth_reply %p %p\n", p, end);
178 ceph_decode_need(&p, end, sizeof(u32) * 3 + sizeof(u64), bad);
179 protocol = ceph_decode_32(&p);
180 result = ceph_decode_32(&p);
181 global_id = ceph_decode_64(&p);
182 payload_len = ceph_decode_32(&p);
183 payload = p;
184 p += payload_len;
185 ceph_decode_need(&p, end, sizeof(u32), bad);
186 result_msg_len = ceph_decode_32(&p);
187 result_msg = p;
188 p += result_msg_len;
189 if (p != end)
190 goto bad;
191
192 dout(" result %d '%.*s' gid %llu len %d\n", result, result_msg_len,
193 result_msg, global_id, payload_len);
194
195 payload_end = payload + payload_len;
196
197 if (global_id && ac->global_id != global_id) {
198 dout(" set global_id %lld -> %lld\n", ac->global_id, global_id);
199 ac->global_id = global_id;
200 }
201
202 if (ac->negotiating) {
203 /* server does not support our protocols? */
204 if (!protocol && result < 0) {
205 ret = result;
206 goto out;
207 }
208 /* set up (new) protocol handler? */
209 if (ac->protocol && ac->protocol != protocol) {
210 ac->ops->destroy(ac);
211 ac->protocol = 0;
212 ac->ops = NULL;
213 }
214 if (ac->protocol != protocol) {
215 ret = ceph_auth_init_protocol(ac, protocol);
216 if (ret) {
217 pr_err("error %d on auth protocol %d init\n",
218 ret, protocol);
219 goto out;
220 }
221 }
222
223 ac->negotiating = false;
224 }
225
226 ret = ac->ops->handle_reply(ac, result, payload, payload_end);
227 if (ret == -EAGAIN) {
228 return ceph_build_auth_request(ac, reply_buf, reply_len);
229 } else if (ret) {
230 pr_err("authentication error %d\n", ret);
231 return ret;
232 }
233 return 0;
234
235bad:
236 pr_err("failed to decode auth msg\n");
237out:
238 return ret;
239}
240
241int ceph_build_auth(struct ceph_auth_client *ac,
242 void *msg_buf, size_t msg_len)
243{
244 if (!ac->protocol)
245 return ceph_auth_build_hello(ac, msg_buf, msg_len);
246 BUG_ON(!ac->ops);
247 if (!ac->ops->is_authenticated(ac))
248 return ceph_build_auth_request(ac, msg_buf, msg_len);
249 return 0;
250}
251
252int ceph_auth_is_authenticated(struct ceph_auth_client *ac)
253{
254 if (!ac->ops)
255 return 0;
256 return ac->ops->is_authenticated(ac);
257}
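[Editorial aside, not part of the patch: the negotiate-then-authenticate flow above reduces to a small driver loop. A toy userspace sketch follows; the toy_* names are invented stand-ins, not the kernel API.]

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_auth {
	bool negotiating;
	bool authenticated;
	int rounds;
};

/* stand-in for ceph_handle_auth_reply(): returns -EAGAIN while the
 * protocol handler wants another request/reply round, 0 when done */
static int toy_handle_reply(struct toy_auth *ac)
{
	ac->negotiating = false;
	if (++ac->rounds < 3)
		return -EAGAIN;
	ac->authenticated = true;
	return 0;
}

int main(void)
{
	struct toy_auth ac = { .negotiating = true };
	int ret;

	printf("send hello\n");			/* ceph_auth_build_hello() */
	while ((ret = toy_handle_reply(&ac)) == -EAGAIN)
		printf("send auth request\n");	/* ceph_build_auth_request() */

	printf("authenticated=%d after %d rounds\n",
	       ac.authenticated, ac.rounds);
	return ret;
}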
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
new file mode 100644
index 000000000000..ca4f57cfb267
--- /dev/null
+++ b/fs/ceph/auth.h
@@ -0,0 +1,84 @@
1#ifndef _FS_CEPH_AUTH_H
2#define _FS_CEPH_AUTH_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * Abstract interface for communicating with the authentication module.
9 * There is some handshake that takes place between us and the monitor
10 * to acquire the necessary keys. These are used to generate an
11 * 'authorizer' that we use when connecting to a service (mds, osd).
12 */
13
14struct ceph_auth_client;
15struct ceph_authorizer;
16
17struct ceph_auth_client_ops {
18 /*
19 * true if we are authenticated and can connect to
20 * services.
21 */
22 int (*is_authenticated)(struct ceph_auth_client *ac);
23
24 /*
25 * build requests and process replies during monitor
26 * handshake. if handle_reply returns -EAGAIN, we build
27 * another request.
28 */
29 int (*build_request)(struct ceph_auth_client *ac, void *buf, void *end);
30 int (*handle_reply)(struct ceph_auth_client *ac, int result,
31 void *buf, void *end);
32
33 /*
34 * Create authorizer for connecting to a service, and verify
35 * the response to authenticate the service.
36 */
37 int (*create_authorizer)(struct ceph_auth_client *ac, int peer_type,
38 struct ceph_authorizer **a,
39 void **buf, size_t *len,
40 void **reply_buf, size_t *reply_len);
41 int (*verify_authorizer_reply)(struct ceph_auth_client *ac,
42 struct ceph_authorizer *a, size_t len);
43 void (*destroy_authorizer)(struct ceph_auth_client *ac,
44 struct ceph_authorizer *a);
45 void (*invalidate_authorizer)(struct ceph_auth_client *ac,
46 int peer_type);
47
48 /* reset when we (re)connect to a monitor */
49 void (*reset)(struct ceph_auth_client *ac);
50
51 void (*destroy)(struct ceph_auth_client *ac);
52};
53
54struct ceph_auth_client {
55 u32 protocol; /* CEPH_AUTH_* */
56 void *private; /* for use by protocol implementation */
57 const struct ceph_auth_client_ops *ops; /* null iff protocol==0 */
58
59 bool negotiating; /* true if negotiating protocol */
60 const char *name; /* entity name */
61 u64 global_id; /* our unique id in system */
62 const char *secret; /* our secret key */
63 unsigned want_keys; /* which services we want */
64};
65
66extern struct ceph_auth_client *ceph_auth_init(const char *name,
67 const char *secret);
68extern void ceph_auth_destroy(struct ceph_auth_client *ac);
69
70extern void ceph_auth_reset(struct ceph_auth_client *ac);
71
72extern int ceph_auth_build_hello(struct ceph_auth_client *ac,
73 void *buf, size_t len);
74extern int ceph_handle_auth_reply(struct ceph_auth_client *ac,
75 void *buf, size_t len,
76 void *reply_buf, size_t reply_len);
77extern int ceph_entity_name_encode(const char *name, void **p, void *end);
78
79extern int ceph_build_auth(struct ceph_auth_client *ac,
80 void *msg_buf, size_t msg_len);
81
82extern int ceph_auth_is_authenticated(struct ceph_auth_client *ac);
83
84#endif
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 000000000000..b4ef6f0a6c85
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_none.h"
9#include "auth.h"
10#include "decode.h"
11
12static void reset(struct ceph_auth_client *ac)
13{
14 struct ceph_auth_none_info *xi = ac->private;
15
16 xi->starting = true;
17 xi->built_authorizer = false;
18}
19
20static void destroy(struct ceph_auth_client *ac)
21{
22 kfree(ac->private);
23 ac->private = NULL;
24}
25
26static int is_authenticated(struct ceph_auth_client *ac)
27{
28 struct ceph_auth_none_info *xi = ac->private;
29
30 return !xi->starting;
31}
32
33/*
34 * the generic auth code decodes the global_id, and we carry no actual
35 * authentication state, so nothing happens here.
36 */
37static int handle_reply(struct ceph_auth_client *ac, int result,
38 void *buf, void *end)
39{
40 struct ceph_auth_none_info *xi = ac->private;
41
42 xi->starting = false;
43 return result;
44}
45
46/*
47 * build an 'authorizer' with our entity_name and global_id. we can
48 * reuse a single static copy since it is identical for all services
49 * we connect to.
50 */
51static int ceph_auth_none_create_authorizer(
52 struct ceph_auth_client *ac, int peer_type,
53 struct ceph_authorizer **a,
54 void **buf, size_t *len,
55 void **reply_buf, size_t *reply_len)
56{
57 struct ceph_auth_none_info *ai = ac->private;
58 struct ceph_none_authorizer *au = &ai->au;
59 void *p, *end;
60 int ret;
61
62 if (!ai->built_authorizer) {
63 p = au->buf;
64 end = p + sizeof(au->buf);
65 ceph_encode_8(&p, 1);
66 ret = ceph_entity_name_encode(ac->name, &p, end - 8);
67 if (ret < 0)
68 goto bad;
69 ceph_decode_need(&p, end, sizeof(u64), bad2);
70 ceph_encode_64(&p, ac->global_id);
71 au->buf_len = p - (void *)au->buf;
72 ai->built_authorizer = true;
73 dout("built authorizer len %d\n", au->buf_len);
74 }
75
76 *a = (struct ceph_authorizer *)au;
77 *buf = au->buf;
78 *len = au->buf_len;
79 *reply_buf = au->reply_buf;
80 *reply_len = sizeof(au->reply_buf);
81 return 0;
82
83bad2:
84 ret = -ERANGE;
85bad:
86 return ret;
87}
88
89static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
90 struct ceph_authorizer *a)
91{
92 /* nothing to do */
93}
94
95static const struct ceph_auth_client_ops ceph_auth_none_ops = {
96 .reset = reset,
97 .destroy = destroy,
98 .is_authenticated = is_authenticated,
99 .handle_reply = handle_reply,
100 .create_authorizer = ceph_auth_none_create_authorizer,
101 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
102};
103
104int ceph_auth_none_init(struct ceph_auth_client *ac)
105{
106 struct ceph_auth_none_info *xi;
107
108 dout("ceph_auth_none_init %p\n", ac);
109 xi = kzalloc(sizeof(*xi), GFP_NOFS);
110 if (!xi)
111 return -ENOMEM;
112
113 xi->starting = true;
114 xi->built_authorizer = false;
115
116 ac->protocol = CEPH_AUTH_NONE;
117 ac->private = xi;
118 ac->ops = &ceph_auth_none_ops;
119 return 0;
120}
121
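
Since the none-mode create_authorizer above hands back a pointer into the single static copy, callers pair it with destroy_authorizer (a no-op here) through the ops table. A brief, hypothetical usage sketch:

/* Hedged usage sketch (hypothetical caller), pairing create/destroy
 * through the ops table as a per-connection handshake would. */
static int example_use_authorizer(struct ceph_auth_client *ac, int peer_type)
{
	struct ceph_authorizer *a;
	void *abuf, *rbuf;
	size_t alen, rlen;
	int ret;

	if (!ac->ops->create_authorizer)
		return 0;	/* protocol needs no authorizer */
	ret = ac->ops->create_authorizer(ac, peer_type, &a,
					 &abuf, &alen, &rbuf, &rlen);
	if (ret)
		return ret;
	/* ... include abuf/alen in the connection handshake; any reply
	 * from the peer lands in rbuf/rlen for verify_authorizer_reply ... */
	ac->ops->destroy_authorizer(ac, a);
	return 0;
}
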
diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 000000000000..56c05533a31c
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
1#ifndef _FS_CEPH_AUTH_NONE_H
2#define _FS_CEPH_AUTH_NONE_H
3
4#include "auth.h"
5
6/*
7 * null security mode.
8 *
9 * we use a single static authorizer that simply encodes our entity name
10 * and global id.
11 */
12
13struct ceph_none_authorizer {
14 char buf[128];
15 int buf_len;
16 char reply_buf[0];
17};
18
19struct ceph_auth_none_info {
20 bool starting;
21 bool built_authorizer;
22 struct ceph_none_authorizer au; /* we only need one; it's static */
23};
24
25extern int ceph_auth_none_init(struct ceph_auth_client *ac);
26
27#endif
28
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 000000000000..f0318427b6da
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,656 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/module.h>
6#include <linux/random.h>
7
8#include "auth_x.h"
9#include "auth_x_protocol.h"
10#include "crypto.h"
11#include "auth.h"
12#include "decode.h"
13
14struct kmem_cache *ceph_x_ticketbuf_cachep;
15
16#define TEMP_TICKET_BUF_LEN 256
17
18static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
19
20static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
21{
22 struct ceph_x_info *xi = ac->private;
23 int need;
24
25 ceph_x_validate_tickets(ac, &need);
26 dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
27 ac->want_keys, need, xi->have_keys);
28 return (ac->want_keys & xi->have_keys) == ac->want_keys;
29}
30
31static int ceph_x_encrypt(struct ceph_crypto_key *secret,
32 void *ibuf, int ilen, void *obuf, size_t olen)
33{
34 struct ceph_x_encrypt_header head = {
35 .struct_v = 1,
36 .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
37 };
38 size_t len = olen - sizeof(u32);
39 int ret;
40
41 ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
42 &head, sizeof(head), ibuf, ilen);
43 if (ret)
44 return ret;
45 ceph_encode_32(&obuf, len);
46 return len + sizeof(u32);
47}
48
49static int ceph_x_decrypt(struct ceph_crypto_key *secret,
50 void **p, void *end, void *obuf, size_t olen)
51{
52 struct ceph_x_encrypt_header head;
53 size_t head_len = sizeof(head);
54 int len, ret;
55
56 len = ceph_decode_32(p);
57 if (*p + len > end)
58 return -EINVAL;
59
60 dout("ceph_x_decrypt len %d\n", len);
61 ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
62 *p, len);
63 if (ret)
64 return ret;
65 if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
66 return -EPERM;
67 *p += len;
68 return olen;
69}
70
71/*
72 * get existing (or insert new) ticket handler
73 */
74struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
75 int service)
76{
77 struct ceph_x_ticket_handler *th;
78 struct ceph_x_info *xi = ac->private;
79 struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;
80
81 while (*p) {
82 parent = *p;
83 th = rb_entry(parent, struct ceph_x_ticket_handler, node);
84 if (service < th->service)
85 p = &(*p)->rb_left;
86 else if (service > th->service)
87 p = &(*p)->rb_right;
88 else
89 return th;
90 }
91
92 /* add it */
93 th = kzalloc(sizeof(*th), GFP_NOFS);
94 if (!th)
95 return ERR_PTR(-ENOMEM);
96 th->service = service;
97 rb_link_node(&th->node, parent, p);
98 rb_insert_color(&th->node, &xi->ticket_handlers);
99 return th;
100}
101
102static void remove_ticket_handler(struct ceph_auth_client *ac,
103 struct ceph_x_ticket_handler *th)
104{
105 struct ceph_x_info *xi = ac->private;
106
107 dout("remove_ticket_handler %p %d\n", th, th->service);
108 rb_erase(&th->node, &xi->ticket_handlers);
109 ceph_crypto_key_destroy(&th->session_key);
110 if (th->ticket_blob)
111 ceph_buffer_put(th->ticket_blob);
112 kfree(th);
113}
114
115static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
116 struct ceph_crypto_key *secret,
117 void *buf, void *end)
118{
119 struct ceph_x_info *xi = ac->private;
120 int num;
121 void *p = buf;
122 int ret;
123 char *dbuf;
124 char *ticket_buf;
125 u8 struct_v;
126
127 dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
128 if (!dbuf)
129 return -ENOMEM;
130
131 ret = -ENOMEM;
132 ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
133 GFP_NOFS | GFP_ATOMIC);
134 if (!ticket_buf)
135 goto out_dbuf;
136
137 ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
138 struct_v = ceph_decode_8(&p);
139 if (struct_v != 1)
140 goto bad;
141 num = ceph_decode_32(&p);
142 dout("%d tickets\n", num);
143 while (num--) {
144 int type;
145 u8 struct_v;
146 struct ceph_x_ticket_handler *th;
147 void *dp, *dend;
148 int dlen;
149 char is_enc;
150 struct timespec validity;
151 struct ceph_crypto_key old_key;
152 void *tp, *tpend;
153
154 ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
155
156 type = ceph_decode_32(&p);
157 dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
158
159 struct_v = ceph_decode_8(&p);
160 if (struct_v != 1)
161 goto bad;
162
163 th = get_ticket_handler(ac, type);
164 if (IS_ERR(th)) {
165 ret = PTR_ERR(th);
166 goto out;
167 }
168
169 /* blob for me */
170 dlen = ceph_x_decrypt(secret, &p, end, dbuf,
171 TEMP_TICKET_BUF_LEN);
172 if (dlen <= 0) {
173 ret = dlen;
174 goto out;
175 }
176 dout(" decrypted %d bytes\n", dlen);
177 dend = dbuf + dlen;
178 dp = dbuf;
179
180 struct_v = ceph_decode_8(&dp);
181 if (struct_v != 1)
182 goto bad;
183
184 memcpy(&old_key, &th->session_key, sizeof(old_key));
185 ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
186 if (ret)
187 goto out;
188
189 ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
190 ceph_decode_timespec(&validity, &th->validity);
191 th->expires = get_seconds() + validity.tv_sec;
192 th->renew_after = th->expires - (validity.tv_sec / 4);
193 dout(" expires=%lu renew_after=%lu\n", th->expires,
194 th->renew_after);
195
196 /* ticket blob for service */
197 ceph_decode_8_safe(&p, end, is_enc, bad);
198 tp = ticket_buf;
199 if (is_enc) {
200 /* encrypted */
201 dout(" encrypted ticket\n");
202 dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
203 TEMP_TICKET_BUF_LEN);
204 if (dlen < 0) {
205 ret = dlen;
206 goto out;
207 }
208 dlen = ceph_decode_32(&tp);
209 } else {
210 /* unencrypted */
211 ceph_decode_32_safe(&p, end, dlen, bad);
212 ceph_decode_need(&p, end, dlen, bad);
213 ceph_decode_copy(&p, ticket_buf, dlen);
214 }
215 tpend = tp + dlen;
216 dout(" ticket blob is %d bytes\n", dlen);
217 ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
218 struct_v = ceph_decode_8(&tp);
219 th->secret_id = ceph_decode_64(&tp);
220 ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
221 if (ret)
222 goto out;
223 dout(" got ticket service %d (%s) secret_id %lld len %d\n",
224 type, ceph_entity_type_name(type), th->secret_id,
225 (int)th->ticket_blob->vec.iov_len);
226 xi->have_keys |= th->service;
227 }
228
229 ret = 0;
230out:
231 kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
232out_dbuf:
233 kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
234 return ret;
235
236bad:
237 ret = -EINVAL;
238 goto out;
239}
240
241static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
242 struct ceph_x_ticket_handler *th,
243 struct ceph_x_authorizer *au)
244{
245 int len;
246 struct ceph_x_authorize_a *msg_a;
247 struct ceph_x_authorize_b msg_b;
248 void *p, *end;
249 int ret;
250 int ticket_blob_len =
251 (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
252
253 dout("build_authorizer for %s %p\n",
254 ceph_entity_type_name(th->service), au);
255
256 len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
257 ticket_blob_len + 16;
258 dout(" need len %d\n", len);
259 if (au->buf && au->buf->alloc_len < len) {
260 ceph_buffer_put(au->buf);
261 au->buf = NULL;
262 }
263 if (!au->buf) {
264 au->buf = ceph_buffer_new(len, GFP_NOFS);
265 if (!au->buf)
266 return -ENOMEM;
267 }
268 au->service = th->service;
269
270 msg_a = au->buf->vec.iov_base;
271 msg_a->struct_v = 1;
272 msg_a->global_id = cpu_to_le64(ac->global_id);
273 msg_a->service_id = cpu_to_le32(th->service);
274 msg_a->ticket_blob.struct_v = 1;
275 msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
276 msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
277 if (ticket_blob_len) {
278 memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
279 th->ticket_blob->vec.iov_len);
280 }
281 dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
282 le64_to_cpu(msg_a->ticket_blob.secret_id));
283
284 p = msg_a + 1;
285 p += ticket_blob_len;
286 end = au->buf->vec.iov_base + au->buf->vec.iov_len;
287
288 get_random_bytes(&au->nonce, sizeof(au->nonce));
289 msg_b.struct_v = 1;
290 msg_b.nonce = cpu_to_le64(au->nonce);
291 ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
292 p, end - p);
293 if (ret < 0)
294 goto out_buf;
295 p += ret;
296 au->buf->vec.iov_len = p - au->buf->vec.iov_base;
297 dout(" built authorizer nonce %llx len %d\n", au->nonce,
298 (int)au->buf->vec.iov_len);
299 return 0;
300
301out_buf:
302 ceph_buffer_put(au->buf);
303 au->buf = NULL;
304 return ret;
305}
306
307static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
308 void **p, void *end)
309{
310 ceph_decode_need(p, end, 1 + sizeof(u64), bad);
311 ceph_encode_8(p, 1);
312 ceph_encode_64(p, th->secret_id);
313 if (th->ticket_blob) {
314 const char *buf = th->ticket_blob->vec.iov_base;
315 u32 len = th->ticket_blob->vec.iov_len;
316
317 ceph_encode_32_safe(p, end, len, bad);
318 ceph_encode_copy_safe(p, end, buf, len, bad);
319 } else {
320 ceph_encode_32_safe(p, end, 0, bad);
321 }
322
323 return 0;
324bad:
325 return -ERANGE;
326}
327
328static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
329{
330 int want = ac->want_keys;
331 struct ceph_x_info *xi = ac->private;
332 int service;
333
334 *pneed = ac->want_keys & ~(xi->have_keys);
335
336 for (service = 1; service <= want; service <<= 1) {
337 struct ceph_x_ticket_handler *th;
338
339 if (!(ac->want_keys & service))
340 continue;
341
342 if (*pneed & service)
343 continue;
344
345 th = get_ticket_handler(ac, service);
346
347 if (!th) {
348 *pneed |= service;
349 continue;
350 }
351
352 if (get_seconds() >= th->renew_after)
353 *pneed |= service;
354 if (get_seconds() >= th->expires)
355 xi->have_keys &= ~service;
356 }
357}
358
359
360static int ceph_x_build_request(struct ceph_auth_client *ac,
361 void *buf, void *end)
362{
363 struct ceph_x_info *xi = ac->private;
364 int need;
365 struct ceph_x_request_header *head = buf;
366 int ret;
367 struct ceph_x_ticket_handler *th =
368 get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
369
370 ceph_x_validate_tickets(ac, &need);
371
372 dout("build_request want %x have %x need %x\n",
373 ac->want_keys, xi->have_keys, need);
374
375 if (need & CEPH_ENTITY_TYPE_AUTH) {
376 struct ceph_x_authenticate *auth = (void *)(head + 1);
377 void *p = auth + 1;
378 struct ceph_x_challenge_blob tmp;
379 char tmp_enc[40];
380 u64 *u;
381
382 if (p > end)
383 return -ERANGE;
384
385 dout(" get_auth_session_key\n");
386 head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
387
388 /* encrypt and hash */
389 get_random_bytes(&auth->client_challenge, sizeof(u64));
390 tmp.client_challenge = auth->client_challenge;
391 tmp.server_challenge = cpu_to_le64(xi->server_challenge);
392 ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
393 tmp_enc, sizeof(tmp_enc));
394 if (ret < 0)
395 return ret;
396
397 auth->struct_v = 1;
398 auth->key = 0;
399 for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
400 auth->key ^= *u;
401 dout(" server_challenge %llx client_challenge %llx key %llx\n",
402 xi->server_challenge, le64_to_cpu(auth->client_challenge),
403 le64_to_cpu(auth->key));
404
405 /* now encode the old ticket, if one exists */
406 ret = ceph_x_encode_ticket(th, &p, end);
407 if (ret < 0)
408 return ret;
409
410 return p - buf;
411 }
412
413 if (need) {
414 void *p = head + 1;
415 struct ceph_x_service_ticket_request *req;
416
417 if (p > end)
418 return -ERANGE;
419 head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
420
421 BUG_ON(!th);
422 ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
423 if (ret)
424 return ret;
425 ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
426 xi->auth_authorizer.buf->vec.iov_len);
427
428 req = p;
429 req->keys = cpu_to_le32(need);
430 p += sizeof(*req);
431 return p - buf;
432 }
433
434 return 0;
435}
436
437static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
438 void *buf, void *end)
439{
440 struct ceph_x_info *xi = ac->private;
441 struct ceph_x_reply_header *head = buf;
442 struct ceph_x_ticket_handler *th;
443 int len = end - buf;
444 int op;
445 int ret;
446
447 if (result)
448 return result; /* XXX hmm? */
449
450 if (xi->starting) {
451 /* it's a hello */
452 struct ceph_x_server_challenge *sc = buf;
453
454 if (len != sizeof(*sc))
455 return -EINVAL;
456 xi->server_challenge = le64_to_cpu(sc->server_challenge);
457 dout("handle_reply got server challenge %llx\n",
458 xi->server_challenge);
459 xi->starting = false;
460 xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
461 return -EAGAIN;
462 }
463
464 op = le32_to_cpu(head->op);
465 result = le32_to_cpu(head->result);
466 dout("handle_reply op %d result %d\n", op, result);
467 switch (op) {
468 case CEPHX_GET_AUTH_SESSION_KEY:
469 /* verify auth key */
470 ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
471 buf + sizeof(*head), end);
472 break;
473
474 case CEPHX_GET_PRINCIPAL_SESSION_KEY:
475 th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
476 BUG_ON(!th);
477 ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
478 buf + sizeof(*head), end);
479 break;
480
481 default:
482 return -EINVAL;
483 }
484 if (ret)
485 return ret;
486 if (ac->want_keys == xi->have_keys)
487 return 0;
488 return -EAGAIN;
489}
490
491static int ceph_x_create_authorizer(
492 struct ceph_auth_client *ac, int peer_type,
493 struct ceph_authorizer **a,
494 void **buf, size_t *len,
495 void **reply_buf, size_t *reply_len)
496{
497 struct ceph_x_authorizer *au;
498 struct ceph_x_ticket_handler *th;
499 int ret;
500
501 th = get_ticket_handler(ac, peer_type);
502 if (IS_ERR(th))
503 return PTR_ERR(th);
504
505 au = kzalloc(sizeof(*au), GFP_NOFS);
506 if (!au)
507 return -ENOMEM;
508
509 ret = ceph_x_build_authorizer(ac, th, au);
510 if (ret) {
511 kfree(au);
512 return ret;
513 }
514
515 *a = (struct ceph_authorizer *)au;
516 *buf = au->buf->vec.iov_base;
517 *len = au->buf->vec.iov_len;
518 *reply_buf = au->reply_buf;
519 *reply_len = sizeof(au->reply_buf);
520 return 0;
521}
522
523static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
524 struct ceph_authorizer *a, size_t len)
525{
526 struct ceph_x_authorizer *au = (void *)a;
527 struct ceph_x_ticket_handler *th;
528 int ret = 0;
529 struct ceph_x_authorize_reply reply;
530 void *p = au->reply_buf;
531 void *end = p + sizeof(au->reply_buf);
532
533 th = get_ticket_handler(ac, au->service);
534 if (!th)
535 return -EIO; /* hrm! */
536 ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
537 if (ret < 0)
538 return ret;
539 if (ret != sizeof(reply))
540 return -EPERM;
541
542 if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
543 ret = -EPERM;
544 else
545 ret = 0;
546 dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
547 au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);
548 return ret;
549}
550
551static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
552 struct ceph_authorizer *a)
553{
554 struct ceph_x_authorizer *au = (void *)a;
555
556 ceph_buffer_put(au->buf);
557 kfree(au);
558}
559
560
561static void ceph_x_reset(struct ceph_auth_client *ac)
562{
563 struct ceph_x_info *xi = ac->private;
564
565 dout("reset\n");
566 xi->starting = true;
567 xi->server_challenge = 0;
568}
569
570static void ceph_x_destroy(struct ceph_auth_client *ac)
571{
572 struct ceph_x_info *xi = ac->private;
573 struct rb_node *p;
574
575 dout("ceph_x_destroy %p\n", ac);
576 ceph_crypto_key_destroy(&xi->secret);
577
578 while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
579 struct ceph_x_ticket_handler *th =
580 rb_entry(p, struct ceph_x_ticket_handler, node);
581 remove_ticket_handler(ac, th);
582 }
583
584 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
585
586 kfree(ac->private);
587 ac->private = NULL;
588}
589
590static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
591 int peer_type)
592{
593 struct ceph_x_ticket_handler *th;
594
595 th = get_ticket_handler(ac, peer_type);
596 if (th && !IS_ERR(th))
597 remove_ticket_handler(ac, th);
598}
599
600
601static const struct ceph_auth_client_ops ceph_x_ops = {
602 .is_authenticated = ceph_x_is_authenticated,
603 .build_request = ceph_x_build_request,
604 .handle_reply = ceph_x_handle_reply,
605 .create_authorizer = ceph_x_create_authorizer,
606 .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
607 .destroy_authorizer = ceph_x_destroy_authorizer,
608 .invalidate_authorizer = ceph_x_invalidate_authorizer,
609 .reset = ceph_x_reset,
610 .destroy = ceph_x_destroy,
611};
612
613
614int ceph_x_init(struct ceph_auth_client *ac)
615{
616 struct ceph_x_info *xi;
617 int ret;
618
619 dout("ceph_x_init %p\n", ac);
620 xi = kzalloc(sizeof(*xi), GFP_NOFS);
621 if (!xi)
622 return -ENOMEM;
623
624 ret = -ENOMEM;
625 ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
626 TEMP_TICKET_BUF_LEN, 8,
627 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
628 NULL);
629 if (!ceph_x_ticketbuf_cachep)
630 goto done_nomem;
631 ret = -EINVAL;
632 if (!ac->secret) {
633 pr_err("no secret set (for auth_x protocol)\n");
634 goto done_nomem;
635 }
636
637 ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
638 if (ret)
639 goto done_nomem;
640
641 xi->starting = true;
642 xi->ticket_handlers = RB_ROOT;
643
644 ac->protocol = CEPH_AUTH_CEPHX;
645 ac->private = xi;
646 ac->ops = &ceph_x_ops;
647 return 0;
648
649done_nomem:
650 kfree(xi);
651 if (ceph_x_ticketbuf_cachep)
652 kmem_cache_destroy(ceph_x_ticketbuf_cachep);
653 return ret;
654}
655
656
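
Both back ends export an init function that installs their ops table and private state on the ceph_auth_client. A plausible sketch of the dispatch the generic layer performs once a protocol is settled on (the function name here is hypothetical; the real switch presumably lives in auth.c):

/* Hedged sketch: protocol selection in the generic layer. */
static int example_auth_init_protocol(struct ceph_auth_client *ac,
				      int protocol)
{
	switch (protocol) {
	case CEPH_AUTH_NONE:
		return ceph_auth_none_init(ac);
	case CEPH_AUTH_CEPHX:
		return ceph_x_init(ac);
	default:
		return -ENOENT;
	}
}
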
diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 000000000000..ff6f8180e681
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
1#ifndef _FS_CEPH_AUTH_X_H
2#define _FS_CEPH_AUTH_X_H
3
4#include <linux/rbtree.h>
5
6#include "crypto.h"
7#include "auth.h"
8#include "auth_x_protocol.h"
9
10/*
11 * Handle ticket for a single service.
12 */
13struct ceph_x_ticket_handler {
14 struct rb_node node;
15 unsigned service;
16
17 struct ceph_crypto_key session_key;
18 struct ceph_timespec validity;
19
20 u64 secret_id;
21 struct ceph_buffer *ticket_blob;
22
23 unsigned long renew_after, expires;
24};
25
26
27struct ceph_x_authorizer {
28 struct ceph_buffer *buf;
29 unsigned service;
30 u64 nonce;
31 char reply_buf[128]; /* big enough for encrypted blob */
32};
33
34struct ceph_x_info {
35 struct ceph_crypto_key secret;
36
37 bool starting;
38 u64 server_challenge;
39
40 unsigned have_keys;
41 struct rb_root ticket_handlers;
42
43 struct ceph_x_authorizer auth_authorizer;
44};
45
46extern int ceph_x_init(struct ceph_auth_client *ac);
47
48#endif
49
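
The renew_after/expires pair above is filled in by ceph_x_proc_ticket_reply(): a ticket expires at now + validity and is proactively renewed once a quarter of the validity window remains. A worked illustration, assuming a 3600-second validity:

/* Illustration only (not in the patch): the renewal arithmetic from
 * ceph_x_proc_ticket_reply(), with an assumed 3600s validity. */
static void example_ticket_window(struct ceph_x_ticket_handler *th)
{
	unsigned long validity = 3600;			/* seconds */

	th->expires = get_seconds() + validity;		/* t + 3600 */
	th->renew_after = th->expires - validity / 4;	/* t + 2700 */
	/* ceph_x_validate_tickets() re-requests this service's ticket
	 * once get_seconds() >= renew_after, i.e. with 900s to spare */
}
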
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 000000000000..671d30576c4f
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
1#ifndef __FS_CEPH_AUTH_X_PROTOCOL
2#define __FS_CEPH_AUTH_X_PROTOCOL
3
4#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
5#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
6#define CEPHX_GET_ROTATING_KEY 0x0400
7
8/* common bits */
9struct ceph_x_ticket_blob {
10 __u8 struct_v;
11 __le64 secret_id;
12 __le32 blob_len;
13 char blob[];
14} __attribute__ ((packed));
15
16
17/* common request/reply headers */
18struct ceph_x_request_header {
19 __le16 op;
20} __attribute__ ((packed));
21
22struct ceph_x_reply_header {
23 __le16 op;
24 __le32 result;
25} __attribute__ ((packed));
26
27
28/* authenticate handshake */
29
30/* initial hello (no reply header) */
31struct ceph_x_server_challenge {
32 __u8 struct_v;
33 __le64 server_challenge;
34} __attribute__ ((packed));
35
36struct ceph_x_authenticate {
37 __u8 struct_v;
38 __le64 client_challenge;
39 __le64 key;
40 /* ticket blob */
41} __attribute__ ((packed));
42
43struct ceph_x_service_ticket_request {
44 __u8 struct_v;
45 __le32 keys;
46} __attribute__ ((packed));
47
48struct ceph_x_challenge_blob {
49 __le64 server_challenge;
50 __le64 client_challenge;
51} __attribute__ ((packed));
52
53
54
55/* authorize handshake */
56
57/*
58 * The authorizer consists of two pieces:
59 * a - service id, ticket blob
60 * b - encrypted with session key
61 */
62struct ceph_x_authorize_a {
63 __u8 struct_v;
64 __le64 global_id;
65 __le32 service_id;
66 struct ceph_x_ticket_blob ticket_blob;
67} __attribute__ ((packed));
68
69struct ceph_x_authorize_b {
70 __u8 struct_v;
71 __le64 nonce;
72} __attribute__ ((packed));
73
74struct ceph_x_authorize_reply {
75 __u8 struct_v;
76 __le64 nonce_plus_one;
77} __attribute__ ((packed));
78
79
80/*
81 * encryption bundle
82 */
83#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
84
85struct ceph_x_encrypt_header {
86 __u8 struct_v;
87 __le64 magic;
88} __attribute__ ((packed));
89
90#endif
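
For reference, the frame that ceph_x_encrypt() and ceph_x_decrypt() in auth_x.c exchange using this header is a little-endian u32 ciphertext length followed by the ciphertext of (header || payload); decryption rejects anything whose plaintext header is not {struct_v = 1, magic = CEPHX_ENC_MAGIC}. An illustrative layout (this struct does not appear in the patch):

/* Hedged sketch of the on-wire encryption bundle framing. */
struct example_cephx_enc_frame {
	__le32 ciphertext_len;
	/* ciphertext_len bytes of:
	 *   encrypt(struct ceph_x_encrypt_header || payload)
	 * where, on the plaintext side, header.struct_v == 1 and
	 * header.magic == CEPHX_ENC_MAGIC */
	__u8 ciphertext[];
} __attribute__ ((packed));
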
diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 000000000000..b98086c7aeba
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
1
2#include "ceph_debug.h"
3#include "buffer.h"
4#include "decode.h"
5
6struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
7{
8 struct ceph_buffer *b;
9
10 b = kmalloc(sizeof(*b), gfp);
11 if (!b)
12 return NULL;
13
14 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
15 if (b->vec.iov_base) {
16 b->is_vmalloc = false;
17 } else {
18 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
19 if (!b->vec.iov_base) {
20 kfree(b);
21 return NULL;
22 }
23 b->is_vmalloc = true;
24 }
25
26 kref_init(&b->kref);
27 b->alloc_len = len;
28 b->vec.iov_len = len;
29 dout("buffer_new %p\n", b);
30 return b;
31}
32
33void ceph_buffer_release(struct kref *kref)
34{
35 struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
36
37 dout("buffer_release %p\n", b);
38 if (b->vec.iov_base) {
39 if (b->is_vmalloc)
40 vfree(b->vec.iov_base);
41 else
42 kfree(b->vec.iov_base);
43 }
44 kfree(b);
45}
46
47int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
48{
49 b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
50 if (b->vec.iov_base) {
51 b->is_vmalloc = false;
52 } else {
53 b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
54 b->is_vmalloc = true;
55 }
56 if (!b->vec.iov_base)
57 return -ENOMEM;
58 b->alloc_len = len;
59 b->vec.iov_len = len;
60 return 0;
61}
62
63int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
64{
65 size_t len;
66
67 ceph_decode_need(p, end, sizeof(u32), bad);
68 len = ceph_decode_32(p);
69 dout("decode_buffer len %d\n", (int)len);
70 ceph_decode_need(p, end, len, bad);
71 *b = ceph_buffer_new(len, GFP_NOFS);
72 if (!*b)
73 return -ENOMEM;
74 ceph_decode_copy(p, (*b)->vec.iov_base, len);
75 return 0;
76bad:
77 return -EINVAL;
78}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 000000000000..58d19014068f
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
1#ifndef __FS_CEPH_BUFFER_H
2#define __FS_CEPH_BUFFER_H
3
4#include <linux/kref.h>
5#include <linux/mm.h>
6#include <linux/vmalloc.h>
7#include <linux/types.h>
8#include <linux/uio.h>
9
10/*
11 * a simple reference counted buffer.
12 *
13 * use kmalloc when possible, falling back to vmalloc when the
14 * allocation is large or kmalloc fails.
15 */
16struct ceph_buffer {
17 struct kref kref;
18 struct kvec vec;
19 size_t alloc_len;
20 bool is_vmalloc;
21};
22
23extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
24extern void ceph_buffer_release(struct kref *kref);
25
26static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
27{
28 kref_get(&b->kref);
29 return b;
30}
31
32static inline void ceph_buffer_put(struct ceph_buffer *b)
33{
34 kref_put(&b->kref, ceph_buffer_release);
35}
36
37extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);
38
39#endif
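
A brief usage sketch of the buffer API above: ceph_buffer_new() returns a buffer holding one reference, kmalloc-backed when possible and vmalloc-backed otherwise, and ceph_buffer_put() drops that reference through ceph_buffer_release(). The caller below is hypothetical:

/* Hedged usage sketch. */
static int example_copy_blob(const void *data, size_t len)
{
	struct ceph_buffer *b = ceph_buffer_new(len, GFP_NOFS);

	if (!b)
		return -ENOMEM;
	memcpy(b->vec.iov_base, data, len);
	/* ... hand b to a consumer that takes ceph_buffer_get(b) ... */
	ceph_buffer_put(b);	/* drop our initial reference */
	return 0;
}
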
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 000000000000..db122bb357b8
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2927 @@
1#include "ceph_debug.h"
2
3#include <linux/fs.h>
4#include <linux/kernel.h>
5#include <linux/sched.h>
6#include <linux/vmalloc.h>
7#include <linux/wait.h>
8#include <linux/writeback.h>
9
10#include "super.h"
11#include "decode.h"
12#include "messenger.h"
13
14/*
15 * Capability management
16 *
17 * The Ceph metadata servers control client access to inode metadata
18 * and file data by issuing capabilities, granting clients permission
19 * to read and/or write both inode fields and file data to OSDs
20 * (storage nodes). Each capability consists of a set of bits
21 * indicating which operations are allowed.
22 *
23 * If the client holds a *_SHARED cap, the client has a coherent value
24 * that can be safely read from the cached inode.
25 *
26 * In the case of *_EXCL (exclusive) or FILE_WR capabilities, the
27 * client is allowed to change inode attributes (e.g., file size,
28 * mtime), note its dirty state in the ceph_cap, and asynchronously
29 * flush that metadata change to the MDS.
30 *
31 * In the event of a conflicting operation (perhaps by another
32 * client), the MDS will revoke the conflicting client capabilities.
33 *
34 * In order for a client to cache an inode, it must hold a capability
35 * issued by at least one MDS server. When inodes are released, release
36 * notifications are batched and periodically sent en masse to the MDS
37 * cluster to release server state.
38 */
39
40
41/*
42 * Generate readable cap strings for debugging output.
43 */
44#define MAX_CAP_STR 20
45static char cap_str[MAX_CAP_STR][40];
46static DEFINE_SPINLOCK(cap_str_lock);
47static int last_cap_str;
48
49static char *gcap_string(char *s, int c)
50{
51 if (c & CEPH_CAP_GSHARED)
52 *s++ = 's';
53 if (c & CEPH_CAP_GEXCL)
54 *s++ = 'x';
55 if (c & CEPH_CAP_GCACHE)
56 *s++ = 'c';
57 if (c & CEPH_CAP_GRD)
58 *s++ = 'r';
59 if (c & CEPH_CAP_GWR)
60 *s++ = 'w';
61 if (c & CEPH_CAP_GBUFFER)
62 *s++ = 'b';
63 if (c & CEPH_CAP_GLAZYIO)
64 *s++ = 'l';
65 return s;
66}
67
68const char *ceph_cap_string(int caps)
69{
70 int i;
71 char *s;
72 int c;
73
74 spin_lock(&cap_str_lock);
75 i = last_cap_str++;
76 if (last_cap_str == MAX_CAP_STR)
77 last_cap_str = 0;
78 spin_unlock(&cap_str_lock);
79
80 s = cap_str[i];
81
82 if (caps & CEPH_CAP_PIN)
83 *s++ = 'p';
84
85 c = (caps >> CEPH_CAP_SAUTH) & 3;
86 if (c) {
87 *s++ = 'A';
88 s = gcap_string(s, c);
89 }
90
91 c = (caps >> CEPH_CAP_SLINK) & 3;
92 if (c) {
93 *s++ = 'L';
94 s = gcap_string(s, c);
95 }
96
97 c = (caps >> CEPH_CAP_SXATTR) & 3;
98 if (c) {
99 *s++ = 'X';
100 s = gcap_string(s, c);
101 }
102
103 c = caps >> CEPH_CAP_SFILE;
104 if (c) {
105 *s++ = 'F';
106 s = gcap_string(s, c);
107 }
108
109 if (s == cap_str[i])
110 *s++ = '-';
111 *s = 0;
112 return cap_str[i];
113}
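/*
 * Annotation (not in the original patch), a worked example of the
 * encoding above: for caps = CEPH_CAP_PIN plus the file section's
 * GSHARED and GRD bits, the loop emits 'p', then 'F' followed by
 * gcap_string()'s 's' and 'r', yielding "pFsr". A cap word with no
 * bits set prints as "-".
 */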
114
115/*
116 * Cap reservations
117 *
118 * Maintain a global pool of preallocated struct ceph_cap objects,
119 * referenced by struct ceph_cap_reservation. This ensures we preallocate the
120 * memory needed to successfully process an MDS response. (If an MDS
121 * sends us cap information and we fail to process it, we will have
122 * problems due to the client and MDS being out of sync.)
123 *
124 * Reservations are 'owned' by a ceph_cap_reservation context.
125 */
126static spinlock_t caps_list_lock;
127static struct list_head caps_list; /* unused (reserved or unreserved) */
128static int caps_total_count; /* total caps allocated */
129static int caps_use_count; /* in use */
130static int caps_reserve_count; /* unused, reserved */
131static int caps_avail_count; /* unused, unreserved */
132static int caps_min_count; /* keep at least this many (unreserved) */
133
134void __init ceph_caps_init(void)
135{
136 INIT_LIST_HEAD(&caps_list);
137 spin_lock_init(&caps_list_lock);
138}
139
140void ceph_caps_finalize(void)
141{
142 struct ceph_cap *cap;
143
144 spin_lock(&caps_list_lock);
145 while (!list_empty(&caps_list)) {
146 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
147 list_del(&cap->caps_item);
148 kmem_cache_free(ceph_cap_cachep, cap);
149 }
150 caps_total_count = 0;
151 caps_avail_count = 0;
152 caps_use_count = 0;
153 caps_reserve_count = 0;
154 caps_min_count = 0;
155 spin_unlock(&caps_list_lock);
156}
157
158void ceph_adjust_min_caps(int delta)
159{
160 spin_lock(&caps_list_lock);
161 caps_min_count += delta;
162 BUG_ON(caps_min_count < 0);
163 spin_unlock(&caps_list_lock);
164}
165
166int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
167{
168 int i;
169 struct ceph_cap *cap;
170 int have;
171 int alloc = 0;
172 LIST_HEAD(newcaps);
173 int ret = 0;
174
175 dout("reserve caps ctx=%p need=%d\n", ctx, need);
176
177 /* first reserve any caps that are already allocated */
178 spin_lock(&caps_list_lock);
179 if (caps_avail_count >= need)
180 have = need;
181 else
182 have = caps_avail_count;
183 caps_avail_count -= have;
184 caps_reserve_count += have;
185 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
186 caps_avail_count);
187 spin_unlock(&caps_list_lock);
188
189 for (i = have; i < need; i++) {
190 cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
191 if (!cap) {
192 ret = -ENOMEM;
193 goto out_alloc_count;
194 }
195 list_add(&cap->caps_item, &newcaps);
196 alloc++;
197 }
198 BUG_ON(have + alloc != need);
199
200 spin_lock(&caps_list_lock);
201 caps_total_count += alloc;
202 caps_reserve_count += alloc;
203 list_splice(&newcaps, &caps_list);
204
205 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
206 caps_avail_count);
207 spin_unlock(&caps_list_lock);
208
209 ctx->count = need;
210 dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
211 ctx, caps_total_count, caps_use_count, caps_reserve_count,
212 caps_avail_count);
213 return 0;
214
215out_alloc_count:
216 /* we didn't manage to reserve as much as we needed */
217 pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
218 ctx, need, have);
219 return ret;
220}
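/*
 * Annotation (not in the original patch): the accounting invariant
 * asserted above and throughout this file is
 *
 *	caps_total_count == caps_use_count + caps_reserve_count
 *					   + caps_avail_count
 *
 * e.g. reserving need=5 with 3 unreserved caps available moves those 3
 * from avail to reserve, allocates 2 more, and grows the total by
 * exactly the 2 new allocations.
 */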
221
222int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
223{
224 dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
225 if (ctx->count) {
226 spin_lock(&caps_list_lock);
227 BUG_ON(caps_reserve_count < ctx->count);
228 caps_reserve_count -= ctx->count;
229 caps_avail_count += ctx->count;
230 ctx->count = 0;
231 dout("unreserve caps %d = %d used + %d resv + %d avail\n",
232 caps_total_count, caps_use_count, caps_reserve_count,
233 caps_avail_count);
234 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
235 caps_avail_count);
236 spin_unlock(&caps_list_lock);
237 }
238 return 0;
239}
240
241static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
242{
243 struct ceph_cap *cap = NULL;
244
245 /* temporary, until we do something about cap import/export */
246 if (!ctx)
247 return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
248
249 spin_lock(&caps_list_lock);
250 dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
251 ctx, ctx->count, caps_total_count, caps_use_count,
252 caps_reserve_count, caps_avail_count);
253 BUG_ON(!ctx->count);
254 BUG_ON(ctx->count > caps_reserve_count);
255 BUG_ON(list_empty(&caps_list));
256
257 ctx->count--;
258 caps_reserve_count--;
259 caps_use_count++;
260
261 cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
262 list_del(&cap->caps_item);
263
264 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
265 caps_avail_count);
266 spin_unlock(&caps_list_lock);
267 return cap;
268}
269
270void ceph_put_cap(struct ceph_cap *cap)
271{
272 spin_lock(&caps_list_lock);
273 dout("put_cap %p %d = %d used + %d resv + %d avail\n",
274 cap, caps_total_count, caps_use_count,
275 caps_reserve_count, caps_avail_count);
276 caps_use_count--;
277 /*
278 * Keep some preallocated caps around (caps_min_count), to
279 * avoid lots of free/alloc churn.
280 */
281 if (caps_avail_count >= caps_reserve_count + caps_min_count) {
282 caps_total_count--;
283 kmem_cache_free(ceph_cap_cachep, cap);
284 } else {
285 caps_avail_count++;
286 list_add(&cap->caps_item, &caps_list);
287 }
288
289 BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
290 caps_avail_count);
291 spin_unlock(&caps_list_lock);
292}
293
294void ceph_reservation_status(struct ceph_client *client,
295 int *total, int *avail, int *used, int *reserved,
296 int *min)
297{
298 if (total)
299 *total = caps_total_count;
300 if (avail)
301 *avail = caps_avail_count;
302 if (used)
303 *used = caps_use_count;
304 if (reserved)
305 *reserved = caps_reserve_count;
306 if (min)
307 *min = caps_min_count;
308}
309
310/*
311 * Find ceph_cap for given mds, if any.
312 *
313 * Called with i_lock held.
314 */
315static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
316{
317 struct ceph_cap *cap;
318 struct rb_node *n = ci->i_caps.rb_node;
319
320 while (n) {
321 cap = rb_entry(n, struct ceph_cap, ci_node);
322 if (mds < cap->mds)
323 n = n->rb_left;
324 else if (mds > cap->mds)
325 n = n->rb_right;
326 else
327 return cap;
328 }
329 return NULL;
330}
331
332/*
333 * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
334 * -1.
335 */
336static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
337{
338 struct ceph_cap *cap;
339 int mds = -1;
340 struct rb_node *p;
341
342 /* prefer mds with WR|WRBUFFER|EXCL caps */
343 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
344 cap = rb_entry(p, struct ceph_cap, ci_node);
345 mds = cap->mds;
346 if (mseq)
347 *mseq = cap->mseq;
348 if (cap->issued & (CEPH_CAP_FILE_WR |
349 CEPH_CAP_FILE_BUFFER |
350 CEPH_CAP_FILE_EXCL))
351 break;
352 }
353 return mds;
354}
355
356int ceph_get_cap_mds(struct inode *inode)
357{
358 int mds;
359 spin_lock(&inode->i_lock);
360 mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
361 spin_unlock(&inode->i_lock);
362 return mds;
363}
364
365/*
366 * Called under i_lock.
367 */
368static void __insert_cap_node(struct ceph_inode_info *ci,
369 struct ceph_cap *new)
370{
371 struct rb_node **p = &ci->i_caps.rb_node;
372 struct rb_node *parent = NULL;
373 struct ceph_cap *cap = NULL;
374
375 while (*p) {
376 parent = *p;
377 cap = rb_entry(parent, struct ceph_cap, ci_node);
378 if (new->mds < cap->mds)
379 p = &(*p)->rb_left;
380 else if (new->mds > cap->mds)
381 p = &(*p)->rb_right;
382 else
383 BUG();
384 }
385
386 rb_link_node(&new->ci_node, parent, p);
387 rb_insert_color(&new->ci_node, &ci->i_caps);
388}
389
390/*
391 * (re)set cap hold timeouts, which control the delayed release
392 * of unused caps back to the MDS. Should be called on cap use.
393 */
394static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
395 struct ceph_inode_info *ci)
396{
397 struct ceph_mount_args *ma = mdsc->client->mount_args;
398
399 ci->i_hold_caps_min = round_jiffies(jiffies +
400 ma->caps_wanted_delay_min * HZ);
401 ci->i_hold_caps_max = round_jiffies(jiffies +
402 ma->caps_wanted_delay_max * HZ);
403 dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
404 ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
405}
406
407/*
408 * (Re)queue cap at the end of the delayed cap release list.
409 *
410 * If I_FLUSH is set, leave the inode at the front of the list.
411 *
412 * Caller holds i_lock
413 * -> we take mdsc->cap_delay_lock
414 */
415static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
416 struct ceph_inode_info *ci)
417{
418 __cap_set_timeouts(mdsc, ci);
419 dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
420 ci->i_ceph_flags, ci->i_hold_caps_max);
421 if (!mdsc->stopping) {
422 spin_lock(&mdsc->cap_delay_lock);
423 if (!list_empty(&ci->i_cap_delay_list)) {
424 if (ci->i_ceph_flags & CEPH_I_FLUSH)
425 goto no_change;
426 list_del_init(&ci->i_cap_delay_list);
427 }
428 list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
429no_change:
430 spin_unlock(&mdsc->cap_delay_lock);
431 }
432}
433
434/*
435 * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
436 * indicating we should send a cap message to flush dirty metadata
437 * asap, and move to the front of the delayed cap list.
438 */
439static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
440 struct ceph_inode_info *ci)
441{
442 dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
443 spin_lock(&mdsc->cap_delay_lock);
444 ci->i_ceph_flags |= CEPH_I_FLUSH;
445 if (!list_empty(&ci->i_cap_delay_list))
446 list_del_init(&ci->i_cap_delay_list);
447 list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
448 spin_unlock(&mdsc->cap_delay_lock);
449}
450
451/*
452 * Cancel delayed work on cap.
453 *
454 * Caller must hold i_lock.
455 */
456static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
457 struct ceph_inode_info *ci)
458{
459 dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
460 if (list_empty(&ci->i_cap_delay_list))
461 return;
462 spin_lock(&mdsc->cap_delay_lock);
463 list_del_init(&ci->i_cap_delay_list);
464 spin_unlock(&mdsc->cap_delay_lock);
465}
466
467/*
468 * Common issue checks for add_cap, handle_cap_grant.
469 */
470static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
471 unsigned issued)
472{
473 unsigned had = __ceph_caps_issued(ci, NULL);
474
475 /*
476 * Each time we receive FILE_CACHE anew, we increment
477 * i_rdcache_gen.
478 */
479 if ((issued & CEPH_CAP_FILE_CACHE) &&
480 (had & CEPH_CAP_FILE_CACHE) == 0)
481 ci->i_rdcache_gen++;
482
483 /*
484 * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
485 * don't know what happened to this directory while we didn't
486 * have the cap.
487 */
488 if ((issued & CEPH_CAP_FILE_SHARED) &&
489 (had & CEPH_CAP_FILE_SHARED) == 0) {
490 ci->i_shared_gen++;
491 if (S_ISDIR(ci->vfs_inode.i_mode)) {
492 dout(" marking %p NOT complete\n", &ci->vfs_inode);
493 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
494 }
495 }
496}
497
498/*
499 * Add a capability under the given MDS session.
500 *
501 * Caller should hold session snap_rwsem (read) and s_mutex.
502 *
503 * @fmode is the open file mode, if we are opening a file, otherwise
504 * it is < 0. (This is so we can atomically add the cap and add an
505 * open file reference to it.)
506 */
507int ceph_add_cap(struct inode *inode,
508 struct ceph_mds_session *session, u64 cap_id,
509 int fmode, unsigned issued, unsigned wanted,
510 unsigned seq, unsigned mseq, u64 realmino, int flags,
511 struct ceph_cap_reservation *caps_reservation)
512{
513 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
514 struct ceph_inode_info *ci = ceph_inode(inode);
515 struct ceph_cap *new_cap = NULL;
516 struct ceph_cap *cap;
517 int mds = session->s_mds;
518 int actual_wanted;
519
520 dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
521 session->s_mds, cap_id, ceph_cap_string(issued), seq);
522
523 /*
524 * If we are opening the file, include file mode wanted bits
525 * in wanted.
526 */
527 if (fmode >= 0)
528 wanted |= ceph_caps_for_mode(fmode);
529
530retry:
531 spin_lock(&inode->i_lock);
532 cap = __get_cap_for_mds(ci, mds);
533 if (!cap) {
534 if (new_cap) {
535 cap = new_cap;
536 new_cap = NULL;
537 } else {
538 spin_unlock(&inode->i_lock);
539 new_cap = get_cap(caps_reservation);
540 if (new_cap == NULL)
541 return -ENOMEM;
542 goto retry;
543 }
544
545 cap->issued = 0;
546 cap->implemented = 0;
547 cap->mds = mds;
548 cap->mds_wanted = 0;
549
550 cap->ci = ci;
551 __insert_cap_node(ci, cap);
552
553 /* clear out old exporting info? (i.e. on cap import) */
554 if (ci->i_cap_exporting_mds == mds) {
555 ci->i_cap_exporting_issued = 0;
556 ci->i_cap_exporting_mseq = 0;
557 ci->i_cap_exporting_mds = -1;
558 }
559
560 /* add to session cap list */
561 cap->session = session;
562 spin_lock(&session->s_cap_lock);
563 list_add_tail(&cap->session_caps, &session->s_caps);
564 session->s_nr_caps++;
565 spin_unlock(&session->s_cap_lock);
566 }
567
568 if (!ci->i_snap_realm) {
569 /*
570 * add this inode to the appropriate snap realm
571 */
572 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
573 realmino);
574 if (realm) {
575 ceph_get_snap_realm(mdsc, realm);
576 spin_lock(&realm->inodes_with_caps_lock);
577 ci->i_snap_realm = realm;
578 list_add(&ci->i_snap_realm_item,
579 &realm->inodes_with_caps);
580 spin_unlock(&realm->inodes_with_caps_lock);
581 } else {
582 pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
583 realmino);
584 }
585 }
586
587 __check_cap_issue(ci, cap, issued);
588
589 /*
590 * If we are issued caps we don't want, or the mds' wanted
591 * value appears to be off, queue a check so we'll release
592 * later and/or update the mds wanted value.
593 */
594 actual_wanted = __ceph_caps_wanted(ci);
595 if ((wanted & ~actual_wanted) ||
596 (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
597 dout(" issued %s, mds wanted %s, actual %s, queueing\n",
598 ceph_cap_string(issued), ceph_cap_string(wanted),
599 ceph_cap_string(actual_wanted));
600 __cap_delay_requeue(mdsc, ci);
601 }
602
603 if (flags & CEPH_CAP_FLAG_AUTH)
604 ci->i_auth_cap = cap;
605 else if (ci->i_auth_cap == cap)
606 ci->i_auth_cap = NULL;
607
608 dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
609 inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
610 ceph_cap_string(issued|cap->issued), seq, mds);
611 cap->cap_id = cap_id;
612 cap->issued = issued;
613 cap->implemented |= issued;
614 cap->mds_wanted |= wanted;
615 cap->seq = seq;
616 cap->issue_seq = seq;
617 cap->mseq = mseq;
618 cap->cap_gen = session->s_cap_gen;
619
620 if (fmode >= 0)
621 __ceph_get_fmode(ci, fmode);
622 spin_unlock(&inode->i_lock);
623 wake_up(&ci->i_cap_wq);
624 return 0;
625}
626
627/*
628 * Return true if cap has not timed out and belongs to the current
629 * generation of the MDS session (i.e. has not gone 'stale' due to
630 * us losing touch with the mds).
631 */
632static int __cap_is_valid(struct ceph_cap *cap)
633{
634 unsigned long ttl;
635 u32 gen;
636
637 spin_lock(&cap->session->s_cap_lock);
638 gen = cap->session->s_cap_gen;
639 ttl = cap->session->s_cap_ttl;
640 spin_unlock(&cap->session->s_cap_lock);
641
642 if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
643 dout("__cap_is_valid %p cap %p issued %s "
644 "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
645 cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);
646 return 0;
647 }
648
649 return 1;
650}
651
652/*
653 * Return set of valid cap bits issued to us. Note that caps time
654 * out, and may be invalidated in bulk if the client session times out
655 * and session->s_cap_gen is bumped.
656 */
657int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
658{
659 int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
660 struct ceph_cap *cap;
661 struct rb_node *p;
662
663 if (implemented)
664 *implemented = 0;
665 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
666 cap = rb_entry(p, struct ceph_cap, ci_node);
667 if (!__cap_is_valid(cap))
668 continue;
669 dout("__ceph_caps_issued %p cap %p issued %s\n",
670 &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
671 have |= cap->issued;
672 if (implemented)
673 *implemented |= cap->implemented;
674 }
675 return have;
676}
677
678/*
679 * Get cap bits issued by caps other than @ocap
680 */
681int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
682{
683 int have = ci->i_snap_caps;
684 struct ceph_cap *cap;
685 struct rb_node *p;
686
687 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
688 cap = rb_entry(p, struct ceph_cap, ci_node);
689 if (cap == ocap)
690 continue;
691 if (!__cap_is_valid(cap))
692 continue;
693 have |= cap->issued;
694 }
695 return have;
696}
697
698/*
699 * Move a cap to the end of the LRU (oldest caps at list head, newest
700 * at list tail).
701 */
702static void __touch_cap(struct ceph_cap *cap)
703{
704 struct ceph_mds_session *s = cap->session;
705
706 spin_lock(&s->s_cap_lock);
707 if (s->s_cap_iterator == NULL) {
708 dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
709 s->s_mds);
710 list_move_tail(&cap->session_caps, &s->s_caps);
711 } else {
712 dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
713 &cap->ci->vfs_inode, cap, s->s_mds);
714 }
715 spin_unlock(&s->s_cap_lock);
716}
717
718/*
719 * Check if we hold the given mask. If so, move the cap(s) to the
720 * most-recently-used end of their respective LRUs. (This is the
721 * preferred way for callers to check for caps they want.)
722 */
723int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
724{
725 struct ceph_cap *cap;
726 struct rb_node *p;
727 int have = ci->i_snap_caps;
728
729 if ((have & mask) == mask) {
730 dout("__ceph_caps_issued_mask %p snap issued %s"
731 " (mask %s)\n", &ci->vfs_inode,
732 ceph_cap_string(have),
733 ceph_cap_string(mask));
734 return 1;
735 }
736
737 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
738 cap = rb_entry(p, struct ceph_cap, ci_node);
739 if (!__cap_is_valid(cap))
740 continue;
741 if ((cap->issued & mask) == mask) {
742 dout("__ceph_caps_issued_mask %p cap %p issued %s"
743 " (mask %s)\n", &ci->vfs_inode, cap,
744 ceph_cap_string(cap->issued),
745 ceph_cap_string(mask));
746 if (touch)
747 __touch_cap(cap);
748 return 1;
749 }
750
751 /* does a combination of caps satisfy mask? */
752 have |= cap->issued;
753 if ((have & mask) == mask) {
754 dout("__ceph_caps_issued_mask %p combo issued %s"
755 " (mask %s)\n", &ci->vfs_inode,
756 ceph_cap_string(cap->issued),
757 ceph_cap_string(mask));
758 if (touch) {
759 struct rb_node *q;
760
761 /* touch this + preceding caps */
762 __touch_cap(cap);
763 for (q = rb_first(&ci->i_caps); q != p;
764 q = rb_next(q)) {
765 cap = rb_entry(q, struct ceph_cap,
766 ci_node);
767 if (!__cap_is_valid(cap))
768 continue;
769 __touch_cap(cap);
770 }
771 }
772 return 1;
773 }
774 }
775
776 return 0;
777}
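/*
 * Annotation (not in the original patch), a worked example: if one cap
 * issues Fr and another issues Fc, a mask of Fr|Fc is satisfied only by
 * the combination. The second branch above accumulates 'have' across
 * caps and, when touch is set, bumps this cap and every preceding cap
 * in the rbtree to the tail of their session LRUs.
 */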
778
779/*
780 * Return true if mask caps are currently being revoked by an MDS.
781 */
782int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
783{
784 struct inode *inode = &ci->vfs_inode;
785 struct ceph_cap *cap;
786 struct rb_node *p;
787 int ret = 0;
788
789 spin_lock(&inode->i_lock);
790 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
791 cap = rb_entry(p, struct ceph_cap, ci_node);
792 if (__cap_is_valid(cap) &&
793 (cap->implemented & ~cap->issued & mask)) {
794 ret = 1;
795 break;
796 }
797 }
798 spin_unlock(&inode->i_lock);
799 dout("ceph_caps_revoking %p %s = %d\n", inode,
800 ceph_cap_string(mask), ret);
801 return ret;
802}
803
804int __ceph_caps_used(struct ceph_inode_info *ci)
805{
806 int used = 0;
807 if (ci->i_pin_ref)
808 used |= CEPH_CAP_PIN;
809 if (ci->i_rd_ref)
810 used |= CEPH_CAP_FILE_RD;
811 if (ci->i_rdcache_ref || ci->i_rdcache_gen)
812 used |= CEPH_CAP_FILE_CACHE;
813 if (ci->i_wr_ref)
814 used |= CEPH_CAP_FILE_WR;
815 if (ci->i_wrbuffer_ref)
816 used |= CEPH_CAP_FILE_BUFFER;
817 return used;
818}
819
820/*
821 * wanted, by virtue of open file modes
822 */
823int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
824{
825 int want = 0;
826 int mode;
827 for (mode = 0; mode < 4; mode++)
828 if (ci->i_nr_by_mode[mode])
829 want |= ceph_caps_for_mode(mode);
830 return want;
831}
832
833/*
834 * Return caps we have registered with the MDS(s) as 'wanted'.
835 */
836int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
837{
838 struct ceph_cap *cap;
839 struct rb_node *p;
840 int mds_wanted = 0;
841
842 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
843 cap = rb_entry(p, struct ceph_cap, ci_node);
844 if (!__cap_is_valid(cap))
845 continue;
846 mds_wanted |= cap->mds_wanted;
847 }
848 return mds_wanted;
849}
850
851/*
852 * called under i_lock
853 */
854static int __ceph_is_any_caps(struct ceph_inode_info *ci)
855{
856 return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
857}
858
859/*
860 * caller should hold i_lock.
861 * caller will not hold session s_mutex if called from destroy_inode.
862 */
863void __ceph_remove_cap(struct ceph_cap *cap)
864{
865 struct ceph_mds_session *session = cap->session;
866 struct ceph_inode_info *ci = cap->ci;
867 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
868
869 dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
870
871 /* remove from inode list */
872 rb_erase(&cap->ci_node, &ci->i_caps);
873 cap->ci = NULL;
874 if (ci->i_auth_cap == cap)
875 ci->i_auth_cap = NULL;
876
877 /* remove from session list */
878 spin_lock(&session->s_cap_lock);
879 if (session->s_cap_iterator == cap) {
880 /* not yet, we are iterating over this very cap */
881 dout("__ceph_remove_cap delaying %p removal from session %p\n",
882 cap, cap->session);
883 } else {
884 list_del_init(&cap->session_caps);
885 session->s_nr_caps--;
886 cap->session = NULL;
887 }
888 spin_unlock(&session->s_cap_lock);
889
890 if (cap->session == NULL)
891 ceph_put_cap(cap);
892
893 if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
894 struct ceph_snap_realm *realm = ci->i_snap_realm;
895 spin_lock(&realm->inodes_with_caps_lock);
896 list_del_init(&ci->i_snap_realm_item);
897 ci->i_snap_realm_counter++;
898 ci->i_snap_realm = NULL;
899 spin_unlock(&realm->inodes_with_caps_lock);
900 ceph_put_snap_realm(mdsc, realm);
901 }
902 if (!__ceph_is_any_real_caps(ci))
903 __cap_delay_cancel(mdsc, ci);
904}
905
906/*
907 * Build and send a cap message to the given MDS.
908 *
909 * Caller should be holding s_mutex.
910 */
911static int send_cap_msg(struct ceph_mds_session *session,
912 u64 ino, u64 cid, int op,
913 int caps, int wanted, int dirty,
914 u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
915 u64 size, u64 max_size,
916 struct timespec *mtime, struct timespec *atime,
917 u64 time_warp_seq,
918 uid_t uid, gid_t gid, mode_t mode,
919 u64 xattr_version,
920 struct ceph_buffer *xattrs_buf,
921 u64 follows)
922{
923 struct ceph_mds_caps *fc;
924 struct ceph_msg *msg;
925
926 dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
927 " seq %u/%u mseq %u follows %lld size %llu/%llu"
928 " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
929 cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
930 ceph_cap_string(dirty),
931 seq, issue_seq, mseq, follows, size, max_size,
932 xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
933
934 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
935 if (IS_ERR(msg))
936 return PTR_ERR(msg);
937
938 msg->hdr.tid = cpu_to_le64(flush_tid);
939
940 fc = msg->front.iov_base;
941 memset(fc, 0, sizeof(*fc));
942
943 fc->cap_id = cpu_to_le64(cid);
944 fc->op = cpu_to_le32(op);
945 fc->seq = cpu_to_le32(seq);
946 fc->issue_seq = cpu_to_le32(issue_seq);
947 fc->migrate_seq = cpu_to_le32(mseq);
948 fc->caps = cpu_to_le32(caps);
949 fc->wanted = cpu_to_le32(wanted);
950 fc->dirty = cpu_to_le32(dirty);
951 fc->ino = cpu_to_le64(ino);
952 fc->snap_follows = cpu_to_le64(follows);
953
954 fc->size = cpu_to_le64(size);
955 fc->max_size = cpu_to_le64(max_size);
956 if (mtime)
957 ceph_encode_timespec(&fc->mtime, mtime);
958 if (atime)
959 ceph_encode_timespec(&fc->atime, atime);
960 fc->time_warp_seq = cpu_to_le32(time_warp_seq);
961
962 fc->uid = cpu_to_le32(uid);
963 fc->gid = cpu_to_le32(gid);
964 fc->mode = cpu_to_le32(mode);
965
966 fc->xattr_version = cpu_to_le64(xattr_version);
967 if (xattrs_buf) {
968 msg->middle = ceph_buffer_get(xattrs_buf);
969 fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
970 msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
971 }
972
973 ceph_con_send(&session->s_con, msg);
974 return 0;
975}
976
977/*
978 * Queue cap releases when an inode is dropped from our cache. Since
979 * inode is about to be destroyed, there is no need for i_lock.
980 */
981void ceph_queue_caps_release(struct inode *inode)
982{
983 struct ceph_inode_info *ci = ceph_inode(inode);
984 struct rb_node *p;
985
986 p = rb_first(&ci->i_caps);
987 while (p) {
988 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
989 struct ceph_mds_session *session = cap->session;
990 struct ceph_msg *msg;
991 struct ceph_mds_cap_release *head;
992 struct ceph_mds_cap_item *item;
993
994 spin_lock(&session->s_cap_lock);
995 BUG_ON(!session->s_num_cap_releases);
996 msg = list_first_entry(&session->s_cap_releases,
997 struct ceph_msg, list_head);
998
999 dout(" adding %p release to mds%d msg %p (%d left)\n",
1000 inode, session->s_mds, msg, session->s_num_cap_releases);
1001
1002 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1003 head = msg->front.iov_base;
1004 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1005 item = msg->front.iov_base + msg->front.iov_len;
1006 item->ino = cpu_to_le64(ceph_ino(inode));
1007 item->cap_id = cpu_to_le64(cap->cap_id);
1008 item->migrate_seq = cpu_to_le32(cap->mseq);
1009 item->seq = cpu_to_le32(cap->issue_seq);
1010
1011 session->s_num_cap_releases--;
1012
1013 msg->front.iov_len += sizeof(*item);
1014 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1015 dout(" release msg %p full\n", msg);
1016 list_move_tail(&msg->list_head,
1017 &session->s_cap_releases_done);
1018 } else {
1019 dout(" release msg %p at %d/%d (%d)\n", msg,
1020 (int)le32_to_cpu(head->num),
1021 (int)CEPH_CAPS_PER_RELEASE,
1022 (int)msg->front.iov_len);
1023 }
1024 spin_unlock(&session->s_cap_lock);
1025 p = rb_next(p);
1026 __ceph_remove_cap(cap);
1027 }
1028}
1029
1030/*
1031 * Send a cap msg on the given inode. Update our caps state, then
1032 * drop i_lock and send the message.
1033 *
1034 * Make note of max_size reported/requested from mds, revoked caps
1035 * that have now been implemented.
1036 *
1037 * Make a half-hearted attempt to invalidate the page cache if we are
1038 * dropping RDCACHE. Note that this will leave behind locked pages
1039 * that we'll then need to deal with elsewhere.
1040 *
1041 * Return non-zero if delayed release, or we experienced an error
1042 * such that the caller should requeue + retry later.
1043 *
1044 * called with i_lock, then drops it.
1045 * caller should hold snap_rwsem (read), s_mutex.
1046 */
1047static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
1048 int op, int used, int want, int retain, int flushing,
1049 unsigned *pflush_tid)
1050 __releases(cap->ci->vfs_inode->i_lock)
1051{
1052 struct ceph_inode_info *ci = cap->ci;
1053 struct inode *inode = &ci->vfs_inode;
1054 u64 cap_id = cap->cap_id;
1055 int held, revoking, dropping, keep;
1056 u64 seq, issue_seq, mseq, time_warp_seq, follows;
1057 u64 size, max_size;
1058 struct timespec mtime, atime;
1059 int wake = 0;
1060 mode_t mode;
1061 uid_t uid;
1062 gid_t gid;
1063 struct ceph_mds_session *session;
1064 u64 xattr_version = 0;
1065 int delayed = 0;
1066 u64 flush_tid = 0;
1067 int i;
1068 int ret;
1069
1070 held = cap->issued | cap->implemented;
1071 revoking = cap->implemented & ~cap->issued;
1072 retain &= ~revoking;
1073 dropping = cap->issued & ~retain;
1074
1075 dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
1076 inode, cap, cap->session,
1077 ceph_cap_string(held), ceph_cap_string(held & retain),
1078 ceph_cap_string(revoking));
1079 BUG_ON((retain & CEPH_CAP_PIN) == 0);
1080
1081 session = cap->session;
1082
1083 /* don't release wanted unless we've waited a bit. */
1084 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1085 time_before(jiffies, ci->i_hold_caps_min)) {
1086 dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
1087 ceph_cap_string(cap->issued),
1088 ceph_cap_string(cap->issued & retain),
1089 ceph_cap_string(cap->mds_wanted),
1090 ceph_cap_string(want));
1091 want |= cap->mds_wanted;
1092 retain |= cap->issued;
1093 delayed = 1;
1094 }
1095 ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
1096
1097 cap->issued &= retain; /* drop bits we don't want */
1098 if (cap->implemented & ~cap->issued) {
1099 /*
1100 * Wake up any waiters on wanted -> needed transition.
1101 * This is due to the weird transition from buffered
1102 * to sync IO... we need to flush dirty pages _before_
1103 * allowing sync writes to avoid reordering.
1104 */
1105 wake = 1;
1106 }
1107 cap->implemented &= cap->issued | used;
1108 cap->mds_wanted = want;
1109
1110 if (flushing) {
1111 /*
1112 * assign a tid for flush operations so we can avoid
1113 * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
1114 * clean type races. track latest tid for every bit
1115 * so we can handle flush AxFw, flush Fw, and have the
1116 * first ack clean Ax.
1117 */
1118 flush_tid = ++ci->i_cap_flush_last_tid;
1119 if (pflush_tid)
1120 *pflush_tid = flush_tid;
1121 dout(" cap_flush_tid %d\n", (int)flush_tid);
1122 for (i = 0; i < CEPH_CAP_BITS; i++)
1123 if (flushing & (1 << i))
1124 ci->i_cap_flush_tid[i] = flush_tid;
1125 }
1126
1127 keep = cap->implemented;
1128 seq = cap->seq;
1129 issue_seq = cap->issue_seq;
1130 mseq = cap->mseq;
1131 size = inode->i_size;
1132 ci->i_reported_size = size;
1133 max_size = ci->i_wanted_max_size;
1134 ci->i_requested_max_size = max_size;
1135 mtime = inode->i_mtime;
1136 atime = inode->i_atime;
1137 time_warp_seq = ci->i_time_warp_seq;
1138 follows = ci->i_snap_realm->cached_context->seq;
1139 uid = inode->i_uid;
1140 gid = inode->i_gid;
1141 mode = inode->i_mode;
1142
1143 if (dropping & CEPH_CAP_XATTR_EXCL) {
1144 __ceph_build_xattrs_blob(ci);
1145 xattr_version = ci->i_xattrs.version + 1;
1146 }
1147
1148 spin_unlock(&inode->i_lock);
1149
1150 ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
1151 op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
1152 size, max_size, &mtime, &atime, time_warp_seq,
1153 uid, gid, mode,
1154 xattr_version,
1155 (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
1156 follows);
1157 if (ret < 0) {
1158 dout("error sending cap msg, must requeue %p\n", inode);
1159 delayed = 1;
1160 }
1161
1162 if (wake)
1163 wake_up(&ci->i_cap_wq);
1164
1165 return delayed;
1166}
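/*
 * Illustrative sketch (userspace, not part of the kernel source): the
 * bitmask algebra at the top of __send_cap() above. CAP_RD/WR/BUFFER
 * are made-up single-bit values, not the real CEPH_CAP_* encodings;
 * the point is how a cap being revoked is carved out of "retain" and
 * how "dropping" falls out of what remains.
 */
#include <stdio.h>

#define CAP_RD     0x1		/* illustrative values only */
#define CAP_WR     0x2
#define CAP_BUFFER 0x4

int main(void)
{
	int issued = CAP_RD | CAP_WR;		/* currently granted */
	int implemented = issued | CAP_BUFFER;	/* BUFFER being revoked */
	int retain = CAP_RD | CAP_BUFFER;	/* what we'd like to keep */

	/* mirrors __send_cap(): never retain a cap that is being revoked */
	int held = issued | implemented;
	int revoking = implemented & ~issued;	/* BUFFER */
	retain &= ~revoking;
	int dropping = issued & ~retain;	/* WR */

	printf("held %#x revoking %#x retain %#x dropping %#x\n",
	       held, revoking, retain, dropping);
	return 0;
}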
1167
1168/*
1169 * When a snapshot is taken, clients accumulate dirty metadata on
1170 * inodes with capabilities in ceph_cap_snaps to describe the file
1171 * state at the time the snapshot was taken. This must be flushed
1172 * asynchronously back to the MDS once sync writes complete and dirty
1173 * data is written out.
1174 *
1175 * Called under i_lock. Takes s_mutex as needed.
1176 */
1177void __ceph_flush_snaps(struct ceph_inode_info *ci,
1178 struct ceph_mds_session **psession)
1179{
1180 struct inode *inode = &ci->vfs_inode;
1181 int mds;
1182 struct ceph_cap_snap *capsnap;
1183 u32 mseq;
1184 struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
1185 struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
1186 session->s_mutex */
1187 u64 next_follows = 0; /* keep track of how far we've gotten through the
1188 i_cap_snaps list, and skip these entries next time
1189 around to avoid an infinite loop */
1190
1191 if (psession)
1192 session = *psession;
1193
1194 dout("__flush_snaps %p\n", inode);
1195retry:
1196 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
1197 /* avoid an infinite loop after retry */
1198 if (capsnap->follows < next_follows)
1199 continue;
1200 /*
1201 * we need to wait for sync writes to complete and for dirty
1202 * pages to be written out.
1203 */
1204 if (capsnap->dirty_pages || capsnap->writing)
1205 continue;
1206
1207 /* pick mds, take s_mutex */
1208 mds = __ceph_get_cap_mds(ci, &mseq);
1209 if (session && session->s_mds != mds) {
1210 dout("oops, wrong session %p mutex\n", session);
1211 mutex_unlock(&session->s_mutex);
1212 ceph_put_mds_session(session);
1213 session = NULL;
1214 }
1215 if (!session) {
1216 spin_unlock(&inode->i_lock);
1217 mutex_lock(&mdsc->mutex);
1218 session = __ceph_lookup_mds_session(mdsc, mds);
1219 mutex_unlock(&mdsc->mutex);
1220 if (session) {
1221 dout("inverting session/ino locks on %p\n",
1222 session);
1223 mutex_lock(&session->s_mutex);
1224 }
1225 /*
1226 * if session == NULL, we raced against a cap
1227 * deletion. retry, and we'll get a better
1228 * @mds value next time.
1229 */
1230 spin_lock(&inode->i_lock);
1231 goto retry;
1232 }
1233
1234 capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
1235 atomic_inc(&capsnap->nref);
1236 if (!list_empty(&capsnap->flushing_item))
1237 list_del_init(&capsnap->flushing_item);
1238 list_add_tail(&capsnap->flushing_item,
1239 &session->s_cap_snaps_flushing);
1240 spin_unlock(&inode->i_lock);
1241
1242 dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
1243 inode, capsnap, next_follows, capsnap->size);
1244 send_cap_msg(session, ceph_vino(inode).ino, 0,
1245 CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
1246 capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
1247 capsnap->size, 0,
1248 &capsnap->mtime, &capsnap->atime,
1249 capsnap->time_warp_seq,
1250 capsnap->uid, capsnap->gid, capsnap->mode,
1251 0, NULL,
1252 capsnap->follows);
1253
1254 next_follows = capsnap->follows + 1;
1255 ceph_put_cap_snap(capsnap);
1256
1257 spin_lock(&inode->i_lock);
1258 goto retry;
1259 }
1260
1261 /* we flushed them all; remove this inode from the queue */
1262 spin_lock(&mdsc->snap_flush_lock);
1263 list_del_init(&ci->i_snap_flush_item);
1264 spin_unlock(&mdsc->snap_flush_lock);
1265
1266 if (psession)
1267 *psession = session;
1268 else if (session) {
1269 mutex_unlock(&session->s_mutex);
1270 ceph_put_mds_session(session);
1271 }
1272}
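/*
 * Illustrative sketch (userspace, pthreads): the lock-ordering dance
 * used in __ceph_flush_snaps() above and in ceph_check_caps() below.
 * The inner lock (the i_lock analogue) must be taken after the session
 * mutex, so when we already hold the inner lock and need the mutex we
 * try-lock first; on failure we drop the inner lock, block on the
 * mutex, keep it, and rescan from the top. Names here are made up.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t session_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t inode_lock = PTHREAD_MUTEX_INITIALIZER;

static void locked_work(void)
{
	int have_mutex = 0;
retry:
	pthread_mutex_lock(&inode_lock);
	/* ... examine state under inode_lock ... */
	if (!have_mutex) {
		if (pthread_mutex_trylock(&session_mutex) != 0) {
			/* invert: drop inner lock, block, then rescan */
			pthread_mutex_unlock(&inode_lock);
			pthread_mutex_lock(&session_mutex);
			have_mutex = 1;
			goto retry;
		}
		have_mutex = 1;
	}
	/* ... both locks held, safe to act with side effects ... */
	pthread_mutex_unlock(&inode_lock);
	pthread_mutex_unlock(&session_mutex);
}

int main(void)
{
	locked_work();
	printf("done\n");
	return 0;
}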
1273
1274static void ceph_flush_snaps(struct ceph_inode_info *ci)
1275{
1276 struct inode *inode = &ci->vfs_inode;
1277
1278 spin_lock(&inode->i_lock);
1279 __ceph_flush_snaps(ci, NULL);
1280 spin_unlock(&inode->i_lock);
1281}
1282
1283/*
1284 * Mark caps dirty. If inode is newly dirty, add to the global dirty
1285 * list.
1286 */
1287void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
1288{
1289 struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
1290 struct inode *inode = &ci->vfs_inode;
1291 int was = ci->i_dirty_caps;
1292 int dirty = 0;
1293
1294 dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
1295 ceph_cap_string(mask), ceph_cap_string(was),
1296 ceph_cap_string(was | mask));
1297 ci->i_dirty_caps |= mask;
1298 if (was == 0) {
1299 dout(" inode %p now dirty\n", &ci->vfs_inode);
1300 BUG_ON(!list_empty(&ci->i_dirty_item));
1301 spin_lock(&mdsc->cap_dirty_lock);
1302 list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
1303 spin_unlock(&mdsc->cap_dirty_lock);
1304 if (ci->i_flushing_caps == 0) {
1305 igrab(inode);
1306 dirty |= I_DIRTY_SYNC;
1307 }
1308 }
1309 BUG_ON(list_empty(&ci->i_dirty_item));
1310 if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
1311 (mask & CEPH_CAP_FILE_BUFFER))
1312 dirty |= I_DIRTY_DATASYNC;
1313 if (dirty)
1314 __mark_inode_dirty(inode, dirty);
1315 __cap_delay_requeue(mdsc, ci);
1316}
1317
1318/*
1319 * Add dirty inode to the flushing list. Assign a seq number so we
1320 * can wait for caps to flush without starving.
1321 *
1322 * Called under i_lock.
1323 */
1324static int __mark_caps_flushing(struct inode *inode,
1325 struct ceph_mds_session *session)
1326{
1327 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1328 struct ceph_inode_info *ci = ceph_inode(inode);
1329 int flushing;
1330
1331 BUG_ON(ci->i_dirty_caps == 0);
1332 BUG_ON(list_empty(&ci->i_dirty_item));
1333
1334 flushing = ci->i_dirty_caps;
1335 dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
1336 ceph_cap_string(flushing),
1337 ceph_cap_string(ci->i_flushing_caps),
1338 ceph_cap_string(ci->i_flushing_caps | flushing));
1339 ci->i_flushing_caps |= flushing;
1340 ci->i_dirty_caps = 0;
1341 dout(" inode %p now !dirty\n", inode);
1342
1343 spin_lock(&mdsc->cap_dirty_lock);
1344 list_del_init(&ci->i_dirty_item);
1345
1346 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1347 if (list_empty(&ci->i_flushing_item)) {
1348 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1349 mdsc->num_cap_flushing++;
1350 dout(" inode %p now flushing seq %lld\n", inode,
1351 ci->i_cap_flush_seq);
1352 } else {
1353 list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1354 dout(" inode %p now flushing (more) seq %lld\n", inode,
1355 ci->i_cap_flush_seq);
1356 }
1357 spin_unlock(&mdsc->cap_dirty_lock);
1358
1359 return flushing;
1360}
1361
1362/*
1363 * try to invalidate mapping pages without blocking.
1364 */
1365static int mapping_is_empty(struct address_space *mapping)
1366{
1367 struct page *page = find_get_page(mapping, 0);
1368
1369 if (!page)
1370 return 1;
1371
1372 put_page(page);
1373 return 0;
1374}
1375
1376static int try_nonblocking_invalidate(struct inode *inode)
1377{
1378 struct ceph_inode_info *ci = ceph_inode(inode);
1379 u32 invalidating_gen = ci->i_rdcache_gen;
1380
1381 spin_unlock(&inode->i_lock);
1382 invalidate_mapping_pages(&inode->i_data, 0, -1);
1383 spin_lock(&inode->i_lock);
1384
1385 if (mapping_is_empty(&inode->i_data) &&
1386 invalidating_gen == ci->i_rdcache_gen) {
1387 /* success. */
1388 dout("try_nonblocking_invalidate %p success\n", inode);
1389 ci->i_rdcache_gen = 0;
1390 ci->i_rdcache_revoking = 0;
1391 return 0;
1392 }
1393 dout("try_nonblocking_invalidate %p failed\n", inode);
1394 return -1;
1395}
1396
1397/*
1398 * Swiss army knife function to examine currently used and wanted
1399 * versus held caps. Release, flush, ack revoked caps to mds as
1400 * appropriate.
1401 *
1402 * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
1403 * cap release further.
1404 * CHECK_CAPS_AUTHONLY - we should only check the auth cap
1405 * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
1406 * further delay.
1407 */
1408void ceph_check_caps(struct ceph_inode_info *ci, int flags,
1409 struct ceph_mds_session *session)
1410{
1411 struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);
1412 struct ceph_mds_client *mdsc = &client->mdsc;
1413 struct inode *inode = &ci->vfs_inode;
1414 struct ceph_cap *cap;
1415 int file_wanted, used;
1416 int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
1417 int drop_session_lock = session ? 0 : 1;
1418 int issued, implemented, want, retain, revoking, flushing = 0;
1419 int mds = -1; /* keep track of how far we've gone through i_caps list
1420 to avoid an infinite loop on retry */
1421 struct rb_node *p;
1422 int tried_invalidate = 0;
1423 int delayed = 0, sent = 0, force_requeue = 0, num;
1424 int queue_invalidate = 0;
1425 int is_delayed = flags & CHECK_CAPS_NODELAY;
1426
1427 /* if we are unmounting, flush any unused caps immediately. */
1428 if (mdsc->stopping)
1429 is_delayed = 1;
1430
1431 spin_lock(&inode->i_lock);
1432
1433 if (ci->i_ceph_flags & CEPH_I_FLUSH)
1434 flags |= CHECK_CAPS_FLUSH;
1435
1436 /* flush snaps first time around only */
1437 if (!list_empty(&ci->i_cap_snaps))
1438 __ceph_flush_snaps(ci, &session);
1439 goto retry_locked;
1440retry:
1441 spin_lock(&inode->i_lock);
1442retry_locked:
1443 file_wanted = __ceph_caps_file_wanted(ci);
1444 used = __ceph_caps_used(ci);
1445 want = file_wanted | used;
1446 issued = __ceph_caps_issued(ci, &implemented);
1447 revoking = implemented & ~issued;
1448
1449 retain = want | CEPH_CAP_PIN;
1450 if (!mdsc->stopping && inode->i_nlink > 0) {
1451 if (want) {
1452 retain |= CEPH_CAP_ANY; /* be greedy */
1453 } else {
1454 retain |= CEPH_CAP_ANY_SHARED;
1455 /*
1456 * keep RD only if we didn't have the file open RW,
1457 * because then the mds would revoke it anyway to
1458 * journal max_size=0.
1459 */
1460 if (ci->i_max_size == 0)
1461 retain |= CEPH_CAP_ANY_RD;
1462 }
1463 }
1464
1465 dout("check_caps %p file_want %s used %s dirty %s flushing %s"
1466 " issued %s revoking %s retain %s %s%s%s\n", inode,
1467 ceph_cap_string(file_wanted),
1468 ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
1469 ceph_cap_string(ci->i_flushing_caps),
1470 ceph_cap_string(issued), ceph_cap_string(revoking),
1471 ceph_cap_string(retain),
1472 (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
1473 (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
1474 (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
1475
1476 /*
1477 * If we no longer need to hold onto our old caps, and we may
1478 * have cached pages that we no longer want, try to invalidate.
1479 * If we fail, it's because pages are locked; try again later.
1480 */
1481 if ((!is_delayed || mdsc->stopping) &&
1482 ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
1483 ci->i_rdcache_gen && /* may have cached pages */
1484 (file_wanted == 0 || /* no open files */
1485 (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
1486 !tried_invalidate) {
1487 dout("check_caps trying to invalidate on %p\n", inode);
1488 if (try_nonblocking_invalidate(inode) < 0) {
1489 if (revoking & CEPH_CAP_FILE_CACHE) {
1490 dout("check_caps queuing invalidate\n");
1491 queue_invalidate = 1;
1492 ci->i_rdcache_revoking = ci->i_rdcache_gen;
1493 } else {
1494 dout("check_caps failed to invalidate pages\n");
1495 /* we failed to invalidate pages. check these
1496 caps again later. */
1497 force_requeue = 1;
1498 __cap_set_timeouts(mdsc, ci);
1499 }
1500 }
1501 tried_invalidate = 1;
1502 goto retry_locked;
1503 }
1504
1505 num = 0;
1506 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
1507 cap = rb_entry(p, struct ceph_cap, ci_node);
1508 num++;
1509
1510 /* avoid looping forever */
1511 if (mds >= cap->mds ||
1512 ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
1513 continue;
1514
1515 /* NOTE: no side-effects allowed, until we take s_mutex */
1516
1517 revoking = cap->implemented & ~cap->issued;
1518 if (revoking)
1519 dout(" mds%d revoking %s\n", cap->mds,
1520 ceph_cap_string(revoking));
1521
1522 if (cap == ci->i_auth_cap &&
1523 (cap->issued & CEPH_CAP_FILE_WR)) {
1524 /* request larger max_size from MDS? */
1525 if (ci->i_wanted_max_size > ci->i_max_size &&
1526 ci->i_wanted_max_size > ci->i_requested_max_size) {
1527 dout("requesting new max_size\n");
1528 goto ack;
1529 }
1530
1531 /* approaching file_max? */
1532 if ((inode->i_size << 1) >= ci->i_max_size &&
1533 (ci->i_reported_size << 1) < ci->i_max_size) {
1534 dout("i_size approaching max_size\n");
1535 goto ack;
1536 }
1537 }
1538 /* flush anything dirty? */
1539 if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
1540 ci->i_dirty_caps) {
1541 dout("flushing dirty caps\n");
1542 goto ack;
1543 }
1544
1545 /* completed revocation? going down and there are no caps? */
1546 if (revoking && (revoking & used) == 0) {
1547 dout("completed revocation of %s\n",
1548 ceph_cap_string(cap->implemented & ~cap->issued));
1549 goto ack;
1550 }
1551
1552 /* want more caps from mds? */
1553 if (want & ~(cap->mds_wanted | cap->issued))
1554 goto ack;
1555
1556 /* things we might delay */
1557 if ((cap->issued & ~retain) == 0 &&
1558 cap->mds_wanted == want)
1559 continue; /* nope, all good */
1560
1561 if (is_delayed)
1562 goto ack;
1563
1564 /* delay? */
1565 if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
1566 time_before(jiffies, ci->i_hold_caps_max)) {
1567 dout(" delaying issued %s -> %s, wanted %s -> %s\n",
1568 ceph_cap_string(cap->issued),
1569 ceph_cap_string(cap->issued & retain),
1570 ceph_cap_string(cap->mds_wanted),
1571 ceph_cap_string(want));
1572 delayed++;
1573 continue;
1574 }
1575
1576ack:
1577 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1578 dout(" skipping %p I_NOFLUSH set\n", inode);
1579 continue;
1580 }
1581
1582 if (session && session != cap->session) {
1583 dout("oops, wrong session %p mutex\n", session);
1584 mutex_unlock(&session->s_mutex);
1585 session = NULL;
1586 }
1587 if (!session) {
1588 session = cap->session;
1589 if (mutex_trylock(&session->s_mutex) == 0) {
1590 dout("inverting session/ino locks on %p\n",
1591 session);
1592 spin_unlock(&inode->i_lock);
1593 if (took_snap_rwsem) {
1594 up_read(&mdsc->snap_rwsem);
1595 took_snap_rwsem = 0;
1596 }
1597 mutex_lock(&session->s_mutex);
1598 goto retry;
1599 }
1600 }
1601 /* take snap_rwsem after session mutex */
1602 if (!took_snap_rwsem) {
1603 if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
1604 dout("inverting snap/in locks on %p\n",
1605 inode);
1606 spin_unlock(&inode->i_lock);
1607 down_read(&mdsc->snap_rwsem);
1608 took_snap_rwsem = 1;
1609 goto retry;
1610 }
1611 took_snap_rwsem = 1;
1612 }
1613
1614 if (cap == ci->i_auth_cap && ci->i_dirty_caps)
1615 flushing = __mark_caps_flushing(inode, session);
1616
1617 mds = cap->mds; /* remember mds, so we don't repeat */
1618 sent++;
1619
1620 /* __send_cap drops i_lock */
1621 delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
1622 retain, flushing, NULL);
1623 goto retry; /* retake i_lock and restart our cap scan. */
1624 }
1625
1626 /*
1627 * Reschedule delayed caps release if we delayed anything,
1628 * otherwise cancel.
1629 */
1630 if (delayed && is_delayed)
1631 force_requeue = 1; /* __send_cap delayed release; requeue */
1632 if (!delayed && !is_delayed)
1633 __cap_delay_cancel(mdsc, ci);
1634 else if (!is_delayed || force_requeue)
1635 __cap_delay_requeue(mdsc, ci);
1636
1637 spin_unlock(&inode->i_lock);
1638
1639 if (queue_invalidate)
1640 ceph_queue_invalidate(inode);
1641
1642 if (session && drop_session_lock)
1643 mutex_unlock(&session->s_mutex);
1644 if (took_snap_rwsem)
1645 up_read(&mdsc->snap_rwsem);
1646}
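/*
 * Illustrative sketch: the "retain" policy from the top of
 * ceph_check_caps() above, reduced to a pure function. The CAP_*
 * constants are stand-ins for the real CEPH_CAP_* masks; "stopping"
 * mimics mdsc->stopping and "max_size" mimics ci->i_max_size.
 */
#include <stdio.h>

#define CAP_PIN        0x001	/* illustrative groupings only */
#define CAP_ANY_RD     0x00e
#define CAP_ANY_SHARED 0x0f0
#define CAP_ANY        0xfff

static int compute_retain(int want, int stopping, int nlink,
			  unsigned long long max_size)
{
	int retain = want | CAP_PIN;

	if (!stopping && nlink > 0) {
		if (want) {
			retain |= CAP_ANY;	/* be greedy */
		} else {
			retain |= CAP_ANY_SHARED;
			if (max_size == 0)	/* never opened RW */
				retain |= CAP_ANY_RD;
		}
	}
	return retain;
}

int main(void)
{
	printf("open-for-read: %#x\n", compute_retain(CAP_ANY_RD, 0, 1, 0));
	printf("idle inode:    %#x\n", compute_retain(0, 0, 1, 0));
	printf("unmounting:    %#x\n", compute_retain(0, 1, 1, 0));
	return 0;
}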
1647
1648/*
1649 * Try to flush dirty caps back to the auth mds.
1650 */
1651static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
1652 unsigned *flush_tid)
1653{
1654 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1655 struct ceph_inode_info *ci = ceph_inode(inode);
1656 int unlock_session = session ? 0 : 1;
1657 int flushing = 0;
1658
1659retry:
1660 spin_lock(&inode->i_lock);
1661 if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
1662 dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
1663 goto out;
1664 }
1665 if (ci->i_dirty_caps && ci->i_auth_cap) {
1666 struct ceph_cap *cap = ci->i_auth_cap;
1667 int used = __ceph_caps_used(ci);
1668 int want = __ceph_caps_wanted(ci);
1669 int delayed;
1670
1671 if (!session) {
1672 spin_unlock(&inode->i_lock);
1673 session = cap->session;
1674 mutex_lock(&session->s_mutex);
1675 goto retry;
1676 }
1677 BUG_ON(session != cap->session);
1678 if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
1679 goto out;
1680
1681 flushing = __mark_caps_flushing(inode, session);
1682
1683 /* __send_cap drops i_lock */
1684 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
1685 cap->issued | cap->implemented, flushing,
1686 flush_tid);
1687 if (!delayed)
1688 goto out_unlocked;
1689
1690 spin_lock(&inode->i_lock);
1691 __cap_delay_requeue(mdsc, ci);
1692 }
1693out:
1694 spin_unlock(&inode->i_lock);
1695out_unlocked:
1696 if (session && unlock_session)
1697 mutex_unlock(&session->s_mutex);
1698 return flushing;
1699}
1700
1701/*
1702 * Return true if we've flushed caps through the given flush_tid.
1703 */
1704static int caps_are_flushed(struct inode *inode, unsigned tid)
1705{
1706 struct ceph_inode_info *ci = ceph_inode(inode);
1707 int dirty, i, ret = 1;
1708
1709 spin_lock(&inode->i_lock);
1710 dirty = __ceph_caps_dirty(ci);
1711 for (i = 0; i < CEPH_CAP_BITS; i++)
1712 if ((ci->i_flushing_caps & (1 << i)) &&
1713 ci->i_cap_flush_tid[i] <= tid) {
1714 /* still flushing this bit */
1715 ret = 0;
1716 break;
1717 }
1718 spin_unlock(&inode->i_lock);
1719 return ret;
1720}
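/*
 * Illustrative sketch: why __send_cap() records a flush tid per cap
 * bit, as checked by caps_are_flushed() above and handle_cap_flush_ack()
 * below. Flush 1 covers bits "A" and "F"; flush 2 re-flushes only "F".
 * The ack for tid 1 cleans only bits still recorded at tid 1 ("A"), so
 * the later flush of "F" is not lost. CAP_BITS and the bit positions
 * are made up.
 */
#include <stdio.h>

#define CAP_BITS 4

int main(void)
{
	unsigned tid_for_bit[CAP_BITS] = { 0 };
	int flushing = 0;
	unsigned last_tid = 0;
	int cleaned = 0;
	unsigned ack_tid;
	int i;

	/* flush 1: bits 0 ("A") and 2 ("F") */
	last_tid++;
	flushing |= (1 << 0) | (1 << 2);
	tid_for_bit[0] = tid_for_bit[2] = last_tid;

	/* flush 2: bit 2 dirtied and flushed again */
	last_tid++;
	tid_for_bit[2] = last_tid;

	/* ack for tid 1: clean only bits whose recorded tid matches */
	ack_tid = 1;
	for (i = 0; i < CAP_BITS; i++)
		if ((flushing & (1 << i)) && tid_for_bit[i] == ack_tid)
			cleaned |= 1 << i;
	flushing &= ~cleaned;

	printf("cleaned %#x, still flushing %#x\n", cleaned, flushing);
	return 0;
}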
1721
1722/*
1723 * Wait on any unsafe replies for the given inode. First wait on the
1724 * newest request, and make that the upper bound. Then, if there are
1725 * more requests, keep waiting on the oldest as long as it is still older
1726 * than the original request.
1727 */
1728static void sync_write_wait(struct inode *inode)
1729{
1730 struct ceph_inode_info *ci = ceph_inode(inode);
1731 struct list_head *head = &ci->i_unsafe_writes;
1732 struct ceph_osd_request *req;
1733 u64 last_tid;
1734
1735 spin_lock(&ci->i_unsafe_lock);
1736 if (list_empty(head))
1737 goto out;
1738
1739 /* set upper bound as _last_ entry in chain */
1740 req = list_entry(head->prev, struct ceph_osd_request,
1741 r_unsafe_item);
1742 last_tid = req->r_tid;
1743
1744 do {
1745 ceph_osdc_get_request(req);
1746 spin_unlock(&ci->i_unsafe_lock);
1747 dout("sync_write_wait on tid %llu (until %llu)\n",
1748 req->r_tid, last_tid);
1749 wait_for_completion(&req->r_safe_completion);
1750 spin_lock(&ci->i_unsafe_lock);
1751 ceph_osdc_put_request(req);
1752
1753 /*
1754 * from here on look at first entry in chain, since we
1755 * only want to wait for anything older than last_tid
1756 */
1757 if (list_empty(head))
1758 break;
1759 req = list_entry(head->next, struct ceph_osd_request,
1760 r_unsafe_item);
1761 } while (req->r_tid < last_tid);
1762out:
1763 spin_unlock(&ci->i_unsafe_lock);
1764}
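/*
 * Illustrative sketch: the bounded-wait pattern in sync_write_wait()
 * above. Take the newest pending tid as an upper bound, wait on it,
 * then keep waiting on the oldest remaining entry while it is still
 * older than that bound; requests queued after we started are left
 * alone. A plain array stands in for ci->i_unsafe_writes, and entries
 * are assumed to drop off the list as they complete in the real code.
 */
#include <stdio.h>

int main(void)
{
	unsigned long long tids[] = { 3, 5, 9 };	/* oldest..newest */
	int n = 3, head = 0;

	unsigned long long last_tid = tids[n - 1];	/* upper bound */
	unsigned long long cur = last_tid;		/* newest first */

	do {
		printf("waiting on tid %llu (until %llu)\n", cur, last_tid);
		/* ...wait_for_completion() would block here... */
		if (head >= n)
			break;
		cur = tids[head++];		/* then oldest remaining */
	} while (cur < last_tid);
	return 0;
}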
1765
1766int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
1767{
1768 struct inode *inode = dentry->d_inode;
1769 struct ceph_inode_info *ci = ceph_inode(inode);
1770 unsigned flush_tid;
1771 int ret;
1772 int dirty;
1773
1774 dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
1775 sync_write_wait(inode);
1776
1777 ret = filemap_write_and_wait(inode->i_mapping);
1778 if (ret < 0)
1779 return ret;
1780
1781 dirty = try_flush_caps(inode, NULL, &flush_tid);
1782 dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
1783
1784 /*
1785 * only wait on non-file metadata writeback (the mds
1786 * can recover size and mtime, so we don't need to
1787 * wait for that)
1788 */
1789 if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
1790 dout("fsync waiting for flush_tid %u\n", flush_tid);
1791 ret = wait_event_interruptible(ci->i_cap_wq,
1792 caps_are_flushed(inode, flush_tid));
1793 }
1794
1795 dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");
1796 return ret;
1797}
1798
1799/*
1800 * Flush any dirty caps back to the mds. If we aren't asked to wait,
1801 * queue inode for flush but don't do so immediately, because we can
1802 * get by with fewer MDS messages if we wait for data writeback to
1803 * complete first.
1804 */
1805int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
1806{
1807 struct ceph_inode_info *ci = ceph_inode(inode);
1808 unsigned flush_tid;
1809 int err = 0;
1810 int dirty;
1811 int wait = wbc->sync_mode == WB_SYNC_ALL;
1812
1813 dout("write_inode %p wait=%d\n", inode, wait);
1814 if (wait) {
1815 dirty = try_flush_caps(inode, NULL, &flush_tid);
1816 if (dirty)
1817 err = wait_event_interruptible(ci->i_cap_wq,
1818 caps_are_flushed(inode, flush_tid));
1819 } else {
1820 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
1821
1822 spin_lock(&inode->i_lock);
1823 if (__ceph_caps_dirty(ci))
1824 __cap_delay_requeue_front(mdsc, ci);
1825 spin_unlock(&inode->i_lock);
1826 }
1827 return err;
1828}
1829
1830/*
1831 * After a recovering MDS goes active, we need to resend any caps
1832 * we were flushing.
1833 *
1834 * Caller holds session->s_mutex.
1835 */
1836static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
1837 struct ceph_mds_session *session)
1838{
1839 struct ceph_cap_snap *capsnap;
1840
1841 dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
1842 list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
1843 flushing_item) {
1844 struct ceph_inode_info *ci = capsnap->ci;
1845 struct inode *inode = &ci->vfs_inode;
1846 struct ceph_cap *cap;
1847
1848 spin_lock(&inode->i_lock);
1849 cap = ci->i_auth_cap;
1850 if (cap && cap->session == session) {
1851 dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
1852 cap, capsnap);
1853 __ceph_flush_snaps(ci, &session);
1854 } else {
1855 pr_err("%p auth cap %p not mds%d ???\n", inode,
1856 cap, session->s_mds);
1857 spin_unlock(&inode->i_lock);
1858 }
1859 }
1860}
1861
1862void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
1863 struct ceph_mds_session *session)
1864{
1865 struct ceph_inode_info *ci;
1866
1867 kick_flushing_capsnaps(mdsc, session);
1868
1869 dout("kick_flushing_caps mds%d\n", session->s_mds);
1870 list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
1871 struct inode *inode = &ci->vfs_inode;
1872 struct ceph_cap *cap;
1873 int delayed = 0;
1874
1875 spin_lock(&inode->i_lock);
1876 cap = ci->i_auth_cap;
1877 if (cap && cap->session == session) {
1878 dout("kick_flushing_caps %p cap %p %s\n", inode,
1879 cap, ceph_cap_string(ci->i_flushing_caps));
1880 delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
1881 __ceph_caps_used(ci),
1882 __ceph_caps_wanted(ci),
1883 cap->issued | cap->implemented,
1884 ci->i_flushing_caps, NULL);
1885 if (delayed) {
1886 spin_lock(&inode->i_lock);
1887 __cap_delay_requeue(mdsc, ci);
1888 spin_unlock(&inode->i_lock);
1889 }
1890 } else {
1891 pr_err("%p auth cap %p not mds%d ???\n", inode,
1892 cap, session->s_mds);
1893 spin_unlock(&inode->i_lock);
1894 }
1895 }
1896}
1897
1898
1899/*
1900 * Take references to capabilities we hold, so that we don't release
1901 * them to the MDS prematurely.
1902 *
1903 * Protected by i_lock.
1904 */
1905static void __take_cap_refs(struct ceph_inode_info *ci, int got)
1906{
1907 if (got & CEPH_CAP_PIN)
1908 ci->i_pin_ref++;
1909 if (got & CEPH_CAP_FILE_RD)
1910 ci->i_rd_ref++;
1911 if (got & CEPH_CAP_FILE_CACHE)
1912 ci->i_rdcache_ref++;
1913 if (got & CEPH_CAP_FILE_WR)
1914 ci->i_wr_ref++;
1915 if (got & CEPH_CAP_FILE_BUFFER) {
1916 if (ci->i_wrbuffer_ref == 0)
1917 igrab(&ci->vfs_inode);
1918 ci->i_wrbuffer_ref++;
1919 dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
1920 &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
1921 }
1922}
1923
1924/*
1925 * Try to grab cap references. Specify those refs we @want, and the
1926 * minimal set we @need. Also include the larger offset we are writing
1927 * to (when applicable), and check against max_size here as well.
1928 * Note that caller is responsible for ensuring max_size increases are
1929 * requested from the MDS.
1930 */
1931static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
1932 int *got, loff_t endoff, int *check_max, int *err)
1933{
1934 struct inode *inode = &ci->vfs_inode;
1935 int ret = 0;
1936 int have, implemented;
1937 int file_wanted;
1938
1939 dout("get_cap_refs %p need %s want %s\n", inode,
1940 ceph_cap_string(need), ceph_cap_string(want));
1941 spin_lock(&inode->i_lock);
1942
1943 /* make sure file is actually open */
1944 file_wanted = __ceph_caps_file_wanted(ci);
1945 if ((file_wanted & need) == 0) {
1946 dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
1947 ceph_cap_string(need), ceph_cap_string(file_wanted));
1948 *err = -EBADF;
1949 ret = 1;
1950 goto out;
1951 }
1952
1953 if (need & CEPH_CAP_FILE_WR) {
1954 if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
1955 dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
1956 inode, endoff, ci->i_max_size);
1957 if (endoff > ci->i_wanted_max_size) {
1958 *check_max = 1;
1959 ret = 1;
1960 }
1961 goto out;
1962 }
1963 /*
1964 * If a sync write is in progress, we must wait, so that we
1965 * can get a final snapshot value for size+mtime.
1966 */
1967 if (__ceph_have_pending_cap_snap(ci)) {
1968 dout("get_cap_refs %p cap_snap_pending\n", inode);
1969 goto out;
1970 }
1971 }
1972 have = __ceph_caps_issued(ci, &implemented);
1973
1974 /*
1975 * disallow writes while a truncate is pending
1976 */
1977 if (ci->i_truncate_pending)
1978 have &= ~CEPH_CAP_FILE_WR;
1979
1980 if ((have & need) == need) {
1981 /*
1982 * Look at (implemented & ~have & not) so that we keep waiting
1983 * on transition from wanted -> needed caps. This is needed
1984 * for WRBUFFER|WR -> WR to avoid a new WR sync write from
1985 * going before a prior buffered writeback happens.
1986 */
1987 int not = want & ~(have & need);
1988 int revoking = implemented & ~have;
1989 dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
1990 inode, ceph_cap_string(have), ceph_cap_string(not),
1991 ceph_cap_string(revoking));
1992 if ((revoking & not) == 0) {
1993 *got = need | (have & want);
1994 __take_cap_refs(ci, *got);
1995 ret = 1;
1996 }
1997 } else {
1998 dout("get_cap_refs %p have %s needed %s\n", inode,
1999 ceph_cap_string(have), ceph_cap_string(need));
2000 }
2001out:
2002 spin_unlock(&inode->i_lock);
2003 dout("get_cap_refs %p ret %d got %s\n", inode,
2004 ret, ceph_cap_string(*got));
2005 return ret;
2006}
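/*
 * Illustrative sketch: the grant test in try_get_cap_refs() above.
 * Even when all "need" bits are held, a wanted bit that is still being
 * revoked ("implemented but not issued") blocks the grant, so a new WR
 * sync write cannot jump ahead of a pending buffered-write flush. Bit
 * values are made up.
 */
#include <stdio.h>

#define CAP_WR     0x1	/* illustrative values only */
#define CAP_BUFFER 0x2

int main(void)
{
	int need = CAP_WR, want = CAP_WR | CAP_BUFFER;
	int have = CAP_WR;			 /* issued */
	int implemented = CAP_WR | CAP_BUFFER;	 /* BUFFER still draining */

	int not = want & ~(have & need);
	int revoking = implemented & ~have;

	if ((have & need) == need && (revoking & not) == 0)
		printf("got %#x\n", need | (have & want));
	else
		printf("must wait: revoking %#x overlaps wanted %#x\n",
		       revoking, not);
	return 0;
}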
2007
2008/*
2009 * Check the offset we are writing up to against our current
2010 * max_size. If necessary, tell the MDS we want to write to
2011 * a larger offset.
2012 */
2013static void check_max_size(struct inode *inode, loff_t endoff)
2014{
2015 struct ceph_inode_info *ci = ceph_inode(inode);
2016 int check = 0;
2017
2018 /* do we need to explicitly request a larger max_size? */
2019 spin_lock(&inode->i_lock);
2020 if ((endoff >= ci->i_max_size ||
2021 endoff > (inode->i_size << 1)) &&
2022 endoff > ci->i_wanted_max_size) {
2023 dout("write %p at large endoff %llu, req max_size\n",
2024 inode, endoff);
2025 ci->i_wanted_max_size = endoff;
2026 check = 1;
2027 }
2028 spin_unlock(&inode->i_lock);
2029 if (check)
2030 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2031}
2032
2033/*
2034 * Wait for caps, and take cap references. If we can't get a WR cap
2035 * due to a small max_size, make sure we check_max_size (and possibly
2036 * ask the mds) so we don't get hung up indefinitely.
2037 */
2038int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
2039 loff_t endoff)
2040{
2041 int check_max, ret, err;
2042
2043retry:
2044 if (endoff > 0)
2045 check_max_size(&ci->vfs_inode, endoff);
2046 check_max = 0;
2047 err = 0;
2048 ret = wait_event_interruptible(ci->i_cap_wq,
2049 try_get_cap_refs(ci, need, want,
2050 got, endoff,
2051 &check_max, &err));
2052 if (err)
2053 ret = err;
2054 if (check_max)
2055 goto retry;
2056 return ret;
2057}
2058
2059/*
2060 * Take cap refs. Caller must already know we hold at least one ref
2061 * on the caps in question or we don't know this is safe.
2062 */
2063void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
2064{
2065 spin_lock(&ci->vfs_inode.i_lock);
2066 __take_cap_refs(ci, caps);
2067 spin_unlock(&ci->vfs_inode.i_lock);
2068}
2069
2070/*
2071 * Release cap refs.
2072 *
2073 * If we released the last ref on any given cap, call ceph_check_caps
2074 * to release (or schedule a release).
2075 *
2076 * If we are releasing a WR cap (from a sync write), finalize any affected
2077 * cap_snap, and wake up any waiters.
2078 */
2079void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
2080{
2081 struct inode *inode = &ci->vfs_inode;
2082 int last = 0, put = 0, flushsnaps = 0, wake = 0;
2083 struct ceph_cap_snap *capsnap;
2084
2085 spin_lock(&inode->i_lock);
2086 if (had & CEPH_CAP_PIN)
2087 --ci->i_pin_ref;
2088 if (had & CEPH_CAP_FILE_RD)
2089 if (--ci->i_rd_ref == 0)
2090 last++;
2091 if (had & CEPH_CAP_FILE_CACHE)
2092 if (--ci->i_rdcache_ref == 0)
2093 last++;
2094 if (had & CEPH_CAP_FILE_BUFFER) {
2095 if (--ci->i_wrbuffer_ref == 0) {
2096 last++;
2097 put++;
2098 }
2099 dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
2100 inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
2101 }
2102 if (had & CEPH_CAP_FILE_WR)
2103 if (--ci->i_wr_ref == 0) {
2104 last++;
2105 if (!list_empty(&ci->i_cap_snaps)) {
2106 capsnap = list_first_entry(&ci->i_cap_snaps,
2107 struct ceph_cap_snap,
2108 ci_item);
2109 if (capsnap->writing) {
2110 capsnap->writing = 0;
2111 flushsnaps =
2112 __ceph_finish_cap_snap(ci,
2113 capsnap);
2114 wake = 1;
2115 }
2116 }
2117 }
2118 spin_unlock(&inode->i_lock);
2119
2120 dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
2121 last ? "last" : "");
2122
2123 if (last && !flushsnaps)
2124 ceph_check_caps(ci, 0, NULL);
2125 else if (flushsnaps)
2126 ceph_flush_snaps(ci);
2127 if (wake)
2128 wake_up(&ci->i_cap_wq);
2129 if (put)
2130 iput(inode);
2131}
2132
2133/*
2134 * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
2135 * context. Adjust per-snap dirty page accounting as appropriate.
2136 * Once all dirty data for a cap_snap is flushed, flush snapped file
2137 * metadata back to the MDS. If we dropped the last ref, call
2138 * ceph_check_caps.
2139 */
2140void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
2141 struct ceph_snap_context *snapc)
2142{
2143 struct inode *inode = &ci->vfs_inode;
2144 int last = 0;
2145 int last_snap = 0;
2146 int found = 0;
2147 struct ceph_cap_snap *capsnap = NULL;
2148
2149 spin_lock(&inode->i_lock);
2150 ci->i_wrbuffer_ref -= nr;
2151 last = !ci->i_wrbuffer_ref;
2152
2153 if (ci->i_head_snapc == snapc) {
2154 ci->i_wrbuffer_ref_head -= nr;
2155 if (!ci->i_wrbuffer_ref_head) {
2156 ceph_put_snap_context(ci->i_head_snapc);
2157 ci->i_head_snapc = NULL;
2158 }
2159 dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
2160 inode,
2161 ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,
2162 ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
2163 last ? " LAST" : "");
2164 } else {
2165 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2166 if (capsnap->context == snapc) {
2167 found = 1;
2168 capsnap->dirty_pages -= nr;
2169 last_snap = !capsnap->dirty_pages;
2170 break;
2171 }
2172 }
2173 BUG_ON(!found);
2174 dout("put_wrbuffer_cap_refs on %p cap_snap %p "
2175 " snap %lld %d/%d -> %d/%d %s%s\n",
2176 inode, capsnap, capsnap->context->seq,
2177 ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
2178 ci->i_wrbuffer_ref, capsnap->dirty_pages,
2179 last ? " (wrbuffer last)" : "",
2180 last_snap ? " (capsnap last)" : "");
2181 }
2182
2183 spin_unlock(&inode->i_lock);
2184
2185 if (last) {
2186 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
2187 iput(inode);
2188 } else if (last_snap) {
2189 ceph_flush_snaps(ci);
2190 wake_up(&ci->i_cap_wq);
2191 }
2192}
2193
2194/*
2195 * Handle a cap GRANT message from the MDS. (Note that a GRANT may
2196 * actually be a revocation if it specifies a smaller cap set.)
2197 *
2198 * caller holds s_mutex.
2199 * return value:
2200 * 0 - ok
2201 * 1 - check_caps on auth cap only (writeback)
2202 * 2 - check_caps (ack revoke)
2203 */
2204static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
2205 struct ceph_mds_session *session,
2206 struct ceph_cap *cap,
2207 struct ceph_buffer *xattr_buf)
2208 __releases(inode->i_lock)
2209
2210{
2211 struct ceph_inode_info *ci = ceph_inode(inode);
2212 int mds = session->s_mds;
2213 int seq = le32_to_cpu(grant->seq);
2214 int newcaps = le32_to_cpu(grant->caps);
2215 int issued, implemented, used, wanted, dirty;
2216 u64 size = le64_to_cpu(grant->size);
2217 u64 max_size = le64_to_cpu(grant->max_size);
2218 struct timespec mtime, atime, ctime;
2219 int reply = 0;
2220 int wake = 0;
2221 int writeback = 0;
2222 int revoked_rdcache = 0;
2223 int queue_invalidate = 0;
2224
2225 dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
2226 inode, cap, mds, seq, ceph_cap_string(newcaps));
2227 dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
2228 inode->i_size);
2229
2230 /*
2231 * If CACHE is being revoked, and we have no dirty buffers,
2232 * try to invalidate (once). (If there are dirty buffers, we
2233 * will invalidate _after_ writeback.)
2234 */
2235 if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
2236 !ci->i_wrbuffer_ref) {
2237 if (try_nonblocking_invalidate(inode) == 0) {
2238 revoked_rdcache = 1;
2239 } else {
2240 /* there were locked pages.. invalidate later
2241 in a separate thread. */
2242 if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
2243 queue_invalidate = 1;
2244 ci->i_rdcache_revoking = ci->i_rdcache_gen;
2245 }
2246 }
2247 }
2248
2249 /* side effects now are allowed */
2250
2251 issued = __ceph_caps_issued(ci, &implemented);
2252 issued |= implemented | __ceph_caps_dirty(ci);
2253
2254 cap->cap_gen = session->s_cap_gen;
2255
2256 __check_cap_issue(ci, cap, newcaps);
2257
2258 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
2259 inode->i_mode = le32_to_cpu(grant->mode);
2260 inode->i_uid = le32_to_cpu(grant->uid);
2261 inode->i_gid = le32_to_cpu(grant->gid);
2262 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
2263 inode->i_uid, inode->i_gid);
2264 }
2265
2266 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
2267 inode->i_nlink = le32_to_cpu(grant->nlink);
2268
2269 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
2270 int len = le32_to_cpu(grant->xattr_len);
2271 u64 version = le64_to_cpu(grant->xattr_version);
2272
2273 if (version > ci->i_xattrs.version) {
2274 dout(" got new xattrs v%llu on %p len %d\n",
2275 version, inode, len);
2276 if (ci->i_xattrs.blob)
2277 ceph_buffer_put(ci->i_xattrs.blob);
2278 ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
2279 ci->i_xattrs.version = version;
2280 }
2281 }
2282
2283 /* size/ctime/mtime/atime? */
2284 ceph_fill_file_size(inode, issued,
2285 le32_to_cpu(grant->truncate_seq),
2286 le64_to_cpu(grant->truncate_size), size);
2287 ceph_decode_timespec(&mtime, &grant->mtime);
2288 ceph_decode_timespec(&atime, &grant->atime);
2289 ceph_decode_timespec(&ctime, &grant->ctime);
2290 ceph_fill_file_time(inode, issued,
2291 le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
2292 &atime);
2293
2294 /* max size increase? */
2295 if (max_size != ci->i_max_size) {
2296 dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
2297 ci->i_max_size = max_size;
2298 if (max_size >= ci->i_wanted_max_size) {
2299 ci->i_wanted_max_size = 0; /* reset */
2300 ci->i_requested_max_size = 0;
2301 }
2302 wake = 1;
2303 }
2304
2305 /* check cap bits */
2306 wanted = __ceph_caps_wanted(ci);
2307 used = __ceph_caps_used(ci);
2308 dirty = __ceph_caps_dirty(ci);
2309 dout(" my wanted = %s, used = %s, dirty %s\n",
2310 ceph_cap_string(wanted),
2311 ceph_cap_string(used),
2312 ceph_cap_string(dirty));
2313 if (wanted != le32_to_cpu(grant->wanted)) {
2314 dout("mds wanted %s -> %s\n",
2315 ceph_cap_string(le32_to_cpu(grant->wanted)),
2316 ceph_cap_string(wanted));
2317 grant->wanted = cpu_to_le32(wanted);
2318 }
2319
2320 cap->seq = seq;
2321
2322 /* file layout may have changed */
2323 ci->i_layout = grant->layout;
2324
2325 /* revocation, grant, or no-op? */
2326 if (cap->issued & ~newcaps) {
2327 dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
2328 ceph_cap_string(newcaps));
2329 if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
2330 writeback = 1; /* will delay ack */
2331 else if (dirty & ~newcaps)
2332 reply = 1; /* initiate writeback in check_caps */
2333 else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
2334 revoked_rdcache)
2335 reply = 2; /* send revoke ack in check_caps */
2336 cap->issued = newcaps;
2337 } else if (cap->issued == newcaps) {
2338 dout("caps unchanged: %s -> %s\n",
2339 ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
2340 } else {
2341 dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
2342 ceph_cap_string(newcaps));
2343 cap->issued = newcaps;
2344 cap->implemented |= newcaps; /* add bits only, to
2345 * avoid stepping on a
2346 * pending revocation */
2347 wake = 1;
2348 }
2349
2350 spin_unlock(&inode->i_lock);
2351 if (writeback)
2352 /*
2353 * queue inode for writeback: we can't actually call
2354 * filemap_write_and_wait, etc. from message handler
2355 * context.
2356 */
2357 ceph_queue_writeback(inode);
2358 if (queue_invalidate)
2359 ceph_queue_invalidate(inode);
2360 if (wake)
2361 wake_up(&ci->i_cap_wq);
2362 return reply;
2363}
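/*
 * Illustrative sketch: how handle_cap_grant() above classifies a
 * message once the new cap set is compared with what we hold. The
 * return value follows the function's convention (0 ok, 1 check_caps
 * on the auth cap to start writeback, 2 send a revoke ack), with the
 * "queue writeback, delay the ack" case reported via an out-flag.
 * Bit values are made up.
 */
#include <stdio.h>

#define CAP_CACHE  0x1	/* illustrative values only */
#define CAP_BUFFER 0x2

static int classify(int issued, int newcaps, int used, int dirty,
		    int revoked_rdcache, int *writeback)
{
	*writeback = 0;
	if ((issued & ~newcaps) == 0)
		return 0;			/* grant or no-op */
	if ((used & ~newcaps) & CAP_BUFFER)
		*writeback = 1;			/* flush, delay the ack */
	else if (dirty & ~newcaps)
		return 1;			/* check_caps: writeback */
	else if (((used & ~newcaps) & CAP_CACHE) == 0 || revoked_rdcache)
		return 2;			/* check_caps: ack revoke */
	return 0;
}

int main(void)
{
	int wb;
	int r = classify(CAP_BUFFER, 0, CAP_BUFFER, 0, 0, &wb);
	printf("revoke BUFFER in use:     reply %d writeback %d\n", r, wb);
	r = classify(CAP_CACHE, 0, 0, 0, 1, &wb);
	printf("revoke CACHE, pages gone: reply %d writeback %d\n", r, wb);
	return 0;
}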
2364
2365/*
2366 * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
2367 * MDS has been safely committed.
2368 */
2369static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
2370 struct ceph_mds_caps *m,
2371 struct ceph_mds_session *session,
2372 struct ceph_cap *cap)
2373 __releases(inode->i_lock)
2374{
2375 struct ceph_inode_info *ci = ceph_inode(inode);
2376 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
2377 unsigned seq = le32_to_cpu(m->seq);
2378 int dirty = le32_to_cpu(m->dirty);
2379 int cleaned = 0;
2380 int drop = 0;
2381 int i;
2382
2383 for (i = 0; i < CEPH_CAP_BITS; i++)
2384 if ((dirty & (1 << i)) &&
2385 flush_tid == ci->i_cap_flush_tid[i])
2386 cleaned |= 1 << i;
2387
2388 dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
2389 " flushing %s -> %s\n",
2390 inode, session->s_mds, seq, ceph_cap_string(dirty),
2391 ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
2392 ceph_cap_string(ci->i_flushing_caps & ~cleaned));
2393
2394 if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
2395 goto out;
2396
2397 ci->i_flushing_caps &= ~cleaned;
2398
2399 spin_lock(&mdsc->cap_dirty_lock);
2400 if (ci->i_flushing_caps == 0) {
2401 list_del_init(&ci->i_flushing_item);
2402 if (!list_empty(&session->s_cap_flushing))
2403 dout(" mds%d still flushing cap on %p\n",
2404 session->s_mds,
2405 &list_entry(session->s_cap_flushing.next,
2406 struct ceph_inode_info,
2407 i_flushing_item)->vfs_inode);
2408 mdsc->num_cap_flushing--;
2409 wake_up(&mdsc->cap_flushing_wq);
2410 dout(" inode %p now !flushing\n", inode);
2411
2412 if (ci->i_dirty_caps == 0) {
2413 dout(" inode %p now clean\n", inode);
2414 BUG_ON(!list_empty(&ci->i_dirty_item));
2415 drop = 1;
2416 } else {
2417 BUG_ON(list_empty(&ci->i_dirty_item));
2418 }
2419 }
2420 spin_unlock(&mdsc->cap_dirty_lock);
2421 wake_up(&ci->i_cap_wq);
2422
2423out:
2424 spin_unlock(&inode->i_lock);
2425 if (drop)
2426 iput(inode);
2427}
2428
2429/*
2430 * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
2431 * throw away our cap_snap.
2432 *
2433 * Caller holds s_mutex.
2434 */
2435static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
2436 struct ceph_mds_caps *m,
2437 struct ceph_mds_session *session)
2438{
2439 struct ceph_inode_info *ci = ceph_inode(inode);
2440 u64 follows = le64_to_cpu(m->snap_follows);
2441 struct ceph_cap_snap *capsnap;
2442 int drop = 0;
2443
2444 dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
2445 inode, ci, session->s_mds, follows);
2446
2447 spin_lock(&inode->i_lock);
2448 list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
2449 if (capsnap->follows == follows) {
2450 if (capsnap->flush_tid != flush_tid) {
2451 dout(" cap_snap %p follows %lld tid %lld !="
2452 " %lld\n", capsnap, follows,
2453 flush_tid, capsnap->flush_tid);
2454 break;
2455 }
2456 WARN_ON(capsnap->dirty_pages || capsnap->writing);
2457 dout(" removing cap_snap %p follows %lld\n",
2458 capsnap, follows);
2459 ceph_put_snap_context(capsnap->context);
2460 list_del(&capsnap->ci_item);
2461 list_del(&capsnap->flushing_item);
2462 ceph_put_cap_snap(capsnap);
2463 drop = 1;
2464 break;
2465 } else {
2466 dout(" skipping cap_snap %p follows %lld\n",
2467 capsnap, capsnap->follows);
2468 }
2469 }
2470 spin_unlock(&inode->i_lock);
2471 if (drop)
2472 iput(inode);
2473}
2474
2475/*
2476 * Handle TRUNC from MDS, indicating file truncation.
2477 *
2478 * caller holds s_mutex.
2479 */
2480static void handle_cap_trunc(struct inode *inode,
2481 struct ceph_mds_caps *trunc,
2482 struct ceph_mds_session *session)
2483 __releases(inode->i_lock)
2484{
2485 struct ceph_inode_info *ci = ceph_inode(inode);
2486 int mds = session->s_mds;
2487 int seq = le32_to_cpu(trunc->seq);
2488 u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
2489 u64 truncate_size = le64_to_cpu(trunc->truncate_size);
2490 u64 size = le64_to_cpu(trunc->size);
2491 int implemented = 0;
2492 int dirty = __ceph_caps_dirty(ci);
2493 int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
2494 int queue_trunc = 0;
2495
2496 issued |= implemented | dirty;
2497
2498 dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
2499 inode, mds, seq, truncate_size, truncate_seq);
2500 queue_trunc = ceph_fill_file_size(inode, issued,
2501 truncate_seq, truncate_size, size);
2502 spin_unlock(&inode->i_lock);
2503
2504 if (queue_trunc)
2505 ceph_queue_vmtruncate(inode);
2506}
2507
2508/*
2509 * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
2510 * different one. If we are the most recent migration we've seen (as
2511 * indicated by mseq), make note of the migrating cap bits for the
2512 * duration (until we see the corresponding IMPORT).
2513 *
2514 * caller holds s_mutex
2515 */
2516static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
2517 struct ceph_mds_session *session)
2518{
2519 struct ceph_inode_info *ci = ceph_inode(inode);
2520 int mds = session->s_mds;
2521 unsigned mseq = le32_to_cpu(ex->migrate_seq);
2522 struct ceph_cap *cap = NULL, *t;
2523 struct rb_node *p;
2524 int remember = 1;
2525
2526 dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
2527 inode, ci, mds, mseq);
2528
2529 spin_lock(&inode->i_lock);
2530
2531 /* make sure we haven't seen a higher mseq */
2532 for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
2533 t = rb_entry(p, struct ceph_cap, ci_node);
2534 if (ceph_seq_cmp(t->mseq, mseq) > 0) {
2535 dout(" higher mseq on cap from mds%d\n",
2536 t->session->s_mds);
2537 remember = 0;
2538 }
2539 if (t->session->s_mds == mds)
2540 cap = t;
2541 }
2542
2543 if (cap) {
2544 if (remember) {
2545 /* make note */
2546 ci->i_cap_exporting_mds = mds;
2547 ci->i_cap_exporting_mseq = mseq;
2548 ci->i_cap_exporting_issued = cap->issued;
2549 }
2550 __ceph_remove_cap(cap);
2551 } else {
2552 WARN_ON(!cap);
2553 }
2554
2555 spin_unlock(&inode->i_lock);
2556}
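/*
 * Illustrative sketch: wraparound-safe sequence comparison of the kind
 * ceph_seq_cmp() is used for in handle_cap_export() above. Assuming
 * the usual signed-difference idiom, a sequence that has wrapped past
 * 2^32 still compares as "newer" provided the two values are within
 * 2^31 of each other.
 */
#include <stdio.h>
#include <stdint.h>

static int seq_cmp(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b);	/* <0: a older, >0: a newer */
}

int main(void)
{
	printf("%d\n", seq_cmp(5, 3) > 0);		/* 1 */
	printf("%d\n", seq_cmp(2, 0xfffffffeu) > 0);	/* 1: wrapped */
	printf("%d\n", seq_cmp(0xfffffffeu, 2) > 0);	/* 0 */
	return 0;
}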
2557
2558/*
2559 * Handle cap IMPORT. If there are temp bits from an older EXPORT,
2560 * clean them up.
2561 *
2562 * caller holds s_mutex.
2563 */
2564static void handle_cap_import(struct ceph_mds_client *mdsc,
2565 struct inode *inode, struct ceph_mds_caps *im,
2566 struct ceph_mds_session *session,
2567 void *snaptrace, int snaptrace_len)
2568{
2569 struct ceph_inode_info *ci = ceph_inode(inode);
2570 int mds = session->s_mds;
2571 unsigned issued = le32_to_cpu(im->caps);
2572 unsigned wanted = le32_to_cpu(im->wanted);
2573 unsigned seq = le32_to_cpu(im->seq);
2574 unsigned mseq = le32_to_cpu(im->migrate_seq);
2575 u64 realmino = le64_to_cpu(im->realm);
2576 u64 cap_id = le64_to_cpu(im->cap_id);
2577
2578 if (ci->i_cap_exporting_mds >= 0 &&
2579 ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
2580 dout("handle_cap_import inode %p ci %p mds%d mseq %d"
2581 " - cleared exporting from mds%d\n",
2582 inode, ci, mds, mseq,
2583 ci->i_cap_exporting_mds);
2584 ci->i_cap_exporting_issued = 0;
2585 ci->i_cap_exporting_mseq = 0;
2586 ci->i_cap_exporting_mds = -1;
2587 } else {
2588 dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
2589 inode, ci, mds, mseq);
2590 }
2591
2592 down_write(&mdsc->snap_rwsem);
2593 ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
2594 false);
2595 downgrade_write(&mdsc->snap_rwsem);
2596 ceph_add_cap(inode, session, cap_id, -1,
2597 issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
2598 NULL /* no caps context */);
2599 try_flush_caps(inode, session, NULL);
2600 up_read(&mdsc->snap_rwsem);
2601}
2602
2603/*
2604 * Handle a caps message from the MDS.
2605 *
2606 * Identify the appropriate session, inode, and call the right handler
2607 * based on the cap op.
2608 */
2609void ceph_handle_caps(struct ceph_mds_session *session,
2610 struct ceph_msg *msg)
2611{
2612 struct ceph_mds_client *mdsc = session->s_mdsc;
2613 struct super_block *sb = mdsc->client->sb;
2614 struct inode *inode;
2615 struct ceph_cap *cap;
2616 struct ceph_mds_caps *h;
2617 int mds = session->s_mds;
2618 int op;
2619 u32 seq;
2620 struct ceph_vino vino;
2621 u64 cap_id;
2622 u64 size, max_size;
2623 u64 tid;
2624 int check_caps = 0;
2625 void *snaptrace;
2626 int r;
2627
2628 dout("handle_caps from mds%d\n", mds);
2629
2630 /* decode */
2631 tid = le64_to_cpu(msg->hdr.tid);
2632 if (msg->front.iov_len < sizeof(*h))
2633 goto bad;
2634 h = msg->front.iov_base;
2635 snaptrace = h + 1;
2636 op = le32_to_cpu(h->op);
2637 vino.ino = le64_to_cpu(h->ino);
2638 vino.snap = CEPH_NOSNAP;
2639 cap_id = le64_to_cpu(h->cap_id);
2640 seq = le32_to_cpu(h->seq);
2641 size = le64_to_cpu(h->size);
2642 max_size = le64_to_cpu(h->max_size);
2643
2644 mutex_lock(&session->s_mutex);
2645 session->s_seq++;
2646 dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
2647 (unsigned)seq);
2648
2649 /* lookup ino */
2650 inode = ceph_find_inode(sb, vino);
2651 dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
2652 vino.snap, inode);
2653 if (!inode) {
2654 dout(" i don't have ino %llx\n", vino.ino);
2655 goto done;
2656 }
2657
2658 /* these will work even if we don't have a cap yet */
2659 switch (op) {
2660 case CEPH_CAP_OP_FLUSHSNAP_ACK:
2661 handle_cap_flushsnap_ack(inode, tid, h, session);
2662 goto done;
2663
2664 case CEPH_CAP_OP_EXPORT:
2665 handle_cap_export(inode, h, session);
2666 goto done;
2667
2668 case CEPH_CAP_OP_IMPORT:
2669 handle_cap_import(mdsc, inode, h, session,
2670 snaptrace, le32_to_cpu(h->snap_trace_len));
2671 check_caps = 1; /* we may have sent a RELEASE to the old auth */
2672 goto done;
2673 }
2674
2675 /* the rest require a cap */
2676 spin_lock(&inode->i_lock);
2677 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2678 if (!cap) {
2679 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
2680 inode, ceph_ino(inode), ceph_snap(inode), mds);
2681 spin_unlock(&inode->i_lock);
2682 goto done;
2683 }
2684
2685 /* note that each of these drops i_lock for us */
2686 switch (op) {
2687 case CEPH_CAP_OP_REVOKE:
2688 case CEPH_CAP_OP_GRANT:
2689 r = handle_cap_grant(inode, h, session, cap, msg->middle);
2690 if (r == 1)
2691 ceph_check_caps(ceph_inode(inode),
2692 CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
2693 session);
2694 else if (r == 2)
2695 ceph_check_caps(ceph_inode(inode),
2696 CHECK_CAPS_NODELAY,
2697 session);
2698 break;
2699
2700 case CEPH_CAP_OP_FLUSH_ACK:
2701 handle_cap_flush_ack(inode, tid, h, session, cap);
2702 break;
2703
2704 case CEPH_CAP_OP_TRUNC:
2705 handle_cap_trunc(inode, h, session);
2706 break;
2707
2708 default:
2709 spin_unlock(&inode->i_lock);
2710 pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
2711 ceph_cap_op_name(op));
2712 }
2713
2714done:
2715 mutex_unlock(&session->s_mutex);
2716
2717 if (check_caps)
2718 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
2719 if (inode)
2720 iput(inode);
2721 return;
2722
2723bad:
2724 pr_err("ceph_handle_caps: corrupt message\n");
2725 ceph_msg_dump(msg);
2726 return;
2727}
2728
2729/*
2730 * Delayed work handler to process the end of the delayed cap release LRU list.
2731 */
2732void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
2733{
2734 struct ceph_inode_info *ci;
2735 int flags = CHECK_CAPS_NODELAY;
2736
2737 dout("check_delayed_caps\n");
2738 while (1) {
2739 spin_lock(&mdsc->cap_delay_lock);
2740 if (list_empty(&mdsc->cap_delay_list))
2741 break;
2742 ci = list_first_entry(&mdsc->cap_delay_list,
2743 struct ceph_inode_info,
2744 i_cap_delay_list);
2745 if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
2746 time_before(jiffies, ci->i_hold_caps_max))
2747 break;
2748 list_del_init(&ci->i_cap_delay_list);
2749 spin_unlock(&mdsc->cap_delay_lock);
2750 dout("check_delayed_caps on %p\n", &ci->vfs_inode);
2751 ceph_check_caps(ci, flags, NULL);
2752 }
2753 spin_unlock(&mdsc->cap_delay_lock);
2754}
2755
2756/*
2757 * Flush all dirty caps to the mds
2758 */
2759void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
2760{
2761 struct ceph_inode_info *ci, *nci = NULL;
2762 struct inode *inode, *ninode = NULL;
2763 struct list_head *p, *n;
2764
2765 dout("flush_dirty_caps\n");
2766 spin_lock(&mdsc->cap_dirty_lock);
2767 list_for_each_safe(p, n, &mdsc->cap_dirty) {
2768 if (nci) {
2769 ci = nci;
2770 inode = ninode;
2771 ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
2772 dout("flush_dirty_caps inode %p (was next inode)\n",
2773 inode);
2774 } else {
2775 ci = list_entry(p, struct ceph_inode_info,
2776 i_dirty_item);
2777 inode = igrab(&ci->vfs_inode);
2778 BUG_ON(!inode);
2779 dout("flush_dirty_caps inode %p\n", inode);
2780 }
2781 if (n != &mdsc->cap_dirty) {
2782 nci = list_entry(n, struct ceph_inode_info,
2783 i_dirty_item);
2784 ninode = igrab(&nci->vfs_inode);
2785 BUG_ON(!ninode);
2786 nci->i_ceph_flags |= CEPH_I_NOFLUSH;
2787 dout("flush_dirty_caps next inode %p, noflush\n",
2788 ninode);
2789 } else {
2790 nci = NULL;
2791 ninode = NULL;
2792 }
2793 spin_unlock(&mdsc->cap_dirty_lock);
2794 if (inode) {
2795 ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
2796 NULL);
2797 iput(inode);
2798 }
2799 spin_lock(&mdsc->cap_dirty_lock);
2800 }
2801 spin_unlock(&mdsc->cap_dirty_lock);
2802}
2803
2804/*
2805 * Drop open file reference. If we were the last open file,
2806 * we may need to release capabilities to the MDS (or schedule
2807 * their delayed release).
2808 */
2809void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
2810{
2811 struct inode *inode = &ci->vfs_inode;
2812 int last = 0;
2813
2814 spin_lock(&inode->i_lock);
2815 dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
2816 ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
2817 BUG_ON(ci->i_nr_by_mode[fmode] == 0);
2818 if (--ci->i_nr_by_mode[fmode] == 0)
2819 last++;
2820 spin_unlock(&inode->i_lock);
2821
2822 if (last && ci->i_vino.snap == CEPH_NOSNAP)
2823 ceph_check_caps(ci, 0, NULL);
2824}
2825
2826/*
2827 * Helpers for embedding cap and dentry lease releases into mds
2828 * requests.
2829 *
2830 * @force is used by dentry_release (below) to force inclusion of a
2831 * record for the directory inode, even when there aren't any caps to
2832 * drop.
2833 */
2834int ceph_encode_inode_release(void **p, struct inode *inode,
2835 int mds, int drop, int unless, int force)
2836{
2837 struct ceph_inode_info *ci = ceph_inode(inode);
2838 struct ceph_cap *cap;
2839 struct ceph_mds_request_release *rel = *p;
2840 int ret = 0;
2841
2842 dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
2843 mds, ceph_cap_string(drop), ceph_cap_string(unless));
2844
2845 spin_lock(&inode->i_lock);
2846 cap = __get_cap_for_mds(ci, mds);
2847 if (cap && __cap_is_valid(cap)) {
2848 if (force ||
2849 ((cap->issued & drop) &&
2850 (cap->issued & unless) == 0)) {
2851 if ((cap->issued & drop) &&
2852 (cap->issued & unless) == 0) {
2853 dout("encode_inode_release %p cap %p %s -> "
2854 "%s\n", inode, cap,
2855 ceph_cap_string(cap->issued),
2856 ceph_cap_string(cap->issued & ~drop));
2857 cap->issued &= ~drop;
2858 cap->implemented &= ~drop;
2859 if (ci->i_ceph_flags & CEPH_I_NODELAY) {
2860 int wanted = __ceph_caps_wanted(ci);
2861 dout(" wanted %s -> %s (act %s)\n",
2862 ceph_cap_string(cap->mds_wanted),
2863 ceph_cap_string(cap->mds_wanted &
2864 ~wanted),
2865 ceph_cap_string(wanted));
2866 cap->mds_wanted &= wanted;
2867 }
2868 } else {
2869 dout("encode_inode_release %p cap %p %s"
2870 " (force)\n", inode, cap,
2871 ceph_cap_string(cap->issued));
2872 }
2873
2874 rel->ino = cpu_to_le64(ceph_ino(inode));
2875 rel->cap_id = cpu_to_le64(cap->cap_id);
2876 rel->seq = cpu_to_le32(cap->seq);
2877 rel->issue_seq = cpu_to_le32(cap->issue_seq);
2878 rel->mseq = cpu_to_le32(cap->mseq);
2879 rel->caps = cpu_to_le32(cap->issued);
2880 rel->wanted = cpu_to_le32(cap->mds_wanted);
2881 rel->dname_len = 0;
2882 rel->dname_seq = 0;
2883 *p += sizeof(*rel);
2884 ret = 1;
2885 } else {
2886 dout("encode_inode_release %p cap %p %s\n",
2887 inode, cap, ceph_cap_string(cap->issued));
2888 }
2889 }
2890 spin_unlock(&inode->i_lock);
2891 return ret;
2892}
2893
2894int ceph_encode_dentry_release(void **p, struct dentry *dentry,
2895 int mds, int drop, int unless)
2896{
2897 struct inode *dir = dentry->d_parent->d_inode;
2898 struct ceph_mds_request_release *rel = *p;
2899 struct ceph_dentry_info *di = ceph_dentry(dentry);
2900 int force = 0;
2901 int ret;
2902
2903 /*
2904 * force a record for the directory caps if we have a dentry lease.
2905 * this is racy (can't take i_lock and d_lock together), but it
2906 * doesn't have to be perfect; the mds will revoke anything we don't
2907 * release.
2908 */
2909 spin_lock(&dentry->d_lock);
2910 if (di->lease_session && di->lease_session->s_mds == mds)
2911 force = 1;
2912 spin_unlock(&dentry->d_lock);
2913
2914 ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
2915
2916 spin_lock(&dentry->d_lock);
2917 if (ret && di->lease_session && di->lease_session->s_mds == mds) {
2918 dout("encode_dentry_release %p mds%d seq %d\n",
2919 dentry, mds, (int)di->lease_seq);
2920 rel->dname_len = cpu_to_le32(dentry->d_name.len);
2921 memcpy(*p, dentry->d_name.name, dentry->d_name.len);
2922 *p += dentry->d_name.len;
2923 rel->dname_seq = cpu_to_le32(di->lease_seq);
2924 }
2925 spin_unlock(&dentry->d_lock);
2926 return ret;
2927}
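
How a request might embed one of these records, as a minimal sketch: the head/p layout below is assumed for illustration (the real call sites build the request buffer elsewhere), and example_encode_release is a hypothetical name.

    /* drop FILE_CACHE to this mds unless FILE_WR is still in use */
    static int example_encode_release(struct inode *inode, int mds,
                                      struct ceph_mds_request_head *head,
                                      void *p)
    {
            int num = 0;

            num += ceph_encode_inode_release(&p, inode, mds,
                                             CEPH_CAP_FILE_CACHE, /* drop */
                                             CEPH_CAP_FILE_WR,    /* unless */
                                             0);                  /* no force */
            head->num_releases = cpu_to_le16(num);
            return num;
    }

On success the record is written at *p and the pointer advances past it, which is why the helper takes void **.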
diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 000000000000..1818c2305610
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
1#ifndef _FS_CEPH_DEBUG_H
2#define _FS_CEPH_DEBUG_H
3
4#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
5
6#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
7
8/*
9 * wrap pr_debug to include a filename:lineno prefix on each line.
10 * this incurs some overhead (kernel size and execution time) due to
11 * the extra function call at each call site.
12 */
13
14# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
15extern const char *ceph_file_part(const char *s, int len);
16# define dout(fmt, ...) \
17 pr_debug(" %12.12s:%-4d : " fmt, \
18 ceph_file_part(__FILE__, sizeof(__FILE__)), \
19 __LINE__, ##__VA_ARGS__)
20# else
21/* faux printk call just to see any compiler warnings. */
22# define dout(fmt, ...) do { \
23 if (0) \
24 printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
25 } while (0)
26# endif
27
28#else
29
30/*
31 * or, just wrap pr_debug
32 */
33# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
34
35#endif
36
37#endif
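
Call sites use dout() identically no matter which variant is compiled in; a one-line sketch (inode and flags are stand-in locals):

    dout("open %p flags 0%o\n", inode, flags);

With CONFIG_CEPH_FS_PRETTYDEBUG and DEBUG set, the output gains a file:line prefix from ceph_file_part(); otherwise it is a plain pr_debug() line.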
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 000000000000..ab6cf35c4091
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
1/*
2 * Ceph 'frag' type
3 */
4#include "types.h"
5
6int ceph_frag_compare(__u32 a, __u32 b)
7{
8 unsigned va = ceph_frag_value(a);
9 unsigned vb = ceph_frag_value(b);
10 if (va < vb)
11 return -1;
12 if (va > vb)
13 return 1;
14 va = ceph_frag_bits(a);
15 vb = ceph_frag_bits(b);
16 if (va < vb)
17 return -1;
18 if (va > vb)
19 return 1;
20 return 0;
21}
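
Because the comparator follows the usual memcmp-style contract, it can drive the kernel's generic sort(); a minimal sketch, with frags and nr assumed to exist in some caller:

    #include <linux/sort.h>

    static int frag_cmp(const void *a, const void *b)
    {
            return ceph_frag_compare(*(const __u32 *)a, *(const __u32 *)b);
    }

    /* e.g., in a caller: put encoded frags into logical (traversal) order */
    sort(frags, nr, sizeof(__u32), frag_cmp, NULL);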
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 000000000000..793f50cb7c22
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
1#ifndef _FS_CEPH_FRAG_H
2#define _FS_CEPH_FRAG_H
3
4/*
5 * "Frags" are a way to describe a subset of a 32-bit number space,
6 * using a mask and a value to match against that mask. Any given frag
7 * (subset of the number space) can be partitioned into 2^n sub-frags.
8 *
9 * Frags are encoded into a 32-bit word:
10 * 8 upper bits = "bits"
11 * 24 lower bits = "value"
12 * (We could go to 5+27 bits, but who cares.)
13 *
14 * We use the _most_ significant bits of the 24 bit value. This makes
15 * values logically sort.
16 *
17 * Unfortunately, because the "bits" field is still in the high bits, we
18 * can't sort encoded frags numerically. However, it does allow you
19 * to feed encoded frags as values into frag_contains_value.
20 */
21static inline __u32 ceph_frag_make(__u32 b, __u32 v)
22{
23 return (b << 24) |
24 (v & (0xffffffu << (24-b)) & 0xffffffu);
25}
26static inline __u32 ceph_frag_bits(__u32 f)
27{
28 return f >> 24;
29}
30static inline __u32 ceph_frag_value(__u32 f)
31{
32 return f & 0xffffffu;
33}
34static inline __u32 ceph_frag_mask(__u32 f)
35{
36 return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
37}
38static inline __u32 ceph_frag_mask_shift(__u32 f)
39{
40 return 24 - ceph_frag_bits(f);
41}
42
43static inline int ceph_frag_contains_value(__u32 f, __u32 v)
44{
45 return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
46}
47static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
48{
49 /* is sub as specific as us, and contained by us? */
50 return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
51 (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
52}
53
54static inline __u32 ceph_frag_parent(__u32 f)
55{
56 return ceph_frag_make(ceph_frag_bits(f) - 1,
57 ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
58}
59static inline int ceph_frag_is_left_child(__u32 f)
60{
61 return ceph_frag_bits(f) > 0 &&
62 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
63}
64static inline int ceph_frag_is_right_child(__u32 f)
65{
66 return ceph_frag_bits(f) > 0 &&
67 (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) != 0;
68}
69static inline __u32 ceph_frag_sibling(__u32 f)
70{
71 return ceph_frag_make(ceph_frag_bits(f),
72 ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
73}
74static inline __u32 ceph_frag_left_child(__u32 f)
75{
76 return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
77}
78static inline __u32 ceph_frag_right_child(__u32 f)
79{
80 return ceph_frag_make(ceph_frag_bits(f)+1,
81 ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
82}
83static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
84{
85 int newbits = ceph_frag_bits(f) + by;
86 return ceph_frag_make(newbits,
87 ceph_frag_value(f) | (i << (24 - newbits)));
88}
89static inline int ceph_frag_is_leftmost(__u32 f)
90{
91 return ceph_frag_value(f) == 0;
92}
93static inline int ceph_frag_is_rightmost(__u32 f)
94{
95 return ceph_frag_value(f) == ceph_frag_mask(f);
96}
97static inline __u32 ceph_frag_next(__u32 f)
98{
99 return ceph_frag_make(ceph_frag_bits(f),
100 ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
101}
102
103/*
104 * comparator to sort frags logically, as when traversing the
105 * number space in ascending order...
106 */
107int ceph_frag_compare(__u32 a, __u32 b);
108
109#endif
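
A worked example of the encoding, using the helpers above; the hex values are just the arithmetic spelled out:

    __u32 root  = ceph_frag_make(0, 0);        /* 0x00000000: whole 24-bit space */
    __u32 left  = ceph_frag_left_child(root);  /* 0x01000000: bits=1, value=0x000000 */
    __u32 right = ceph_frag_right_child(root); /* 0x01800000: bits=1, value=0x800000 */

    ceph_frag_contains_value(right, 0xc00000); /* 1: 0xc00000 & 0x800000 == value */
    ceph_frag_parent(right);                   /* 0x00000000: back to the root frag */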
diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 000000000000..79d76bc4303f
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
1/*
2 * Some non-inline ceph helpers
3 */
4#include "types.h"
5
6/*
7 * return true if @layout appears to be valid
8 */
9int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
10{
11 __u32 su = le32_to_cpu(layout->fl_stripe_unit);
12 __u32 sc = le32_to_cpu(layout->fl_stripe_count);
13 __u32 os = le32_to_cpu(layout->fl_object_size);
14
15 /* stripe unit, object size must be non-zero, 64k increment */
16 if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
17 return 0;
18 if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
19 return 0;
20 /* object size must be a multiple of stripe unit */
21 if (os < su || os % su)
22 return 0;
23 /* stripe count must be non-zero */
24 if (!sc)
25 return 0;
26 return 1;
27}
28
29
30int ceph_flags_to_mode(int flags)
31{
32#ifdef O_DIRECTORY /* fixme */
33 if ((flags & O_DIRECTORY) == O_DIRECTORY)
34 return CEPH_FILE_MODE_PIN;
35#endif
36#ifdef O_LAZY
37 if (flags & O_LAZY)
38 return CEPH_FILE_MODE_LAZY;
39#endif
40 if ((flags & O_APPEND) == O_APPEND)
41 flags |= O_WRONLY;
42
43 flags &= O_ACCMODE;
44 if ((flags & O_RDWR) == O_RDWR)
45 return CEPH_FILE_MODE_RDWR;
46 if ((flags & O_WRONLY) == O_WRONLY)
47 return CEPH_FILE_MODE_WR;
48 return CEPH_FILE_MODE_RD;
49}
50
51int ceph_caps_for_mode(int mode)
52{
53 switch (mode) {
54 case CEPH_FILE_MODE_PIN:
55 return CEPH_CAP_PIN;
56 case CEPH_FILE_MODE_RD:
57 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
58 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
59 case CEPH_FILE_MODE_RDWR:
60 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
61 CEPH_CAP_FILE_EXCL |
62 CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
63 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
64 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
65 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
66 case CEPH_FILE_MODE_WR:
67 return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
68 CEPH_CAP_FILE_EXCL |
69 CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
70 CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
71 CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
72 }
73 return 0;
74}
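
Putting the two helpers together: an append-only open collapses to the write mode, which in turn expands to its cap wish list. A short sketch:

    int mode = ceph_flags_to_mode(O_WRONLY | O_APPEND); /* CEPH_FILE_MODE_WR */
    int caps = ceph_caps_for_mode(mode);
    /* caps == PIN | FILE_SHARED | FILE_EXCL | FILE_WR | FILE_BUFFER |
     *         AUTH_SHARED | AUTH_EXCL | XATTR_SHARED | XATTR_EXCL    */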
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 000000000000..0c2241ef3653
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,650 @@
1/*
2 * ceph_fs.h - Ceph constants and data types to share between kernel and
3 * user space.
4 *
5 * Most types in this file are defined as little-endian, and are
6 * primarily intended to describe data structures that pass over the
7 * wire or that are stored on disk.
8 *
9 * LGPL2
10 */
11
12#ifndef _FS_CEPH_CEPH_FS_H
13#define _FS_CEPH_CEPH_FS_H
14
15#include "msgr.h"
16#include "rados.h"
17
18/*
19 * Ceph release version
20 */
21#define CEPH_VERSION_MAJOR 0
22#define CEPH_VERSION_MINOR 19
23#define CEPH_VERSION_PATCH 0
24
25#define _CEPH_STRINGIFY(x) #x
26#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
27#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
28 "." CEPH_STRINGIFY(z)
29#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
30 CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
31
32/*
33 * subprotocol versions. when specific message types or high-level
34 * protocols change, bump the affected components. we rev
35 * internal cluster protocols separately from the public,
36 * client-facing protocol.
37 */
38#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
39#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
40#define CEPH_MON_PROTOCOL 5 /* cluster internal */
41#define CEPH_OSDC_PROTOCOL 24 /* server/client */
42#define CEPH_MDSC_PROTOCOL 32 /* server/client */
43#define CEPH_MONC_PROTOCOL 15 /* server/client */
44
45
46#define CEPH_INO_ROOT 1
47#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
48
49/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
50#define CEPH_MAX_MON 31
51
52
53/*
54 * feature bits
55 */
56#define CEPH_FEATURE_SUPPORTED 0
57#define CEPH_FEATURE_REQUIRED 0
58
59
60/*
61 * ceph_file_layout - describe data layout for a file/inode
62 */
63struct ceph_file_layout {
64 /* file -> object mapping */
65 __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
66 of page size. */
67 __le32 fl_stripe_count; /* over this many objects */
68 __le32 fl_object_size; /* until objects are this big, then move to
69 new objects */
70 __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
71
72 /* pg -> disk layout */
73 __le32 fl_object_stripe_unit; /* for per-object parity, if any */
74
75 /* object -> pg layout */
76 __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
77 __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
78} __attribute__ ((packed));
79
80#define CEPH_MIN_STRIPE_UNIT 65536
81
82int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
83
84
85/* crypto algorithms */
86#define CEPH_CRYPTO_NONE 0x0
87#define CEPH_CRYPTO_AES 0x1
88
89/* security/authentication protocols */
90#define CEPH_AUTH_UNKNOWN 0x0
91#define CEPH_AUTH_NONE 0x1
92#define CEPH_AUTH_CEPHX 0x2
93
94
95/*********************************************
96 * message layer
97 */
98
99/*
100 * message types
101 */
102
103/* misc */
104#define CEPH_MSG_SHUTDOWN 1
105#define CEPH_MSG_PING 2
106
107/* client <-> monitor */
108#define CEPH_MSG_MON_MAP 4
109#define CEPH_MSG_MON_GET_MAP 5
110#define CEPH_MSG_STATFS 13
111#define CEPH_MSG_STATFS_REPLY 14
112#define CEPH_MSG_MON_SUBSCRIBE 15
113#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
114#define CEPH_MSG_AUTH 17
115#define CEPH_MSG_AUTH_REPLY 18
116
117/* client <-> mds */
118#define CEPH_MSG_MDS_MAP 21
119
120#define CEPH_MSG_CLIENT_SESSION 22
121#define CEPH_MSG_CLIENT_RECONNECT 23
122
123#define CEPH_MSG_CLIENT_REQUEST 24
124#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
125#define CEPH_MSG_CLIENT_REPLY 26
126#define CEPH_MSG_CLIENT_CAPS 0x310
127#define CEPH_MSG_CLIENT_LEASE 0x311
128#define CEPH_MSG_CLIENT_SNAP 0x312
129#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
130
131/* osd */
132#define CEPH_MSG_OSD_MAP 41
133#define CEPH_MSG_OSD_OP 42
134#define CEPH_MSG_OSD_OPREPLY 43
135
136struct ceph_mon_request_header {
137 __le64 have_version;
138 __le16 session_mon;
139 __le64 session_mon_tid;
140} __attribute__ ((packed));
141
142struct ceph_mon_statfs {
143 struct ceph_mon_request_header monhdr;
144 struct ceph_fsid fsid;
145} __attribute__ ((packed));
146
147struct ceph_statfs {
148 __le64 kb, kb_used, kb_avail;
149 __le64 num_objects;
150} __attribute__ ((packed));
151
152struct ceph_mon_statfs_reply {
153 struct ceph_fsid fsid;
154 __le64 version;
155 struct ceph_statfs st;
156} __attribute__ ((packed));
157
158struct ceph_osd_getmap {
159 struct ceph_mon_request_header monhdr;
160 struct ceph_fsid fsid;
161 __le32 start;
162} __attribute__ ((packed));
163
164struct ceph_mds_getmap {
165 struct ceph_mon_request_header monhdr;
166 struct ceph_fsid fsid;
167} __attribute__ ((packed));
168
169struct ceph_client_mount {
170 struct ceph_mon_request_header monhdr;
171} __attribute__ ((packed));
172
173struct ceph_mon_subscribe_item {
174 __le64 have_version; __le64 have;
175 __u8 onetime;
176} __attribute__ ((packed));
177
178struct ceph_mon_subscribe_ack {
179 __le32 duration; /* seconds */
180 struct ceph_fsid fsid;
181} __attribute__ ((packed));
182
183/*
184 * mds states
185 * > 0 -> in
186 * <= 0 -> out
187 */
188#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
189#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
190 empty log. */
191#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
192#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
193#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
194#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
195#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
196
197#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
198#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
199 operations (import, rename, etc.) */
200#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
201#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
202#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
203#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
204#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
205
206extern const char *ceph_mds_state_name(int s);
207
208
209/*
210 * metadata lock types.
211 * - these are bitmasks.. we can compose them
212 * - they also define the lock ordering by the MDS
213 * - a few of these are internal to the mds
214 */
215#define CEPH_LOCK_DN 1
216#define CEPH_LOCK_ISNAP 2
217#define CEPH_LOCK_IVERSION 4 /* mds internal */
218#define CEPH_LOCK_IFILE 8 /* mds internal */
219#define CEPH_LOCK_IAUTH 32
220#define CEPH_LOCK_ILINK 64
221#define CEPH_LOCK_IDFT 128 /* dir frag tree */
222#define CEPH_LOCK_INEST 256 /* mds internal */
223#define CEPH_LOCK_IXATTR 512
224#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
225
226/* client_session ops */
227enum {
228 CEPH_SESSION_REQUEST_OPEN,
229 CEPH_SESSION_OPEN,
230 CEPH_SESSION_REQUEST_CLOSE,
231 CEPH_SESSION_CLOSE,
232 CEPH_SESSION_REQUEST_RENEWCAPS,
233 CEPH_SESSION_RENEWCAPS,
234 CEPH_SESSION_STALE,
235 CEPH_SESSION_RECALL_STATE,
236};
237
238extern const char *ceph_session_op_name(int op);
239
240struct ceph_mds_session_head {
241 __le32 op;
242 __le64 seq;
243 struct ceph_timespec stamp;
244 __le32 max_caps, max_leases;
245} __attribute__ ((packed));
246
247/* client_request */
248/*
249 * metadata ops.
250 * & 0x001000 -> write op
251 * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
252 * & 0x100000 -> use weird ino/path trace
253 */
254#define CEPH_MDS_OP_WRITE 0x001000
255enum {
256 CEPH_MDS_OP_LOOKUP = 0x00100,
257 CEPH_MDS_OP_GETATTR = 0x00101,
258 CEPH_MDS_OP_LOOKUPHASH = 0x00102,
259 CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
260
261 CEPH_MDS_OP_SETXATTR = 0x01105,
262 CEPH_MDS_OP_RMXATTR = 0x01106,
263 CEPH_MDS_OP_SETLAYOUT = 0x01107,
264 CEPH_MDS_OP_SETATTR = 0x01108,
265
266 CEPH_MDS_OP_MKNOD = 0x01201,
267 CEPH_MDS_OP_LINK = 0x01202,
268 CEPH_MDS_OP_UNLINK = 0x01203,
269 CEPH_MDS_OP_RENAME = 0x01204,
270 CEPH_MDS_OP_MKDIR = 0x01220,
271 CEPH_MDS_OP_RMDIR = 0x01221,
272 CEPH_MDS_OP_SYMLINK = 0x01222,
273
274 CEPH_MDS_OP_CREATE = 0x01301,
275 CEPH_MDS_OP_OPEN = 0x00302,
276 CEPH_MDS_OP_READDIR = 0x00305,
277
278 CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
279 CEPH_MDS_OP_MKSNAP = 0x01400,
280 CEPH_MDS_OP_RMSNAP = 0x01401,
281 CEPH_MDS_OP_LSSNAP = 0x00402,
282};
283
284extern const char *ceph_mds_op_name(int op);
285
286
287#define CEPH_SETATTR_MODE 1
288#define CEPH_SETATTR_UID 2
289#define CEPH_SETATTR_GID 4
290#define CEPH_SETATTR_MTIME 8
291#define CEPH_SETATTR_ATIME 16
292#define CEPH_SETATTR_SIZE 32
293#define CEPH_SETATTR_CTIME 64
294
295union ceph_mds_request_args {
296 struct {
297 __le32 mask; /* CEPH_CAP_* */
298 } __attribute__ ((packed)) getattr;
299 struct {
300 __le32 mode;
301 __le32 uid;
302 __le32 gid;
303 struct ceph_timespec mtime;
304 struct ceph_timespec atime;
305 __le64 size, old_size; /* old_size needed by truncate */
306 __le32 mask; /* CEPH_SETATTR_* */
307 } __attribute__ ((packed)) setattr;
308 struct {
309 __le32 frag; /* which dir fragment */
310 __le32 max_entries; /* how many dentries to grab */
311 } __attribute__ ((packed)) readdir;
312 struct {
313 __le32 mode;
314 __le32 rdev;
315 } __attribute__ ((packed)) mknod;
316 struct {
317 __le32 mode;
318 } __attribute__ ((packed)) mkdir;
319 struct {
320 __le32 flags;
321 __le32 mode;
322 __le32 stripe_unit; /* layout for newly created file */
323 __le32 stripe_count; /* ... */
324 __le32 object_size;
325 __le32 file_replication;
326 __le32 preferred;
327 } __attribute__ ((packed)) open;
328 struct {
329 __le32 flags;
330 } __attribute__ ((packed)) setxattr;
331 struct {
332 struct ceph_file_layout layout;
333 } __attribute__ ((packed)) setlayout;
334} __attribute__ ((packed));
335
336#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
337#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
338
339struct ceph_mds_request_head {
340 __le64 oldest_client_tid;
341 __le32 mdsmap_epoch; /* on client */
342 __le32 flags; /* CEPH_MDS_FLAG_* */
343 __u8 num_retry, num_fwd; /* count retry, fwd attempts */
344 __le16 num_releases; /* # include cap/lease release records */
345 __le32 op; /* mds op code */
346 __le32 caller_uid, caller_gid;
347 __le64 ino; /* use this ino for openc, mkdir, mknod,
348 etc. (if replaying) */
349 union ceph_mds_request_args args;
350} __attribute__ ((packed));
351
352/* cap/lease release record */
353struct ceph_mds_request_release {
354 __le64 ino, cap_id; /* ino and unique cap id */
355 __le32 caps, wanted; /* new issued, wanted */
356 __le32 seq, issue_seq, mseq;
357 __le32 dname_seq; /* if releasing a dentry lease, a */
358 __le32 dname_len; /* string follows. */
359} __attribute__ ((packed));
360
361/* client reply */
362struct ceph_mds_reply_head {
363 __le32 op;
364 __le32 result;
365 __le32 mdsmap_epoch;
366 __u8 safe; /* true if committed to disk */
367 __u8 is_dentry, is_target; /* true if dentry, target inode records
368 are included with reply */
369} __attribute__ ((packed));
370
371/* one for each node split */
372struct ceph_frag_tree_split {
373 __le32 frag; /* this frag splits... */
374 __le32 by; /* ...by this many bits */
375} __attribute__ ((packed));
376
377struct ceph_frag_tree_head {
378 __le32 nsplits; /* num ceph_frag_tree_split records */
379 struct ceph_frag_tree_split splits[];
380} __attribute__ ((packed));
381
382/* capability issue, for bundling with mds reply */
383struct ceph_mds_reply_cap {
384 __le32 caps, wanted; /* caps issued, wanted */
385 __le64 cap_id;
386 __le32 seq, mseq;
387 __le64 realm; /* snap realm */
388 __u8 flags; /* CEPH_CAP_FLAG_* */
389} __attribute__ ((packed));
390
391#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
392
393/* inode record, for bundling with mds reply */
394struct ceph_mds_reply_inode {
395 __le64 ino;
396 __le64 snapid;
397 __le32 rdev;
398 __le64 version; /* inode version */
399 __le64 xattr_version; /* version for xattr blob */
400 struct ceph_mds_reply_cap cap; /* caps issued for this inode */
401 struct ceph_file_layout layout;
402 struct ceph_timespec ctime, mtime, atime;
403 __le32 time_warp_seq;
404 __le64 size, max_size, truncate_size;
405 __le32 truncate_seq;
406 __le32 mode, uid, gid;
407 __le32 nlink;
408 __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
409 struct ceph_timespec rctime;
410 struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
411} __attribute__ ((packed));
412/* followed by frag array, then symlink string, then xattr blob */
413
414/* reply_lease follows dname, and reply_inode */
415struct ceph_mds_reply_lease {
416 __le16 mask; /* lease type(s) */
417 __le32 duration_ms; /* lease duration */
418 __le32 seq;
419} __attribute__ ((packed));
420
421struct ceph_mds_reply_dirfrag {
422 __le32 frag; /* fragment */
423 __le32 auth; /* auth mds, if this is a delegation point */
424 __le32 ndist; /* number of mds' this is replicated on */
425 __le32 dist[];
426} __attribute__ ((packed));
427
428/* file access modes */
429#define CEPH_FILE_MODE_PIN 0
430#define CEPH_FILE_MODE_RD 1
431#define CEPH_FILE_MODE_WR 2
432#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
433#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
434#define CEPH_FILE_MODE_NUM 8 /* because these are bit fields, mostly */
435
436int ceph_flags_to_mode(int flags);
437
438
439/* capability bits */
440#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
441
442/* generic cap bits */
443#define CEPH_CAP_GSHARED 1 /* client can read */
444#define CEPH_CAP_GEXCL 2 /* client can read and update */
445#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
446#define CEPH_CAP_GRD 8 /* (file) client can read */
447#define CEPH_CAP_GWR 16 /* (file) client can write */
448#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
449#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
450#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
451
452/* per-lock shift */
453#define CEPH_CAP_SAUTH 2
454#define CEPH_CAP_SLINK 4
455#define CEPH_CAP_SXATTR 6
456#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
457
458#define CEPH_CAP_BITS 16
459
460/* composed values */
461#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
462#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
463#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
464#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
465#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
466#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
467#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
468#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
469#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
470#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
471#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
472#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
473#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
474#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
475#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
476
477/* cap masks (for getattr) */
478#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
479#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
480#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
481#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
482#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
483#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
484#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
485#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
486#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
487#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
488#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
489#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
490#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
491 CEPH_CAP_AUTH_SHARED | \
492 CEPH_CAP_LINK_SHARED | \
493 CEPH_CAP_FILE_SHARED | \
494 CEPH_CAP_XATTR_SHARED)
495
496#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
497 CEPH_CAP_LINK_SHARED | \
498 CEPH_CAP_XATTR_SHARED | \
499 CEPH_CAP_FILE_SHARED)
500#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
501 CEPH_CAP_FILE_CACHE)
502
503#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
504 CEPH_CAP_LINK_EXCL | \
505 CEPH_CAP_XATTR_EXCL | \
506 CEPH_CAP_FILE_EXCL)
507#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
508 CEPH_CAP_FILE_EXCL)
509#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
510#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
511 CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
512
513#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
514 CEPH_LOCK_IXATTR)
515
516int ceph_caps_for_mode(int mode);
517
518enum {
519 CEPH_CAP_OP_GRANT, /* mds->client grant */
520 CEPH_CAP_OP_REVOKE, /* mds->client revoke */
521 CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
522 CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
523 CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
524 CEPH_CAP_OP_UPDATE, /* client->mds update */
525 CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
526 CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
527 CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
528 CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
529 CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
530 CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
531 CEPH_CAP_OP_RENEW, /* client->mds renewal request */
532};
533
534extern const char *ceph_cap_op_name(int op);
535
536/*
537 * caps message, used for capability callbacks, acks, requests, etc.
538 */
539struct ceph_mds_caps {
540 __le32 op; /* CEPH_CAP_OP_* */
541 __le64 ino, realm;
542 __le64 cap_id;
543 __le32 seq, issue_seq;
544 __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
545 __le32 migrate_seq;
546 __le64 snap_follows;
547 __le32 snap_trace_len;
548
549 /* authlock */
550 __le32 uid, gid, mode;
551
552 /* linklock */
553 __le32 nlink;
554
555 /* xattrlock */
556 __le32 xattr_len;
557 __le64 xattr_version;
558
559 /* filelock */
560 __le64 size, max_size, truncate_size;
561 __le32 truncate_seq;
562 struct ceph_timespec mtime, atime, ctime;
563 struct ceph_file_layout layout;
564 __le32 time_warp_seq;
565} __attribute__ ((packed));
566
567/* cap release msg head */
568struct ceph_mds_cap_release {
569 __le32 num; /* number of cap_items that follow */
570} __attribute__ ((packed));
571
572struct ceph_mds_cap_item {
573 __le64 ino;
574 __le64 cap_id;
575 __le32 migrate_seq, seq;
576} __attribute__ ((packed));
577
578#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
579#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
580#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
581#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
582
583extern const char *ceph_lease_op_name(int o);
584
585/* lease msg header */
586struct ceph_mds_lease {
587 __u8 action; /* CEPH_MDS_LEASE_* */
588 __le16 mask; /* which lease */
589 __le64 ino;
590 __le64 first, last; /* snap range */
591 __le32 seq;
592 __le32 duration_ms; /* duration of renewal */
593} __attribute__ ((packed));
594/* followed by a __le32+string for dname */
595
596/* client reconnect */
597struct ceph_mds_cap_reconnect {
598 __le64 cap_id;
599 __le32 wanted;
600 __le32 issued;
601 __le64 size;
602 struct ceph_timespec mtime, atime;
603 __le64 snaprealm;
604 __le64 pathbase; /* base ino for our path to this ino */
605} __attribute__ ((packed));
606/* followed by encoded string */
607
608struct ceph_mds_snaprealm_reconnect {
609 __le64 ino; /* snap realm base */
610 __le64 seq; /* snap seq for this snap realm */
611 __le64 parent; /* parent realm */
612} __attribute__ ((packed));
613
614/*
615 * snaps
616 */
617enum {
618 CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
619 CEPH_SNAP_OP_CREATE,
620 CEPH_SNAP_OP_DESTROY,
621 CEPH_SNAP_OP_SPLIT,
622};
623
624extern const char *ceph_snap_op_name(int o);
625
626/* snap msg header */
627struct ceph_mds_snap_head {
628 __le32 op; /* CEPH_SNAP_OP_* */
629 __le64 split; /* ino to split off, if any */
630 __le32 num_split_inos; /* # inos belonging to new child realm */
631 __le32 num_split_realms; /* # child realms under new child realm */
632 __le32 trace_len; /* size of snap trace blob */
633} __attribute__ ((packed));
634/* followed by split ino list, then split realms, then the trace blob */
635
636/*
637 * encode info about a snaprealm, as viewed by a client
638 */
639struct ceph_mds_snap_realm {
640 __le64 ino; /* ino */
641 __le64 created; /* snap: when created */
642 __le64 parent; /* ino: parent realm */
643 __le64 parent_since; /* snap: same parent since */
644 __le64 seq; /* snap: version */
645 __le32 num_snaps;
646 __le32 num_prior_parent_snaps;
647} __attribute__ ((packed));
648/* followed by my snap list, then prior parent snap list */
649
650#endif
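
The composed cap values follow mechanically from the generic bits and the per-lock shifts, and CEPH_VERSION stringifies to "0.19.0". A compile-time sanity sketch (these would live inside a function body):

    BUILD_BUG_ON(CEPH_CAP_AUTH_SHARED != 0x0004); /* GSHARED(1)  << SAUTH(2)  */
    BUILD_BUG_ON(CEPH_CAP_XATTR_EXCL  != 0x0080); /* GEXCL(2)    << SXATTR(6) */
    BUILD_BUG_ON(CEPH_CAP_FILE_RD     != 0x0800); /* GRD(8)      << SFILE(8)  */
    BUILD_BUG_ON(CEPH_CAP_FILE_BUFFER != 0x2000); /* GBUFFER(32) << SFILE(8)  */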
diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 000000000000..bd570015d147
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
1
2#include "types.h"
3
4/*
5 * Robert Jenkins' hash function.
6 * http://burtleburtle.net/bob/hash/evahash.html
7 * This is in the public domain.
8 */
9#define mix(a, b, c) \
10 do { \
11 a = a - b; a = a - c; a = a ^ (c >> 13); \
12 b = b - c; b = b - a; b = b ^ (a << 8); \
13 c = c - a; c = c - b; c = c ^ (b >> 13); \
14 a = a - b; a = a - c; a = a ^ (c >> 12); \
15 b = b - c; b = b - a; b = b ^ (a << 16); \
16 c = c - a; c = c - b; c = c ^ (b >> 5); \
17 a = a - b; a = a - c; a = a ^ (c >> 3); \
18 b = b - c; b = b - a; b = b ^ (a << 10); \
19 c = c - a; c = c - b; c = c ^ (b >> 15); \
20 } while (0)
21
22unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
23{
24 const unsigned char *k = (const unsigned char *)str;
25 __u32 a, b, c; /* the internal state */
26 __u32 len; /* how many key bytes still need mixing */
27
28 /* Set up the internal state */
29 len = length;
30 a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
31 b = a;
32 c = 0; /* variable initialization of internal state */
33
34 /* handle most of the key */
35 while (len >= 12) {
36 a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
37 ((__u32)k[3] << 24));
38 b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
39 ((__u32)k[7] << 24));
40 c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
41 ((__u32)k[11] << 24));
42 mix(a, b, c);
43 k = k + 12;
44 len = len - 12;
45 }
46
47 /* handle the last 11 bytes */
48 c = c + length;
49 switch (len) { /* all the case statements fall through */
50 case 11:
51 c = c + ((__u32)k[10] << 24);
52 case 10:
53 c = c + ((__u32)k[9] << 16);
54 case 9:
55 c = c + ((__u32)k[8] << 8);
56 /* the first byte of c is reserved for the length */
57 case 8:
58 b = b + ((__u32)k[7] << 24);
59 case 7:
60 b = b + ((__u32)k[6] << 16);
61 case 6:
62 b = b + ((__u32)k[5] << 8);
63 case 5:
64 b = b + k[4];
65 case 4:
66 a = a + ((__u32)k[3] << 24);
67 case 3:
68 a = a + ((__u32)k[2] << 16);
69 case 2:
70 a = a + ((__u32)k[1] << 8);
71 case 1:
72 a = a + k[0];
73 /* case 0: nothing left to add */
74 }
75 mix(a, b, c);
76
77 return c;
78}
79
80/*
81 * linux dcache hash
82 */
83unsigned ceph_str_hash_linux(const char *str, unsigned length)
84{
85 unsigned long hash = 0;
86 unsigned char c;
87
88 while (length--) {
89 c = *str++;
90 hash = (hash + (c << 4) + (c >> 4)) * 11;
91 }
92 return hash;
93}
94
95
96unsigned ceph_str_hash(int type, const char *s, unsigned len)
97{
98 switch (type) {
99 case CEPH_STR_HASH_LINUX:
100 return ceph_str_hash_linux(s, len);
101 case CEPH_STR_HASH_RJENKINS:
102 return ceph_str_hash_rjenkins(s, len);
103 default:
104 return -1;
105 }
106}
107
108const char *ceph_str_hash_name(int type)
109{
110 switch (type) {
111 case CEPH_STR_HASH_LINUX:
112 return "linux";
113 case CEPH_STR_HASH_RJENKINS:
114 return "rjenkins";
115 default:
116 return "unknown";
117 }
118}
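
Typical use hashes a dentry name to pick a directory fragment; the hash type normally comes from the on-disk dir layout, and "foo" here is just a stand-in:

    const char *name = "foo";
    unsigned hash = ceph_str_hash(CEPH_STR_HASH_RJENKINS1, name, 3);
    /* ceph_frag_contains_value(frag, hash) then finds the owning fragment */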
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 000000000000..5ac470c433c9
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
1#ifndef _FS_CEPH_HASH_H
2#define _FS_CEPH_HASH_H
3
4#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
5#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
6
7extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
8extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
9
10extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
11extern const char *ceph_str_hash_name(int type);
12
13#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 000000000000..8e4be6a80c62
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
1/*
2 * Ceph string constants
3 */
4#include "types.h"
5
6const char *ceph_entity_type_name(int type)
7{
8 switch (type) {
9 case CEPH_ENTITY_TYPE_MDS: return "mds";
10 case CEPH_ENTITY_TYPE_OSD: return "osd";
11 case CEPH_ENTITY_TYPE_MON: return "mon";
12 case CEPH_ENTITY_TYPE_CLIENT: return "client";
13 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
14 case CEPH_ENTITY_TYPE_AUTH: return "auth";
15 default: return "unknown";
16 }
17}
18
19const char *ceph_osd_op_name(int op)
20{
21 switch (op) {
22 case CEPH_OSD_OP_READ: return "read";
23 case CEPH_OSD_OP_STAT: return "stat";
24
25 case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
26
27 case CEPH_OSD_OP_WRITE: return "write";
28 case CEPH_OSD_OP_DELETE: return "delete";
29 case CEPH_OSD_OP_TRUNCATE: return "truncate";
30 case CEPH_OSD_OP_ZERO: return "zero";
31 case CEPH_OSD_OP_WRITEFULL: return "writefull";
32
33 case CEPH_OSD_OP_APPEND: return "append";
34 case CEPH_OSD_OP_STARTSYNC: return "startsync";
35 case CEPH_OSD_OP_SETTRUNC: return "settrunc";
36 case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
37
38 case CEPH_OSD_OP_TMAPUP: return "tmapup";
39 case CEPH_OSD_OP_TMAPGET: return "tmapget";
40 case CEPH_OSD_OP_TMAPPUT: return "tmapput";
41
42 case CEPH_OSD_OP_GETXATTR: return "getxattr";
43 case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
44 case CEPH_OSD_OP_SETXATTR: return "setxattr";
45 case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
46 case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
47 case CEPH_OSD_OP_RMXATTR: return "rmxattr";
48
49 case CEPH_OSD_OP_PULL: return "pull";
50 case CEPH_OSD_OP_PUSH: return "push";
51 case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
52 case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
53 case CEPH_OSD_OP_SCRUB: return "scrub";
54
55 case CEPH_OSD_OP_WRLOCK: return "wrlock";
56 case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
57 case CEPH_OSD_OP_RDLOCK: return "rdlock";
58 case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
59 case CEPH_OSD_OP_UPLOCK: return "uplock";
60 case CEPH_OSD_OP_DNLOCK: return "dnlock";
61
62 case CEPH_OSD_OP_CALL: return "call";
63
64 case CEPH_OSD_OP_PGLS: return "pgls";
65 }
66 return "???";
67}
68
69const char *ceph_mds_state_name(int s)
70{
71 switch (s) {
72 /* down and out */
73 case CEPH_MDS_STATE_DNE: return "down:dne";
74 case CEPH_MDS_STATE_STOPPED: return "down:stopped";
75 /* up and out */
76 case CEPH_MDS_STATE_BOOT: return "up:boot";
77 case CEPH_MDS_STATE_STANDBY: return "up:standby";
78 case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
79 case CEPH_MDS_STATE_CREATING: return "up:creating";
80 case CEPH_MDS_STATE_STARTING: return "up:starting";
81 /* up and in */
82 case CEPH_MDS_STATE_REPLAY: return "up:replay";
83 case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
84 case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
85 case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
86 case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
87 case CEPH_MDS_STATE_ACTIVE: return "up:active";
88 case CEPH_MDS_STATE_STOPPING: return "up:stopping";
89 }
90 return "???";
91}
92
93const char *ceph_session_op_name(int op)
94{
95 switch (op) {
96 case CEPH_SESSION_REQUEST_OPEN: return "request_open";
97 case CEPH_SESSION_OPEN: return "open";
98 case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
99 case CEPH_SESSION_CLOSE: return "close";
100 case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
101 case CEPH_SESSION_RENEWCAPS: return "renewcaps";
102 case CEPH_SESSION_STALE: return "stale";
103 case CEPH_SESSION_RECALL_STATE: return "recall_state";
104 }
105 return "???";
106}
107
108const char *ceph_mds_op_name(int op)
109{
110 switch (op) {
111 case CEPH_MDS_OP_LOOKUP: return "lookup";
112 case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
113 case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
114 case CEPH_MDS_OP_GETATTR: return "getattr";
115 case CEPH_MDS_OP_SETXATTR: return "setxattr";
116 case CEPH_MDS_OP_SETATTR: return "setattr";
117 case CEPH_MDS_OP_RMXATTR: return "rmxattr";
118 case CEPH_MDS_OP_READDIR: return "readdir";
119 case CEPH_MDS_OP_MKNOD: return "mknod";
120 case CEPH_MDS_OP_LINK: return "link";
121 case CEPH_MDS_OP_UNLINK: return "unlink";
122 case CEPH_MDS_OP_RENAME: return "rename";
123 case CEPH_MDS_OP_MKDIR: return "mkdir";
124 case CEPH_MDS_OP_RMDIR: return "rmdir";
125 case CEPH_MDS_OP_SYMLINK: return "symlink";
126 case CEPH_MDS_OP_CREATE: return "create";
127 case CEPH_MDS_OP_OPEN: return "open";
128 case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
129 case CEPH_MDS_OP_LSSNAP: return "lssnap";
130 case CEPH_MDS_OP_MKSNAP: return "mksnap";
131 case CEPH_MDS_OP_RMSNAP: return "rmsnap";
132 }
133 return "???";
134}
135
136const char *ceph_cap_op_name(int op)
137{
138 switch (op) {
139 case CEPH_CAP_OP_GRANT: return "grant";
140 case CEPH_CAP_OP_REVOKE: return "revoke";
141 case CEPH_CAP_OP_TRUNC: return "trunc";
142 case CEPH_CAP_OP_EXPORT: return "export";
143 case CEPH_CAP_OP_IMPORT: return "import";
144 case CEPH_CAP_OP_UPDATE: return "update";
145 case CEPH_CAP_OP_DROP: return "drop";
146 case CEPH_CAP_OP_FLUSH: return "flush";
147 case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
148 case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
149 case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
150 case CEPH_CAP_OP_RELEASE: return "release";
151 case CEPH_CAP_OP_RENEW: return "renew";
152 }
153 return "???";
154}
155
156const char *ceph_lease_op_name(int o)
157{
158 switch (o) {
159 case CEPH_MDS_LEASE_REVOKE: return "revoke";
160 case CEPH_MDS_LEASE_RELEASE: return "release";
161 case CEPH_MDS_LEASE_RENEW: return "renew";
162 case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
163 }
164 return "???";
165}
166
167const char *ceph_snap_op_name(int o)
168{
169 switch (o) {
170 case CEPH_SNAP_OP_UPDATE: return "update";
171 case CEPH_SNAP_OP_CREATE: return "create";
172 case CEPH_SNAP_OP_DESTROY: return "destroy";
173 case CEPH_SNAP_OP_SPLIT: return "split";
174 }
175 return "???";
176}
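
These tables exist for debug output; a typical consumer is a dout() one level up, e.g. (op and ino are stand-ins):

    dout("handle_caps op %s ino %llx\n",
         ceph_cap_op_name(op), (unsigned long long)ino);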
diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 000000000000..fabd302e5779
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
1
2#ifdef __KERNEL__
3# include <linux/slab.h>
4#else
5# include <stdlib.h>
6# include <assert.h>
7# define kfree(x) do { if (x) free(x); } while (0)
8# define BUG_ON(x) assert(!(x))
9#endif
10
11#include "crush.h"
12
13const char *crush_bucket_alg_name(int alg)
14{
15 switch (alg) {
16 case CRUSH_BUCKET_UNIFORM: return "uniform";
17 case CRUSH_BUCKET_LIST: return "list";
18 case CRUSH_BUCKET_TREE: return "tree";
19 case CRUSH_BUCKET_STRAW: return "straw";
20 default: return "unknown";
21 }
22}
23
24/**
25 * crush_get_bucket_item_weight - Get weight of an item in given bucket
26 * @b: bucket pointer
27 * @p: item index in bucket
28 */
29int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
30{
31 if (p >= b->size)
32 return 0;
33
34 switch (b->alg) {
35 case CRUSH_BUCKET_UNIFORM:
36 return ((struct crush_bucket_uniform *)b)->item_weight;
37 case CRUSH_BUCKET_LIST:
38 return ((struct crush_bucket_list *)b)->item_weights[p];
39 case CRUSH_BUCKET_TREE:
40 if (p & 1)
41 return ((struct crush_bucket_tree *)b)->node_weights[p];
42 return 0;
43 case CRUSH_BUCKET_STRAW:
44 return ((struct crush_bucket_straw *)b)->item_weights[p];
45 }
46 return 0;
47}
48
49/**
50 * crush_calc_parents - Calculate parent vectors for the given crush map.
51 * @map: crush_map pointer
52 */
53void crush_calc_parents(struct crush_map *map)
54{
55 int i, b, c;
56
57 for (b = 0; b < map->max_buckets; b++) {
58 if (map->buckets[b] == NULL)
59 continue;
60 for (i = 0; i < map->buckets[b]->size; i++) {
61 c = map->buckets[b]->items[i];
62 BUG_ON(c >= map->max_devices ||
63 c < -map->max_buckets);
64 if (c >= 0)
65 map->device_parents[c] = map->buckets[b]->id;
66 else
67 map->bucket_parents[-1-c] = map->buckets[b]->id;
68 }
69 }
70}
71
72void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
73{
74 kfree(b->h.perm);
75 kfree(b->h.items);
76 kfree(b);
77}
78
79void crush_destroy_bucket_list(struct crush_bucket_list *b)
80{
81 kfree(b->item_weights);
82 kfree(b->sum_weights);
83 kfree(b->h.perm);
84 kfree(b->h.items);
85 kfree(b);
86}
87
88void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
89{
90 kfree(b->node_weights);
91 kfree(b);
92}
93
94void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
95{
96 kfree(b->straws);
97 kfree(b->item_weights);
98 kfree(b->h.perm);
99 kfree(b->h.items);
100 kfree(b);
101}
102
103void crush_destroy_bucket(struct crush_bucket *b)
104{
105 switch (b->alg) {
106 case CRUSH_BUCKET_UNIFORM:
107 crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);
108 break;
109 case CRUSH_BUCKET_LIST:
110 crush_destroy_bucket_list((struct crush_bucket_list *)b);
111 break;
112 case CRUSH_BUCKET_TREE:
113 crush_destroy_bucket_tree((struct crush_bucket_tree *)b);
114 break;
115 case CRUSH_BUCKET_STRAW:
116 crush_destroy_bucket_straw((struct crush_bucket_straw *)b);
117 break;
118 }
119}
120
121/**
122 * crush_destroy - Destroy a crush_map
123 * @map: crush_map pointer
124 */
125void crush_destroy(struct crush_map *map)
126{
127 int b;
128
129 /* buckets */
130 if (map->buckets) {
131 for (b = 0; b < map->max_buckets; b++) {
132 if (map->buckets[b] == NULL)
133 continue;
134 crush_destroy_bucket(map->buckets[b]);
135 }
136 kfree(map->buckets);
137 }
138
139 /* rules */
140 if (map->rules) {
141 for (b = 0; b < map->max_rules; b++)
142 kfree(map->rules[b]);
143 kfree(map->rules);
144 }
145
146 kfree(map->bucket_parents);
147 kfree(map->device_parents);
148 kfree(map);
149}
150
151
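After crush_calc_parents() has filled the arrays, walking one level up from a device is two lookups; a minimal sketch assuming device id 7 exists in the map:

    crush_calc_parents(map);

    /* bucket ids are negative; device_parents[] is indexed by device id */
    __s32 parent_id = (__s32)map->device_parents[7];
    struct crush_bucket *parent = map->buckets[-1 - parent_id];
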
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 000000000000..dcd7e7523700
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
1#ifndef _CRUSH_CRUSH_H
2#define _CRUSH_CRUSH_H
3
4#include <linux/types.h>
5
6/*
7 * CRUSH is a pseudo-random data distribution algorithm that
8 * efficiently distributes input values (typically, data objects)
9 * across a heterogeneous, structured storage cluster.
10 *
11 * The algorithm was originally described in detail in this paper
12 * (although the algorithm has evolved somewhat since then):
13 *
14 * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf
15 *
16 * LGPL2
17 */
18
19
20#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
21
22
23#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
24#define CRUSH_MAX_SET 10 /* max size of a mapping result */
25
26
27/*
28 * CRUSH uses user-defined "rules" to describe how inputs should be
29 * mapped to devices. A rule consists of sequence of steps to perform
30 * to generate the set of output devices.
31 */
32struct crush_rule_step {
33 __u32 op;
34 __s32 arg1;
35 __s32 arg2;
36};
37
38/* step op codes */
39enum {
40 CRUSH_RULE_NOOP = 0,
41 CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
42 CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
43 /* arg2 = type */
44 CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
45 CRUSH_RULE_EMIT = 4, /* no args */
46 CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
47 CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
48};
49
50/*
51 * for specifying choose num (arg1) relative to the max parameter
52 * passed to do_rule
53 */
54#define CRUSH_CHOOSE_N 0
55#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
56
57/*
58 * The rule mask is used to describe what the rule is intended for.
59 * Given a ruleset and size of output set, we search through the
60 * rule list for a matching rule_mask.
61 */
62struct crush_rule_mask {
63 __u8 ruleset;
64 __u8 type;
65 __u8 min_size;
66 __u8 max_size;
67};
68
69struct crush_rule {
70 __u32 len;
71 struct crush_rule_mask mask;
72 struct crush_rule_step steps[0];
73};
74
75#define crush_rule_size(len) (sizeof(struct crush_rule) + \
76 (len)*sizeof(struct crush_rule_step))
77
78
79
80/*
81 * A bucket is a named container of other items (either devices or
82 * other buckets). Items within a bucket are chosen using one of a
83 * few different algorithms. The table summarizes how the speed of
84 * each option measures up against mapping stability when items are
85 * added or removed.
86 *
87 * Bucket Alg Speed Additions Removals
88 * ------------------------------------------------
89 * uniform O(1) poor poor
90 * list O(n) optimal poor
91 * tree O(log n) good good
92 * straw O(n) optimal optimal
93 */
94enum {
95 CRUSH_BUCKET_UNIFORM = 1,
96 CRUSH_BUCKET_LIST = 2,
97 CRUSH_BUCKET_TREE = 3,
98 CRUSH_BUCKET_STRAW = 4
99};
100extern const char *crush_bucket_alg_name(int alg);
101
102struct crush_bucket {
103 __s32 id; /* this'll be negative */
104 __u16 type; /* non-zero; type=0 is reserved for devices */
105 __u8 alg; /* one of CRUSH_BUCKET_* */
106 __u8 hash; /* which hash function to use, CRUSH_HASH_* */
107 __u32 weight; /* 16-bit fixed point */
108 __u32 size; /* num items */
109 __s32 *items;
110
111 /*
112 * cached random permutation: used for uniform bucket and for
113 * the linear search fallback for the other bucket types.
114 */
115 __u32 perm_x; /* @x for which *perm is defined */
116 __u32 perm_n; /* num elements of *perm that are permuted/defined */
117 __u32 *perm;
118};
119
120struct crush_bucket_uniform {
121 struct crush_bucket h;
122 __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
123};
124
125struct crush_bucket_list {
126 struct crush_bucket h;
127 __u32 *item_weights; /* 16-bit fixed point */
128 __u32 *sum_weights; /* 16-bit fixed point. element i is sum
129 of weights 0..i, inclusive */
130};
131
132struct crush_bucket_tree {
133 struct crush_bucket h; /* note: h.size is _tree_ size, not number of
134 actual items */
135 __u8 num_nodes;
136 __u32 *node_weights;
137};
138
139struct crush_bucket_straw {
140 struct crush_bucket h;
141 __u32 *item_weights; /* 16-bit fixed point */
142 __u32 *straws; /* 16-bit fixed point */
143};
144
145
146
147/*
148 * CRUSH map includes all buckets, rules, etc.
149 */
150struct crush_map {
151 struct crush_bucket **buckets;
152 struct crush_rule **rules;
153
154 /*
155 * Parent pointers to identify the parent bucket of a device or
156 * bucket in the hierarchy. If an item appears more than
157 * once, this is the _last_ time it appeared (where buckets
158 * are processed in bucket id order, from -1 on down to
159 * -max_buckets).
160 */
161 __u32 *bucket_parents;
162 __u32 *device_parents;
163
164 __s32 max_buckets;
165 __u32 max_rules;
166 __s32 max_devices;
167};
168
169
170/* crush.c */
171extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
172extern void crush_calc_parents(struct crush_map *map);
173extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
174extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
175extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
176extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
177extern void crush_destroy_bucket(struct crush_bucket *b);
178extern void crush_destroy(struct crush_map *map);
179
180#endif
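
Item weights live in per-algorithm arrays, which is why crush_get_bucket_item_weight() switches on b->alg. A small sketch that dumps a bucket (weights are 16-bit fixed point, so 0x10000 is 1.0):

    static void dump_bucket(struct crush_bucket *b)
    {
            __u32 i;

            printk(KERN_DEBUG "bucket %d (%s) size %u\n",
                   b->id, crush_bucket_alg_name(b->alg), b->size);
            for (i = 0; i < b->size; i++)
                    printk(KERN_DEBUG "  item %d weight 0x%x\n",
                           b->items[i], crush_get_bucket_item_weight(b, i));
    }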
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 000000000000..5873aed694bf
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
1
2#include <linux/types.h>
3#include "hash.h"
4
5/*
6 * Robert Jenkins' function for mixing 32-bit values
7 * http://burtleburtle.net/bob/hash/evahash.html
8 * a, b = random bits, c = input and output
9 */
10#define crush_hashmix(a, b, c) do { \
11 a = a-b; a = a-c; a = a^(c>>13); \
12 b = b-c; b = b-a; b = b^(a<<8); \
13 c = c-a; c = c-b; c = c^(b>>13); \
14 a = a-b; a = a-c; a = a^(c>>12); \
15 b = b-c; b = b-a; b = b^(a<<16); \
16 c = c-a; c = c-b; c = c^(b>>5); \
17 a = a-b; a = a-c; a = a^(c>>3); \
18 b = b-c; b = b-a; b = b^(a<<10); \
19 c = c-a; c = c-b; c = c^(b>>15); \
20 } while (0)
21
22#define crush_hash_seed 1315423911
23
24static __u32 crush_hash32_rjenkins1(__u32 a)
25{
26 __u32 hash = crush_hash_seed ^ a;
27 __u32 b = a;
28 __u32 x = 231232;
29 __u32 y = 1232;
30 crush_hashmix(b, x, hash);
31 crush_hashmix(y, a, hash);
32 return hash;
33}
34
35static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
36{
37 __u32 hash = crush_hash_seed ^ a ^ b;
38 __u32 x = 231232;
39 __u32 y = 1232;
40 crush_hashmix(a, b, hash);
41 crush_hashmix(x, a, hash);
42 crush_hashmix(b, y, hash);
43 return hash;
44}
45
46static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
47{
48 __u32 hash = crush_hash_seed ^ a ^ b ^ c;
49 __u32 x = 231232;
50 __u32 y = 1232;
51 crush_hashmix(a, b, hash);
52 crush_hashmix(c, x, hash);
53 crush_hashmix(y, a, hash);
54 crush_hashmix(b, x, hash);
55 crush_hashmix(y, c, hash);
56 return hash;
57}
58
59static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
60{
61 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
62 __u32 x = 231232;
63 __u32 y = 1232;
64 crush_hashmix(a, b, hash);
65 crush_hashmix(c, d, hash);
66 crush_hashmix(a, x, hash);
67 crush_hashmix(y, b, hash);
68 crush_hashmix(c, x, hash);
69 crush_hashmix(y, d, hash);
70 return hash;
71}
72
73static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
74 __u32 e)
75{
76 __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
77 __u32 x = 231232;
78 __u32 y = 1232;
79 crush_hashmix(a, b, hash);
80 crush_hashmix(c, d, hash);
81 crush_hashmix(e, x, hash);
82 crush_hashmix(y, a, hash);
83 crush_hashmix(b, x, hash);
84 crush_hashmix(y, c, hash);
85 crush_hashmix(d, x, hash);
86 crush_hashmix(y, e, hash);
87 return hash;
88}
89
90
91__u32 crush_hash32(int type, __u32 a)
92{
93 switch (type) {
94 case CRUSH_HASH_RJENKINS1:
95 return crush_hash32_rjenkins1(a);
96 default:
97 return 0;
98 }
99}
100
101__u32 crush_hash32_2(int type, __u32 a, __u32 b)
102{
103 switch (type) {
104 case CRUSH_HASH_RJENKINS1:
105 return crush_hash32_rjenkins1_2(a, b);
106 default:
107 return 0;
108 }
109}
110
111__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
112{
113 switch (type) {
114 case CRUSH_HASH_RJENKINS1:
115 return crush_hash32_rjenkins1_3(a, b, c);
116 default:
117 return 0;
118 }
119}
120
121__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
122{
123 switch (type) {
124 case CRUSH_HASH_RJENKINS1:
125 return crush_hash32_rjenkins1_4(a, b, c, d);
126 default:
127 return 0;
128 }
129}
130
131__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
132{
133 switch (type) {
134 case CRUSH_HASH_RJENKINS1:
135 return crush_hash32_rjenkins1_5(a, b, c, d, e);
136 default:
137 return 0;
138 }
139}
140
141const char *crush_hash_name(int type)
142{
143 switch (type) {
144 case CRUSH_HASH_RJENKINS1:
145 return "rjenkins1";
146 default:
147 return "unknown";
148 }
149}
diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 000000000000..ff48e110e4bb
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
1#ifndef _CRUSH_HASH_H
2#define _CRUSH_HASH_H
3
4#define CRUSH_HASH_RJENKINS1 0
5
6#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
7
8extern const char *crush_hash_name(int type);
9
10extern __u32 crush_hash32(int type, __u32 a);
11extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
12extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
13extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
14extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
15 __u32 e);
16
17#endif
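
The multi-input variants are how the mapper mixes the crush input x, an item id, and the replica rank into one draw; the pattern (just the shape used by the straw bucket, not new code) is:

    __u64 draw = crush_hash32_3(CRUSH_HASH_RJENKINS1, x, item, r) & 0xffff;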
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 000000000000..9ba54efb6543
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
1
2#ifdef __KERNEL__
3# include <linux/string.h>
4# include <linux/slab.h>
5# include <linux/bug.h>
6# include <linux/kernel.h>
7# ifndef dprintk
8# define dprintk(args...)
9# endif
10#else
11# include <string.h>
12# include <stdio.h>
13# include <stdlib.h>
14# include <assert.h>
15# define BUG_ON(x) assert(!(x))
16# define dprintk(args...) /* printf(args) */
17# define kmalloc(x, f) malloc(x)
18# define kfree(x) free(x)
19#endif
20
21#include "crush.h"
22#include "hash.h"
23
24/*
25 * Implement the core CRUSH mapping algorithm.
26 */
27
28/**
29 * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
30 * @map: the crush_map
31 * @ruleset: the storage ruleset id (user defined)
32 * @type: storage ruleset type (user defined)
33 * @size: output set size
34 */
35int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)
36{
37 int i;
38
39 for (i = 0; i < map->max_rules; i++) {
40 if (map->rules[i] &&
41 map->rules[i]->mask.ruleset == ruleset &&
42 map->rules[i]->mask.type == type &&
43 map->rules[i]->mask.min_size <= size &&
44 map->rules[i]->mask.max_size >= size)
45 return i;
46 }
47 return -1;
48}
49
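/*
 * A minimal lookup sketch: a replicated pool of size 3 whose ruleset id
 * lives in the pool record would resolve its rule as
 *
 *	ruleno = crush_find_rule(map, pool->crush_ruleset, 1, 3);
 *
 * where 1 is the replicated pg type (pool fields assumed; see rados.h)
 * and ruleno < 0 means no rule matched.
 */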
50
51/*
52 * bucket choose methods
53 *
54 * For each bucket algorithm, we have a "choose" method that, given a
55 * crush input @x and replica position (usually, position in output set) @r,
56 * will produce an item in the bucket.
57 */
58
59/*
60 * Choose based on a random permutation of the bucket.
61 *
62 * We used to use some prime number arithmetic to do this, but it
63 * wasn't very random, and had some other bad behaviors. Instead, we
64 * calculate an actual random permutation of the bucket members.
65 * Since this is expensive, we optimize for the r=0 case, which
66 * captures the vast majority of calls.
67 */
68static int bucket_perm_choose(struct crush_bucket *bucket,
69 int x, int r)
70{
71 unsigned pr = r % bucket->size;
72 unsigned i, s;
73
74 /* start a new permutation if @x has changed */
75 if (bucket->perm_x != x || bucket->perm_n == 0) {
76 dprintk("bucket %d new x=%d\n", bucket->id, x);
77 bucket->perm_x = x;
78
79 /* optimize common r=0 case */
80 if (pr == 0) {
81 s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
82 bucket->size;
83 bucket->perm[0] = s;
84 bucket->perm_n = 0xffff; /* magic value, see below */
85 goto out;
86 }
87
88 for (i = 0; i < bucket->size; i++)
89 bucket->perm[i] = i;
90 bucket->perm_n = 0;
91 } else if (bucket->perm_n == 0xffff) {
92 /* clean up after the r=0 case above */
93 for (i = 1; i < bucket->size; i++)
94 bucket->perm[i] = i;
95 bucket->perm[bucket->perm[0]] = 0;
96 bucket->perm_n = 1;
97 }
98
99 /* calculate permutation up to pr */
100 for (i = 0; i < bucket->perm_n; i++)
101 dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
102 while (bucket->perm_n <= pr) {
103 unsigned p = bucket->perm_n;
104 /* no point in swapping the final entry */
105 if (p < bucket->size - 1) {
106 i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
107 (bucket->size - p);
108 if (i) {
109 unsigned t = bucket->perm[p + i];
110 bucket->perm[p + i] = bucket->perm[p];
111 bucket->perm[p] = t;
112 }
113 dprintk(" perm_choose swap %d with %d\n", p, p+i);
114 }
115 bucket->perm_n++;
116 }
117 for (i = 0; i < bucket->size; i++)
118 dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
119
120 s = bucket->perm[pr];
121out:
122 dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
123 bucket->size, x, r, pr, s);
124 return bucket->items[s];
125}
126
127/* uniform */
128static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
129 int x, int r)
130{
131 return bucket_perm_choose(&bucket->h, x, r);
132}
133
134/* list */
135static int bucket_list_choose(struct crush_bucket_list *bucket,
136 int x, int r)
137{
138 int i;
139
140 for (i = bucket->h.size-1; i >= 0; i--) {
141 __u64 w = crush_hash32_4(bucket->h.hash, x, bucket->h.items[i],
142 r, bucket->h.id);
143 w &= 0xffff;
144 dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
145 "sw %x rand %llx",
146 i, x, r, bucket->h.items[i], bucket->item_weights[i],
147 bucket->sum_weights[i], w);
148 w *= bucket->sum_weights[i];
149 w = w >> 16;
150 /*dprintk(" scaled %llx\n", w);*/
151 if (w < bucket->item_weights[i])
152 return bucket->h.items[i];
153 }
154
155 BUG_ON(1);
156 return 0;
157}
158
159
160/* (binary) tree */
161static int height(int n)
162{
163 int h = 0;
164 while ((n & 1) == 0) {
165 h++;
166 n = n >> 1;
167 }
168 return h;
169}
170
171static int left(int x)
172{
173 int h = height(x);
174 return x - (1 << (h-1));
175}
176
177static int right(int x)
178{
179 int h = height(x);
180 return x + (1 << (h-1));
181}
182
183static int terminal(int x)
184{
185 return x & 1;
186}
187
188static int bucket_tree_choose(struct crush_bucket_tree *bucket,
189 int x, int r)
190{
191 int n, l;
192 __u32 w;
193 __u64 t;
194
195 /* start at root */
196 n = bucket->num_nodes >> 1;
197
198 while (!terminal(n)) {
199 /* pick point in [0, w) */
200 w = bucket->node_weights[n];
201 t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
202 bucket->h.id) * (__u64)w;
203 t = t >> 32;
204
205 /* descend to the left or right? */
206 l = left(n);
207 if (t < bucket->node_weights[l])
208 n = l;
209 else
210 n = right(n);
211 }
212
213 return bucket->h.items[n >> 1];
214}
215
216
217/* straw */
218
219static int bucket_straw_choose(struct crush_bucket_straw *bucket,
220 int x, int r)
221{
222 int i;
223 int high = 0;
224 __u64 high_draw = 0;
225 __u64 draw;
226
227 for (i = 0; i < bucket->h.size; i++) {
228 draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
229 draw &= 0xffff;
230 draw *= bucket->straws[i];
231 if (i == 0 || draw > high_draw) {
232 high = i;
233 high_draw = draw;
234 }
235 }
236 return bucket->h.items[high];
237}
238
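[The straw bucket gives every item an independent weighted "draw" (a 16-bit hash stretched by a precomputed straw length) and the longest straw wins, so adding or removing an item only remaps a proportional share of inputs. A sketch of just the draw, where hash16() and the straws[] values are assumptions standing in for crush_hash32_3() and the CRUSH map data:

#include <stdio.h>

static unsigned hash16(unsigned x, unsigned item, unsigned r)
{
	unsigned v = x * 73856093u ^ item * 19349663u ^ r * 83492791u;

	return (v >> 8) & 0xffff;     /* stand-in for crush_hash32_3() */
}

int main(void)
{
	unsigned long long straws[3] = { 0x10000, 0x18000, 0x8000 };
	unsigned long long draw, high_draw = 0;
	int i, high = 0;

	for (i = 0; i < 3; i++) {
		draw = (unsigned long long)hash16(1234, i, 0) * straws[i];
		if (i == 0 || draw > high_draw) {
			high = i;
			high_draw = draw;
		}
	}
	printf("chose item %d\n", high);
	return 0;
}]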
239static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
240{
241 dprintk("choose %d x=%d r=%d\n", in->id, x, r);
242 switch (in->alg) {
243 case CRUSH_BUCKET_UNIFORM:
244 return bucket_uniform_choose((struct crush_bucket_uniform *)in,
245 x, r);
246 case CRUSH_BUCKET_LIST:
247 return bucket_list_choose((struct crush_bucket_list *)in,
248 x, r);
249 case CRUSH_BUCKET_TREE:
250 return bucket_tree_choose((struct crush_bucket_tree *)in,
251 x, r);
252 case CRUSH_BUCKET_STRAW:
253 return bucket_straw_choose((struct crush_bucket_straw *)in,
254 x, r);
255 default:
256 BUG_ON(1);
257 return in->items[0];
258 }
259}
260
261/*
262 * true if device is marked "out" of the cluster (i.e., failed or
263 * fully offloaded)
264 */
265static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
266{
267 if (weight[item] >= 0x10000)
268 return 0;
269 if (weight[item] == 0)
270 return 1;
271 if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
272 < weight[item])
273 return 0;
274 return 1;
275}
276
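[The weight test above is 16.16 fixed point: 0x10000 means fully in, 0 means fully out, and anything in between is a per-input probability, decided by comparing a 16-bit hash against the weight. A toy stand-alone check of that comparison (the hash value is a stand-in for crush_hash32_2()):

#include <stdio.h>

int main(void)
{
	unsigned weight = 0x8000;          /* half weight: ~50% of inputs stay in */
	unsigned hash = 0x7abc & 0xffff;   /* stand-in for crush_hash32_2(x, item) */

	printf("device is %s\n", hash < weight ? "in" : "out");
	return 0;
}]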
277/**
278 * crush_choose - choose numrep distinct items of given type
279 * @map: the crush_map
280 * @bucket: the bucket we are choosing an item from
281 * @x: crush input value
282 * @numrep: the number of items to choose
283 * @type: the type of item to choose
284 * @out: pointer to output vector
285 * @outpos: our position in that vector
286 * @firstn: true if choosing "first n" items, false if choosing "indep"
287 * @recurse_to_leaf: true if we want one device under each item of given type
288 * @out2: second output vector for leaf items (if @recurse_to_leaf)
289 */
290static int crush_choose(struct crush_map *map,
291 struct crush_bucket *bucket,
292 __u32 *weight,
293 int x, int numrep, int type,
294 int *out, int outpos,
295 int firstn, int recurse_to_leaf,
296 int *out2)
297{
298 int rep;
299 int ftotal, flocal;
300 int retry_descent, retry_bucket, skip_rep;
301 struct crush_bucket *in = bucket;
302 int r;
303 int i;
304 int item = 0;
305 int itemtype;
306 int collide, reject;
307 const int orig_tries = 5; /* attempts before we fall back to search */
308 dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
309
310 for (rep = outpos; rep < numrep; rep++) {
311 /* keep trying until we get a non-out, non-colliding item */
312 ftotal = 0;
313 skip_rep = 0;
314 do {
315 retry_descent = 0;
316 in = bucket; /* initial bucket */
317
318 /* choose through intervening buckets */
319 flocal = 0;
320 do {
321 collide = 0;
322 retry_bucket = 0;
323 r = rep;
324 if (in->alg == CRUSH_BUCKET_UNIFORM) {
325 /* be careful: remap r so retries pick new items */
326 if (firstn || numrep >= in->size)
327 /* r' = r + f_total */
328 r += ftotal;
329 else if (in->size % numrep == 0)
330 /* r'=r+(n+1)*f_local */
331 r += (numrep+1) *
332 (flocal+ftotal);
333 else
334 /* r' = r + n*f_local */
335 r += numrep * (flocal+ftotal);
336 } else {
337 if (firstn)
338 /* r' = r + f_total */
339 r += ftotal;
340 else
341 /* r' = r + n*f_local */
342 r += numrep * (flocal+ftotal);
343 }
344
345 /* bucket choose */
346 if (in->size == 0) {
347 reject = 1;
348 goto reject;
349 }
350 if (flocal >= (in->size>>1) &&
351 flocal > orig_tries)
352 item = bucket_perm_choose(in, x, r);
353 else
354 item = crush_bucket_choose(in, x, r);
355 BUG_ON(item >= map->max_devices);
356
357 /* desired type? */
358 if (item < 0)
359 itemtype = map->buckets[-1-item]->type;
360 else
361 itemtype = 0;
362 dprintk(" item %d type %d\n", item, itemtype);
363
364 /* keep going? */
365 if (itemtype != type) {
366 BUG_ON(item >= 0 ||
367 (-1-item) >= map->max_buckets);
368 in = map->buckets[-1-item];
369 continue;
370 }
371
372 /* collision? */
373 for (i = 0; i < outpos; i++) {
374 if (out[i] == item) {
375 collide = 1;
376 break;
377 }
378 }
379
380 if (recurse_to_leaf &&
381 item < 0 &&
382 crush_choose(map, map->buckets[-1-item],
383 weight,
384 x, outpos+1, 0,
385 out2, outpos,
386 firstn, 0, NULL) <= outpos) {
387 reject = 1;
388 } else {
389 /* out? */
390 if (itemtype == 0)
391 reject = is_out(map, weight,
392 item, x);
393 else
394 reject = 0;
395 }
396
397reject:
398 if (reject || collide) {
399 ftotal++;
400 flocal++;
401
402 if (collide && flocal < 3)
403 /* retry locally a few times */
404 retry_bucket = 1;
405 else if (flocal < in->size + orig_tries)
406 /* exhaustive bucket search */
407 retry_bucket = 1;
408 else if (ftotal < 20)
409 /* then retry descent */
410 retry_descent = 1;
411 else
412 /* else give up */
413 skip_rep = 1;
414 dprintk(" reject %d collide %d "
415 "ftotal %d flocal %d\n",
416 reject, collide, ftotal,
417 flocal);
418 }
419 } while (retry_bucket);
420 } while (retry_descent);
421
422 if (skip_rep) {
423 dprintk("skip rep\n");
424 continue;
425 }
426
427 dprintk("choose got %d\n", item);
428 out[outpos] = item;
429 outpos++;
430 }
431
432 dprintk("choose returns %d\n", outpos);
433 return outpos;
434}
435
436
437/**
438 * crush_do_rule - calculate a mapping with the given input and rule
439 * @map: the crush_map
440 * @ruleno: the rule id
441 * @x: hash input
442 * @result: pointer to result vector
443 * @result_max: maximum result size
444 * @force: force initial replica choice; -1 for none
445 */
446int crush_do_rule(struct crush_map *map,
447 int ruleno, int x, int *result, int result_max,
448 int force, __u32 *weight)
449{
450 int result_len;
451 int force_context[CRUSH_MAX_DEPTH];
452 int force_pos = -1;
453 int a[CRUSH_MAX_SET];
454 int b[CRUSH_MAX_SET];
455 int c[CRUSH_MAX_SET];
456 int recurse_to_leaf;
457 int *w;
458 int wsize = 0;
459 int *o;
460 int osize;
461 int *tmp;
462 struct crush_rule *rule;
463 int step;
464 int i, j;
465 int numrep;
466 int firstn;
467 int rc = -1;
468
469 BUG_ON(ruleno >= map->max_rules);
470
471 rule = map->rules[ruleno];
472 result_len = 0;
473 w = a;
474 o = b;
475
476 /*
477 * determine hierarchical context of force, if any. note
478 * that this may or may not correspond to the specific types
479 * referenced by the crush rule.
480 */
481 if (force >= 0) {
482 if (force >= map->max_devices ||
483 map->device_parents[force] == 0) {
484 /*dprintk("CRUSH: forcefed device dne\n");*/
485 rc = -1; /* force fed device dne */
486 goto out;
487 }
488 if (!is_out(map, weight, force, x)) {
489 while (1) {
490 force_context[++force_pos] = force;
491 if (force >= 0)
492 force = map->device_parents[force];
493 else
494 force = map->bucket_parents[-1-force];
495 if (force == 0)
496 break;
497 }
498 }
499 }
500
501 for (step = 0; step < rule->len; step++) {
502 firstn = 0;
503 switch (rule->steps[step].op) {
504 case CRUSH_RULE_TAKE:
505 w[0] = rule->steps[step].arg1;
506 if (force_pos >= 0) {
507 BUG_ON(force_context[force_pos] != w[0]);
508 force_pos--;
509 }
510 wsize = 1;
511 break;
512
513 case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
514 case CRUSH_RULE_CHOOSE_FIRSTN:
515 firstn = 1;
516 case CRUSH_RULE_CHOOSE_LEAF_INDEP:
517 case CRUSH_RULE_CHOOSE_INDEP:
518 BUG_ON(wsize == 0);
519
520 recurse_to_leaf =
521 rule->steps[step].op ==
522 CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
523 rule->steps[step].op ==
524 CRUSH_RULE_CHOOSE_LEAF_INDEP;
525
526 /* reset output */
527 osize = 0;
528
529 for (i = 0; i < wsize; i++) {
530 /*
531 * see CRUSH_N, CRUSH_N_MINUS macros.
532 * basically, numrep <= 0 means relative to
533 * the provided result_max
534 */
535 numrep = rule->steps[step].arg1;
536 if (numrep <= 0) {
537 numrep += result_max;
538 if (numrep <= 0)
539 continue;
540 }
541 j = 0;
542 if (osize == 0 && force_pos >= 0) {
543 /* skip any intermediate types */
544 while (force_pos &&
545 force_context[force_pos] < 0 &&
546 rule->steps[step].arg2 !=
547 map->buckets[-1 -
548 force_context[force_pos]]->type)
549 force_pos--;
550 o[osize] = force_context[force_pos];
551 if (recurse_to_leaf)
552 c[osize] = force_context[0];
553 j++;
554 force_pos--;
555 }
556 osize += crush_choose(map,
557 map->buckets[-1-w[i]],
558 weight,
559 x, numrep,
560 rule->steps[step].arg2,
561 o+osize, j,
562 firstn,
563 recurse_to_leaf, c+osize);
564 }
565
566 if (recurse_to_leaf)
567 /* copy final _leaf_ values to output set */
568 memcpy(o, c, osize*sizeof(*o));
569
570 /* swap o and w arrays */
571 tmp = o;
572 o = w;
573 w = tmp;
574 wsize = osize;
575 break;
576
577
578 case CRUSH_RULE_EMIT:
579 for (i = 0; i < wsize && result_len < result_max; i++) {
580 result[result_len] = w[i];
581 result_len++;
582 }
583 wsize = 0;
584 break;
585
586 default:
587 BUG_ON(1);
588 }
589 }
590 rc = result_len;
591
592out:
593 return rc;
594}
595
596
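[A structural note: crush_do_rule() drives each CHOOSE step through a pair of working/output vectors, reading from w, writing into o, then swapping the two so the next step consumes this step's output. A runnable toy of just that double-buffer pattern (the fake "choose two children" rule is an illustration, not CRUSH semantics):

#include <stdio.h>

int main(void)
{
	int a[4] = { -1, 0, 0, 0 };    /* TAKE a root bucket (id -1) */
	int b[4];
	int *w = a, *o = b, *tmp;
	int wsize = 1, osize, i, step;

	for (step = 0; step < 2; step++) {
		osize = 0;
		for (i = 0; i < wsize; i++) {          /* fake CHOOSE: two children */
			o[osize++] = w[i] * 2 - 1;
			o[osize++] = w[i] * 2 - 2;
		}
		tmp = o;                               /* swap o and w, as above */
		o = w;
		w = tmp;
		wsize = osize;
	}
	for (i = 0; i < wsize; i++)
		printf("result[%d] = %d\n", i, w[i]);
	return 0;
}]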
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 000000000000..98e90046fd9f
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
1#ifndef _CRUSH_MAPPER_H
2#define _CRUSH_MAPPER_H
3
4/*
5 * CRUSH functions for finding rules and then mapping an input to an
6 * output set.
7 *
8 * LGPL2
9 */
10
11#include "crush.h"
12
13extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
14extern int crush_do_rule(struct crush_map *map,
15 int ruleno,
16 int x, int *result, int result_max,
17 int forcefeed, /* -1 for none */
18 __u32 *weights);
19
20#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 000000000000..291ac288e791
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
1
2#include "ceph_debug.h"
3
4#include <linux/err.h>
5#include <linux/scatterlist.h>
6#include <crypto/hash.h>
7
8#include "crypto.h"
9#include "decode.h"
10
11int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
12{
13 if (*p + sizeof(u16) + sizeof(key->created) +
14 sizeof(u16) + key->len > end)
15 return -ERANGE;
16 ceph_encode_16(p, key->type);
17 ceph_encode_copy(p, &key->created, sizeof(key->created));
18 ceph_encode_16(p, key->len);
19 ceph_encode_copy(p, key->key, key->len);
20 return 0;
21}
22
23int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
24{
25 ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
26 key->type = ceph_decode_16(p);
27 ceph_decode_copy(p, &key->created, sizeof(key->created));
28 key->len = ceph_decode_16(p);
29 ceph_decode_need(p, end, key->len, bad);
30 key->key = kmalloc(key->len, GFP_NOFS);
31 if (!key->key)
32 return -ENOMEM;
33 ceph_decode_copy(p, key->key, key->len);
34 return 0;
35
36bad:
37 dout("failed to decode crypto key\n");
38 return -EINVAL;
39}
40
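[The wire format this encode/decode pair agrees on is: __le16 type, an 8-byte ceph_timespec (two __le32 fields, per decode.h later in this series), __le16 len, then len raw key bytes. A quick size check for a 16-byte AES key:

#include <stdio.h>

int main(void)
{
	unsigned type_sz = 2, created_sz = 8, len_sz = 2, key_len = 16;

	printf("encoded size = %u bytes\n",
	       type_sz + created_sz + len_sz + key_len);   /* 28 */
	return 0;
}]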
41int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
42{
43 int inlen = strlen(inkey);
44 int blen = inlen * 3 / 4;
45 void *buf, *p;
46 int ret;
47
48 dout("crypto_key_unarmor %s\n", inkey);
49 buf = kmalloc(blen, GFP_NOFS);
50 if (!buf)
51 return -ENOMEM;
52 blen = ceph_unarmor(buf, inkey, inkey+inlen);
53 if (blen < 0) {
54 kfree(buf);
55 return blen;
56 }
57
58 p = buf;
59 ret = ceph_crypto_key_decode(key, &p, p + blen);
60 kfree(buf);
61 if (ret)
62 return ret;
63 dout("crypto_key_unarmor key %p type %d len %d\n", key,
64 key->type, key->len);
65 return 0;
66}
67
68
69
70#define AES_KEY_SIZE 16
71
72static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
73{
74 return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
75}
76
77const u8 *aes_iv = "cephsageyudagreg";
78
79int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
80 const void *src, size_t src_len)
81{
82 struct scatterlist sg_in[2], sg_out[1];
83 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
84 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
85 int ret;
86 void *iv;
87 int ivsize;
88 size_t zero_padding = (0x10 - (src_len & 0x0f));
89 char pad[16];
90
91 if (IS_ERR(tfm))
92 return PTR_ERR(tfm);
93
94 memset(pad, zero_padding, zero_padding);
95
96 *dst_len = src_len + zero_padding;
97
98 crypto_blkcipher_setkey((void *)tfm, key, key_len);
99 sg_init_table(sg_in, 2);
100 sg_set_buf(&sg_in[0], src, src_len);
101 sg_set_buf(&sg_in[1], pad, zero_padding);
102 sg_init_table(sg_out, 1);
103 sg_set_buf(sg_out, dst, *dst_len);
104 iv = crypto_blkcipher_crt(tfm)->iv;
105 ivsize = crypto_blkcipher_ivsize(tfm);
106
107 memcpy(iv, aes_iv, ivsize);
108 /*
109 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
110 key, key_len, 1);
111 print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
112 src, src_len, 1);
113 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
114 pad, zero_padding, 1);
115 */
116 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
117 src_len + zero_padding);
118 crypto_free_blkcipher(tfm);
119 if (ret < 0)
120 pr_err("ceph_aes_encrypt failed %d\n", ret);
121 /*
122 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
123 dst, *dst_len, 1);
124 */
125 return ret;
126}
127
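[The padding scheme above is PKCS#7 style: append N bytes each of value N, where N = 16 - (len mod 16), so an already-aligned input gains a full extra block and the decryptor can always read the pad length from the last byte. A stand-alone check of the arithmetic:

#include <stdio.h>

int main(void)
{
	size_t lens[] = { 1, 15, 16, 17 };
	int i;

	for (i = 0; i < 4; i++) {
		size_t pad = 0x10 - (lens[i] & 0x0f);   /* same formula as above */

		printf("src_len=%zu pad=%zu total=%zu\n",
		       lens[i], pad, lens[i] + pad);
	}
	return 0;
}

(The local name zero_padding is historical; the pad bytes carry the pad length, not zeros.)]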
128int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
129 const void *src1, size_t src1_len,
130 const void *src2, size_t src2_len)
131{
132 struct scatterlist sg_in[3], sg_out[1];
133 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
134 struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
135 int ret;
136 void *iv;
137 int ivsize;
138 size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
139 char pad[16];
140
141 if (IS_ERR(tfm))
142 return PTR_ERR(tfm);
143
144 memset(pad, zero_padding, zero_padding);
145
146 *dst_len = src1_len + src2_len + zero_padding;
147
148 crypto_blkcipher_setkey((void *)tfm, key, key_len);
149 sg_init_table(sg_in, 3);
150 sg_set_buf(&sg_in[0], src1, src1_len);
151 sg_set_buf(&sg_in[1], src2, src2_len);
152 sg_set_buf(&sg_in[2], pad, zero_padding);
153 sg_init_table(sg_out, 1);
154 sg_set_buf(sg_out, dst, *dst_len);
155 iv = crypto_blkcipher_crt(tfm)->iv;
156 ivsize = crypto_blkcipher_ivsize(tfm);
157
158 memcpy(iv, aes_iv, ivsize);
159 /*
160 print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
161 key, key_len, 1);
162 print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
163 src1, src1_len, 1);
164 print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
165 src2, src2_len, 1);
166 print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
167 pad, zero_padding, 1);
168 */
169 ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
170 src1_len + src2_len + zero_padding);
171 crypto_free_blkcipher(tfm);
172 if (ret < 0)
173 pr_err("ceph_aes_encrypt2 failed %d\n", ret);
174 /*
175 print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
176 dst, *dst_len, 1);
177 */
178 return ret;
179}
180
181int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
182 const void *src, size_t src_len)
183{
184 struct scatterlist sg_in[1], sg_out[2];
185 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
186 struct blkcipher_desc desc = { .tfm = tfm };
187 char pad[16];
188 void *iv;
189 int ivsize;
190 int ret;
191 int last_byte;
192
193 if (IS_ERR(tfm))
194 return PTR_ERR(tfm);
195
196 crypto_blkcipher_setkey((void *)tfm, key, key_len);
197 sg_init_table(sg_in, 1);
198 sg_init_table(sg_out, 2);
199 sg_set_buf(sg_in, src, src_len);
200 sg_set_buf(&sg_out[0], dst, *dst_len);
201 sg_set_buf(&sg_out[1], pad, sizeof(pad));
202
203 iv = crypto_blkcipher_crt(tfm)->iv;
204 ivsize = crypto_blkcipher_ivsize(tfm);
205
206 memcpy(iv, aes_iv, ivsize);
207
208 /*
209 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
210 key, key_len, 1);
211 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
212 src, src_len, 1);
213 */
214
215 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
216 crypto_free_blkcipher(tfm);
217 if (ret < 0) {
218 pr_err("ceph_aes_decrypt failed %d\n", ret);
219 return ret;
220 }
221
222 if (src_len <= *dst_len)
223 last_byte = ((char *)dst)[src_len - 1];
224 else
225 last_byte = pad[src_len - *dst_len - 1];
226 if (last_byte <= 16 && src_len >= last_byte) {
227 *dst_len = src_len - last_byte;
228 } else {
229 pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
230 last_byte, (int)src_len);
231 return -EPERM; /* bad padding */
232 }
233 /*
234 print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
235 dst, *dst_len, 1);
236 */
237 return 0;
238}
239
240int ceph_aes_decrypt2(const void *key, int key_len,
241 void *dst1, size_t *dst1_len,
242 void *dst2, size_t *dst2_len,
243 const void *src, size_t src_len)
244{
245 struct scatterlist sg_in[1], sg_out[3];
246 struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
247 struct blkcipher_desc desc = { .tfm = tfm };
248 char pad[16];
249 void *iv;
250 int ivsize;
251 int ret;
252 int last_byte;
253
254 if (IS_ERR(tfm))
255 return PTR_ERR(tfm);
256
257 sg_init_table(sg_in, 1);
258 sg_set_buf(sg_in, src, src_len);
259 sg_init_table(sg_out, 3);
260 sg_set_buf(&sg_out[0], dst1, *dst1_len);
261 sg_set_buf(&sg_out[1], dst2, *dst2_len);
262 sg_set_buf(&sg_out[2], pad, sizeof(pad));
263
264 crypto_blkcipher_setkey((void *)tfm, key, key_len);
265 iv = crypto_blkcipher_crt(tfm)->iv;
266 ivsize = crypto_blkcipher_ivsize(tfm);
267
268 memcpy(iv, aes_iv, ivsize);
269
270 /*
271 print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
272 key, key_len, 1);
273 print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
274 src, src_len, 1);
275 */
276
277 ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
278 crypto_free_blkcipher(tfm);
279 if (ret < 0) {
280 pr_err("ceph_aes_decrypt failed %d\n", ret);
281 return ret;
282 }
283
284 if (src_len <= *dst1_len)
285 last_byte = ((char *)dst1)[src_len - 1];
286 else if (src_len <= *dst1_len + *dst2_len)
287 last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
288 else
289 last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
290 if (last_byte <= 16 && src_len >= last_byte) {
291 src_len -= last_byte;
292 } else {
293 pr_err("ceph_aes_decrypt2 got bad padding %d on src len %d\n",
294 last_byte, (int)src_len);
295 return -EPERM; /* bad padding */
296 }
297
298 if (src_len < *dst1_len) {
299 *dst1_len = src_len;
300 *dst2_len = 0;
301 } else {
302 *dst2_len = src_len - *dst1_len;
303 }
304 /*
305 print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
306 dst1, *dst1_len, 1);
307 print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
308 dst2, *dst2_len, 1);
309 */
310
311 return 0;
312}
313
314
315int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
316 const void *src, size_t src_len)
317{
318 switch (secret->type) {
319 case CEPH_CRYPTO_NONE:
320 if (*dst_len < src_len)
321 return -ERANGE;
322 memcpy(dst, src, src_len);
323 *dst_len = src_len;
324 return 0;
325
326 case CEPH_CRYPTO_AES:
327 return ceph_aes_decrypt(secret->key, secret->len, dst,
328 dst_len, src, src_len);
329
330 default:
331 return -EINVAL;
332 }
333}
334
335int ceph_decrypt2(struct ceph_crypto_key *secret,
336 void *dst1, size_t *dst1_len,
337 void *dst2, size_t *dst2_len,
338 const void *src, size_t src_len)
339{
340 size_t t;
341
342 switch (secret->type) {
343 case CEPH_CRYPTO_NONE:
344 if (*dst1_len + *dst2_len < src_len)
345 return -ERANGE;
346 t = min(*dst1_len, src_len);
347 memcpy(dst1, src, t);
348 *dst1_len = t;
349 src += t;
350 src_len -= t;
351 if (src_len) {
352 t = min(*dst2_len, src_len);
353 memcpy(dst2, src, t);
354 *dst2_len = t;
355 }
356 return 0;
357
358 case CEPH_CRYPTO_AES:
359 return ceph_aes_decrypt2(secret->key, secret->len,
360 dst1, dst1_len, dst2, dst2_len,
361 src, src_len);
362
363 default:
364 return -EINVAL;
365 }
366}
367
368int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
369 const void *src, size_t src_len)
370{
371 switch (secret->type) {
372 case CEPH_CRYPTO_NONE:
373 if (*dst_len < src_len)
374 return -ERANGE;
375 memcpy(dst, src, src_len);
376 *dst_len = src_len;
377 return 0;
378
379 case CEPH_CRYPTO_AES:
380 return ceph_aes_encrypt(secret->key, secret->len, dst,
381 dst_len, src, src_len);
382
383 default:
384 return -EINVAL;
385 }
386}
387
388int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
389 const void *src1, size_t src1_len,
390 const void *src2, size_t src2_len)
391{
392 switch (secret->type) {
393 case CEPH_CRYPTO_NONE:
394 if (*dst_len < src1_len + src2_len)
395 return -ERANGE;
396 memcpy(dst, src1, src1_len);
397 memcpy(dst + src1_len, src2, src2_len);
398 *dst_len = src1_len + src2_len;
399 return 0;
400
401 case CEPH_CRYPTO_AES:
402 return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
403 src1, src1_len, src2, src2_len);
404
405 default:
406 return -EINVAL;
407 }
408}
diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 000000000000..40b502e6bd89
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
1#ifndef _FS_CEPH_CRYPTO_H
2#define _FS_CEPH_CRYPTO_H
3
4#include "types.h"
5#include "buffer.h"
6
7/*
8 * cryptographic secret
9 */
10struct ceph_crypto_key {
11 int type;
12 struct ceph_timespec created;
13 int len;
14 void *key;
15};
16
17static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
18{
19 kfree(key->key);
20}
21
22extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
23 void **p, void *end);
24extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
25 void **p, void *end);
26extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
27
28/* crypto.c */
29extern int ceph_decrypt(struct ceph_crypto_key *secret,
30 void *dst, size_t *dst_len,
31 const void *src, size_t src_len);
32extern int ceph_encrypt(struct ceph_crypto_key *secret,
33 void *dst, size_t *dst_len,
34 const void *src, size_t src_len);
35extern int ceph_decrypt2(struct ceph_crypto_key *secret,
36 void *dst1, size_t *dst1_len,
37 void *dst2, size_t *dst2_len,
38 const void *src, size_t src_len);
39extern int ceph_encrypt2(struct ceph_crypto_key *secret,
40 void *dst, size_t *dst_len,
41 const void *src1, size_t src1_len,
42 const void *src2, size_t src2_len);
43
44/* armor.c */
45extern int ceph_armor(char *dst, const void *src, const void *end);
46extern int ceph_unarmor(void *dst, const char *src, const char *end);
47
48#endif
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 000000000000..e159f1415110
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
1#include "ceph_debug.h"
2
3#include <linux/device.h>
4#include <linux/module.h>
5#include <linux/ctype.h>
6#include <linux/debugfs.h>
7#include <linux/seq_file.h>
8
9#include "super.h"
10#include "mds_client.h"
11#include "mon_client.h"
12#include "auth.h"
13
14#ifdef CONFIG_DEBUG_FS
15
16/*
17 * Implement /sys/kernel/debug/ceph fun
18 *
19 * /sys/kernel/debug/ceph/client* - an instance of the ceph client
20 * .../osdmap - current osdmap
21 * .../mdsmap - current mdsmap
22 * .../monmap - current monmap
23 * .../osdc - active osd requests
24 * .../mdsc - active mds requests
25 * .../monc - mon client state
26 * .../dentry_lru - dump contents of dentry lru
27 * .../caps - expose cap (reservation) stats
28 * .../bdi - symlink to ../../bdi/something
29 */
30
31static struct dentry *ceph_debugfs_dir;
32
33static int monmap_show(struct seq_file *s, void *p)
34{
35 int i;
36 struct ceph_client *client = s->private;
37
38 if (client->monc.monmap == NULL)
39 return 0;
40
41 seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
42 for (i = 0; i < client->monc.monmap->num_mon; i++) {
43 struct ceph_entity_inst *inst =
44 &client->monc.monmap->mon_inst[i];
45
46 seq_printf(s, "\t%s%lld\t%s\n",
47 ENTITY_NAME(inst->name),
48 pr_addr(&inst->addr.in_addr));
49 }
50 return 0;
51}
52
53static int mdsmap_show(struct seq_file *s, void *p)
54{
55 int i;
56 struct ceph_client *client = s->private;
57
58 if (client->mdsc.mdsmap == NULL)
59 return 0;
60 seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
61 seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
62 seq_printf(s, "session_timeout %d\n",
63 client->mdsc.mdsmap->m_session_timeout);
64 seq_printf(s, "session_autoclose %d\n",
65 client->mdsc.mdsmap->m_session_autoclose);
66 for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
67 struct ceph_entity_addr *addr =
68 &client->mdsc.mdsmap->m_info[i].addr;
69 int state = client->mdsc.mdsmap->m_info[i].state;
70
71 seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
72 ceph_mds_state_name(state));
73 }
74 return 0;
75}
76
77static int osdmap_show(struct seq_file *s, void *p)
78{
79 int i;
80 struct ceph_client *client = s->private;
81 struct rb_node *n;
82
83 if (client->osdc.osdmap == NULL)
84 return 0;
85 seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
86 seq_printf(s, "flags%s%s\n",
87 (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
88 " NEARFULL" : "",
89 (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
90 " FULL" : "");
91 for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
92 struct ceph_pg_pool_info *pool =
93 rb_entry(n, struct ceph_pg_pool_info, node);
94 seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
95 pool->id, pool->v.pg_num, pool->pg_num_mask,
96 pool->v.lpg_num, pool->lpg_num_mask);
97 }
98 for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
99 struct ceph_entity_addr *addr =
100 &client->osdc.osdmap->osd_addr[i];
101 int state = client->osdc.osdmap->osd_state[i];
102 char sb[64];
103
104 seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
105 i, pr_addr(&addr->in_addr),
106 ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
107 ceph_osdmap_state_str(sb, sizeof(sb), state));
108 }
109 return 0;
110}
111
112static int monc_show(struct seq_file *s, void *p)
113{
114 struct ceph_client *client = s->private;
115 struct ceph_mon_statfs_request *req;
116 struct ceph_mon_client *monc = &client->monc;
117 struct rb_node *rp;
118
119 mutex_lock(&monc->mutex);
120
121 if (monc->have_mdsmap)
122 seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
123 if (monc->have_osdmap)
124 seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
125 if (monc->want_next_osdmap)
126 seq_printf(s, "want next osdmap\n");
127
128 for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
129 req = rb_entry(rp, struct ceph_mon_statfs_request, node);
130 seq_printf(s, "%lld statfs\n", req->tid);
131 }
132
133 mutex_unlock(&monc->mutex);
134 return 0;
135}
136
137static int mdsc_show(struct seq_file *s, void *p)
138{
139 struct ceph_client *client = s->private;
140 struct ceph_mds_client *mdsc = &client->mdsc;
141 struct ceph_mds_request *req;
142 struct rb_node *rp;
143 int pathlen;
144 u64 pathbase;
145 char *path;
146
147 mutex_lock(&mdsc->mutex);
148 for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
149 req = rb_entry(rp, struct ceph_mds_request, r_node);
150
151 if (req->r_request)
152 seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
153 else
154 seq_printf(s, "%lld\t(no request)\t", req->r_tid);
155
156 seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
157
158 if (req->r_got_unsafe)
159 seq_printf(s, "\t(unsafe)");
160 else
161 seq_printf(s, "\t");
162
163 if (req->r_inode) {
164 seq_printf(s, " #%llx", ceph_ino(req->r_inode));
165 } else if (req->r_dentry) {
166 path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
167 &pathbase, 0);
168 spin_lock(&req->r_dentry->d_lock);
169 seq_printf(s, " #%llx/%.*s (%s)",
170 ceph_ino(req->r_dentry->d_parent->d_inode),
171 req->r_dentry->d_name.len,
172 req->r_dentry->d_name.name,
173 path ? path : "");
174 spin_unlock(&req->r_dentry->d_lock);
175 kfree(path);
176 } else if (req->r_path1) {
177 seq_printf(s, " #%llx/%s", req->r_ino1.ino,
178 req->r_path1);
179 }
180
181 if (req->r_old_dentry) {
182 path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
183 &pathbase, 0);
184 spin_lock(&req->r_old_dentry->d_lock);
185 seq_printf(s, " #%llx/%.*s (%s)",
186 ceph_ino(req->r_old_dentry->d_parent->d_inode),
187 req->r_old_dentry->d_name.len,
188 req->r_old_dentry->d_name.name,
189 path ? path : "");
190 spin_unlock(&req->r_old_dentry->d_lock);
191 kfree(path);
192 } else if (req->r_path2) {
193 if (req->r_ino2.ino)
194 seq_printf(s, " #%llx/%s", req->r_ino2.ino,
195 req->r_path2);
196 else
197 seq_printf(s, " %s", req->r_path2);
198 }
199
200 seq_printf(s, "\n");
201 }
202 mutex_unlock(&mdsc->mutex);
203
204 return 0;
205}
206
207static int osdc_show(struct seq_file *s, void *pp)
208{
209 struct ceph_client *client = s->private;
210 struct ceph_osd_client *osdc = &client->osdc;
211 struct rb_node *p;
212
213 mutex_lock(&osdc->request_mutex);
214 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
215 struct ceph_osd_request *req;
216 struct ceph_osd_request_head *head;
217 struct ceph_osd_op *op;
218 int num_ops;
219 int opcode, olen;
220 int i;
221
222 req = rb_entry(p, struct ceph_osd_request, r_node);
223
224 seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
225 req->r_osd ? req->r_osd->o_osd : -1,
226 le32_to_cpu(req->r_pgid.pool),
227 le16_to_cpu(req->r_pgid.ps));
228
229 head = req->r_request->front.iov_base;
230 op = (void *)(head + 1);
231
232 num_ops = le16_to_cpu(head->num_ops);
233 olen = le32_to_cpu(head->object_len);
234 seq_printf(s, "%.*s", olen,
235 (const char *)(head->ops + num_ops));
236
237 if (req->r_reassert_version.epoch)
238 seq_printf(s, "\t%u'%llu",
239 (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
240 le64_to_cpu(req->r_reassert_version.version));
241 else
242 seq_printf(s, "\t");
243
244 for (i = 0; i < num_ops; i++) {
245 opcode = le16_to_cpu(op->op);
246 seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
247 op++;
248 }
249
250 seq_printf(s, "\n");
251 }
252 mutex_unlock(&osdc->request_mutex);
253 return 0;
254}
255
256static int caps_show(struct seq_file *s, void *p)
257{
258 struct ceph_client *client = s->private;
259 int total, avail, used, reserved, min;
260
261 ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
262 seq_printf(s, "total\t\t%d\n"
263 "avail\t\t%d\n"
264 "used\t\t%d\n"
265 "reserved\t%d\n"
266 "min\t%d\n",
267 total, avail, used, reserved, min);
268 return 0;
269}
270
271static int dentry_lru_show(struct seq_file *s, void *ptr)
272{
273 struct ceph_client *client = s->private;
274 struct ceph_mds_client *mdsc = &client->mdsc;
275 struct ceph_dentry_info *di;
276
277 spin_lock(&mdsc->dentry_lru_lock);
278 list_for_each_entry(di, &mdsc->dentry_lru, lru) {
279 struct dentry *dentry = di->dentry;
280 seq_printf(s, "%p %p\t%.*s\n",
281 di, dentry, dentry->d_name.len, dentry->d_name.name);
282 }
283 spin_unlock(&mdsc->dentry_lru_lock);
284
285 return 0;
286}
287
288#define DEFINE_SHOW_FUNC(name) \
289static int name##_open(struct inode *inode, struct file *file) \
290{ \
291 struct seq_file *sf; \
292 int ret; \
293 \
294 ret = single_open(file, name, NULL); \
295 sf = file->private_data; \
296 sf->private = inode->i_private; \
297 return ret; \
298} \
299 \
300static const struct file_operations name##_fops = { \
301 .open = name##_open, \
302 .read = seq_read, \
303 .llseek = seq_lseek, \
304 .release = single_release, \
305};
306
307DEFINE_SHOW_FUNC(monmap_show)
308DEFINE_SHOW_FUNC(mdsmap_show)
309DEFINE_SHOW_FUNC(osdmap_show)
310DEFINE_SHOW_FUNC(monc_show)
311DEFINE_SHOW_FUNC(mdsc_show)
312DEFINE_SHOW_FUNC(osdc_show)
313DEFINE_SHOW_FUNC(dentry_lru_show)
314DEFINE_SHOW_FUNC(caps_show)
315
316static int congestion_kb_set(void *data, u64 val)
317{
318 struct ceph_client *client = (struct ceph_client *)data;
319
320 if (client)
321 client->mount_args->congestion_kb = (int)val;
322
323 return 0;
324}
325
326static int congestion_kb_get(void *data, u64 *val)
327{
328 struct ceph_client *client = (struct ceph_client *)data;
329
330 if (client)
331 *val = (u64)client->mount_args->congestion_kb;
332
333 return 0;
334}
335
336
337DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
338 congestion_kb_set, "%llu\n");
339
340int __init ceph_debugfs_init(void)
341{
342 ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
343 if (!ceph_debugfs_dir)
344 return -ENOMEM;
345 return 0;
346}
347
348void ceph_debugfs_cleanup(void)
349{
350 debugfs_remove(ceph_debugfs_dir);
351}
352
353int ceph_debugfs_client_init(struct ceph_client *client)
354{
355 int ret = 0;
356 char name[80];
357
358 snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
359 PR_FSID(&client->fsid), client->monc.auth->global_id);
360
361 client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
362 if (!client->debugfs_dir)
363 goto out;
364
365 client->monc.debugfs_file = debugfs_create_file("monc",
366 0600,
367 client->debugfs_dir,
368 client,
369 &monc_show_fops);
370 if (!client->monc.debugfs_file)
371 goto out;
372
373 client->mdsc.debugfs_file = debugfs_create_file("mdsc",
374 0600,
375 client->debugfs_dir,
376 client,
377 &mdsc_show_fops);
378 if (!client->mdsc.debugfs_file)
379 goto out;
380
381 client->osdc.debugfs_file = debugfs_create_file("osdc",
382 0600,
383 client->debugfs_dir,
384 client,
385 &osdc_show_fops);
386 if (!client->osdc.debugfs_file)
387 goto out;
388
389 client->debugfs_monmap = debugfs_create_file("monmap",
390 0600,
391 client->debugfs_dir,
392 client,
393 &monmap_show_fops);
394 if (!client->debugfs_monmap)
395 goto out;
396
397 client->debugfs_mdsmap = debugfs_create_file("mdsmap",
398 0600,
399 client->debugfs_dir,
400 client,
401 &mdsmap_show_fops);
402 if (!client->debugfs_mdsmap)
403 goto out;
404
405 client->debugfs_osdmap = debugfs_create_file("osdmap",
406 0600,
407 client->debugfs_dir,
408 client,
409 &osdmap_show_fops);
410 if (!client->debugfs_osdmap)
411 goto out;
412
413 client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
414 0600,
415 client->debugfs_dir,
416 client,
417 &dentry_lru_show_fops);
418 if (!client->debugfs_dentry_lru)
419 goto out;
420
421 client->debugfs_caps = debugfs_create_file("caps",
422 0400,
423 client->debugfs_dir,
424 client,
425 &caps_show_fops);
426 if (!client->debugfs_caps)
427 goto out;
428
429 client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
430 0600,
431 client->debugfs_dir,
432 client,
433 &congestion_kb_fops);
434 if (!client->debugfs_congestion_kb)
435 goto out;
436
437 sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
438 client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
439 name);
440
441 return 0;
442
443out:
444 ceph_debugfs_client_cleanup(client);
445 return ret;
446}
447
448void ceph_debugfs_client_cleanup(struct ceph_client *client)
449{
450 debugfs_remove(client->debugfs_bdi);
451 debugfs_remove(client->debugfs_caps);
452 debugfs_remove(client->debugfs_dentry_lru);
453 debugfs_remove(client->debugfs_osdmap);
454 debugfs_remove(client->debugfs_mdsmap);
455 debugfs_remove(client->debugfs_monmap);
456 debugfs_remove(client->osdc.debugfs_file);
457 debugfs_remove(client->mdsc.debugfs_file);
458 debugfs_remove(client->monc.debugfs_file);
459 debugfs_remove(client->debugfs_congestion_kb);
460 debugfs_remove(client->debugfs_dir);
461}
462
463#else /* CONFIG_DEBUG_FS */
464
465int __init ceph_debugfs_init(void)
466{
467 return 0;
468}
469
470void ceph_debugfs_cleanup(void)
471{
472}
473
474int ceph_debugfs_client_init(struct ceph_client *client)
475{
476 return 0;
477}
478
479void ceph_debugfs_client_cleanup(struct ceph_client *client)
480{
481}
482
483#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 000000000000..65b3e022eaf5
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
1#ifndef __CEPH_DECODE_H
2#define __CEPH_DECODE_H
3
4#include <asm/unaligned.h>
5#include <linux/time.h>
6
7#include "types.h"
8
9/*
10 * in all cases,
11 * void **p pointer to position pointer
12 * void *end pointer to end of buffer (last byte + 1)
13 */
14
15static inline u64 ceph_decode_64(void **p)
16{
17 u64 v = get_unaligned_le64(*p);
18 *p += sizeof(u64);
19 return v;
20}
21static inline u32 ceph_decode_32(void **p)
22{
23 u32 v = get_unaligned_le32(*p);
24 *p += sizeof(u32);
25 return v;
26}
27static inline u16 ceph_decode_16(void **p)
28{
29 u16 v = get_unaligned_le16(*p);
30 *p += sizeof(u16);
31 return v;
32}
33static inline u8 ceph_decode_8(void **p)
34{
35 u8 v = *(u8 *)*p;
36 (*p)++;
37 return v;
38}
39static inline void ceph_decode_copy(void **p, void *pv, size_t n)
40{
41 memcpy(pv, *p, n);
42 *p += n;
43}
44
45/*
46 * bounds check input.
47 */
48#define ceph_decode_need(p, end, n, bad) \
49 do { \
50 if (unlikely(*(p) + (n) > (end))) \
51 goto bad; \
52 } while (0)
53
54#define ceph_decode_64_safe(p, end, v, bad) \
55 do { \
56 ceph_decode_need(p, end, sizeof(u64), bad); \
57 v = ceph_decode_64(p); \
58 } while (0)
59#define ceph_decode_32_safe(p, end, v, bad) \
60 do { \
61 ceph_decode_need(p, end, sizeof(u32), bad); \
62 v = ceph_decode_32(p); \
63 } while (0)
64#define ceph_decode_16_safe(p, end, v, bad) \
65 do { \
66 ceph_decode_need(p, end, sizeof(u16), bad); \
67 v = ceph_decode_16(p); \
68 } while (0)
69#define ceph_decode_8_safe(p, end, v, bad) \
70 do { \
71 ceph_decode_need(p, end, sizeof(u8), bad); \
72 v = ceph_decode_8(p); \
73 } while (0)
74
75#define ceph_decode_copy_safe(p, end, pv, n, bad) \
76 do { \
77 ceph_decode_need(p, end, n, bad); \
78 ceph_decode_copy(p, pv, n); \
79 } while (0)
80
81/*
82 * struct ceph_timespec <-> struct timespec
83 */
84static inline void ceph_decode_timespec(struct timespec *ts,
85 const struct ceph_timespec *tv)
86{
87 ts->tv_sec = le32_to_cpu(tv->tv_sec);
88 ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
89}
90static inline void ceph_encode_timespec(struct ceph_timespec *tv,
91 const struct timespec *ts)
92{
93 tv->tv_sec = cpu_to_le32(ts->tv_sec);
94 tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
95}
96
97/*
98 * sockaddr_storage <-> ceph_sockaddr
99 */
100static inline void ceph_encode_addr(struct ceph_entity_addr *a)
101{
102 a->in_addr.ss_family = htons(a->in_addr.ss_family);
103}
104static inline void ceph_decode_addr(struct ceph_entity_addr *a)
105{
106 a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
107 WARN_ON(a->in_addr.ss_family == 512); /* 512 == ntohs(AF_INET): swapped twice? */
108}
109
110/*
111 * encoders
112 */
113static inline void ceph_encode_64(void **p, u64 v)
114{
115 put_unaligned_le64(v, (__le64 *)*p);
116 *p += sizeof(u64);
117}
118static inline void ceph_encode_32(void **p, u32 v)
119{
120 put_unaligned_le32(v, (__le32 *)*p);
121 *p += sizeof(u32);
122}
123static inline void ceph_encode_16(void **p, u16 v)
124{
125 put_unaligned_le16(v, (__le16 *)*p);
126 *p += sizeof(u16);
127}
128static inline void ceph_encode_8(void **p, u8 v)
129{
130 *(u8 *)*p = v;
131 (*p)++;
132}
133static inline void ceph_encode_copy(void **p, const void *s, int len)
134{
135 memcpy(*p, s, len);
136 *p += len;
137}
138
139/*
140 * filepath, string encoders
141 */
142static inline void ceph_encode_filepath(void **p, void *end,
143 u64 ino, const char *path)
144{
145 u32 len = path ? strlen(path) : 0;
146 BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
147 ceph_encode_8(p, 1);
148 ceph_encode_64(p, ino);
149 ceph_encode_32(p, len);
150 if (len)
151 memcpy(*p, path, len);
152 *p += len;
153}
154
155static inline void ceph_encode_string(void **p, void *end,
156 const char *s, u32 len)
157{
158 BUG_ON(*p + sizeof(len) + len > end);
159 ceph_encode_32(p, len);
160 if (len)
161 memcpy(*p, s, len);
162 *p += len;
163}
164
165#define ceph_encode_need(p, end, n, bad) \
166 do { \
167 if (unlikely(*(p) + (n) > (end))) \
168 goto bad; \
169 } while (0)
170
171#define ceph_encode_64_safe(p, end, v, bad) \
172 do { \
173 ceph_encode_need(p, end, sizeof(u64), bad); \
174 ceph_encode_64(p, v); \
175 } while (0)
176#define ceph_encode_32_safe(p, end, v, bad) \
177 do { \
178 ceph_encode_need(p, end, sizeof(u32), bad); \
179 ceph_encode_32(p, v); \
180 } while (0)
181#define ceph_encode_16_safe(p, end, v, bad) \
182 do { \
183 ceph_encode_need(p, end, sizeof(u16), bad); \
184 ceph_encode_16(p, v); \
185 } while (0)
186
187#define ceph_encode_copy_safe(p, end, pv, n, bad) \
188 do { \
189 ceph_encode_need(p, end, n, bad); \
190 ceph_encode_copy(p, pv, n); \
191 } while (0)
192
193
194#endif
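[Taken together, the helpers above encourage one pattern: bounds-check against end with a 'bad' label, then advance *p past each little-endian field. A user-space sketch of the same pattern (host byte order assumed for brevity, where the kernel helpers do the le-to-cpu conversion):

#include <stdio.h>
#include <string.h>

static unsigned decode_32(void **p)
{
	unsigned v;

	memcpy(&v, *p, sizeof(v));     /* the real helper uses get_unaligned_le32 */
	*p = (char *)*p + sizeof(v);
	return v;
}

int main(void)
{
	unsigned char buf[8] = { 1, 0, 0, 0, 2, 0, 0, 0 };
	void *p = buf, *end = buf + sizeof(buf);
	unsigned a, b;

	if ((char *)p + 2 * sizeof(unsigned) > (char *)end)
		goto bad;                              /* ceph_decode_need() */
	a = decode_32(&p);
	b = decode_32(&p);
	printf("a=%u b=%u\n", a, b);
	return 0;
bad:
	fprintf(stderr, "buffer too short\n");
	return 1;
}]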
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 000000000000..5107384ee029
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1220 @@
1#include "ceph_debug.h"
2
3#include <linux/spinlock.h>
4#include <linux/fs_struct.h>
5#include <linux/namei.h>
6#include <linux/sched.h>
7
8#include "super.h"
9
10/*
11 * Directory operations: readdir, lookup, create, link, unlink,
12 * rename, etc.
13 */
14
15/*
16 * Ceph MDS operations are specified in terms of a base ino and
17 * relative path. Thus, the client can specify an operation on a
18 * specific inode (e.g., a getattr due to fstat(2)), or as a path
19 * relative to, say, the root directory.
20 *
21 * Normally, we limit ourselves to strict inode ops (no path component)
22 * or dentry operations (a single path component relative to an ino). The
23 * exception to this is open_root_dentry(), which will open the mount
24 * point by name.
25 */
26
27const struct inode_operations ceph_dir_iops;
28const struct file_operations ceph_dir_fops;
29struct dentry_operations ceph_dentry_ops;
30
31/*
32 * Initialize ceph dentry state.
33 */
34int ceph_init_dentry(struct dentry *dentry)
35{
36 struct ceph_dentry_info *di;
37
38 if (dentry->d_fsdata)
39 return 0;
40
41 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
42 dentry->d_op = &ceph_dentry_ops;
43 else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
44 dentry->d_op = &ceph_snapdir_dentry_ops;
45 else
46 dentry->d_op = &ceph_snap_dentry_ops;
47
48 di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
49 if (!di)
50 return -ENOMEM; /* oh well */
51
52 spin_lock(&dentry->d_lock);
53 if (dentry->d_fsdata) /* lost a race */
54 goto out_unlock;
55 di->dentry = dentry;
56 di->lease_session = NULL;
57 dentry->d_fsdata = di;
58 dentry->d_time = jiffies;
59 ceph_dentry_lru_add(dentry);
60out_unlock:
61 spin_unlock(&dentry->d_lock);
62 return 0;
63}
64
65
66
67/*
68 * for readdir, we encode the directory frag and offset within that
69 * frag into f_pos.
70 */
71static unsigned fpos_frag(loff_t p)
72{
73 return p >> 32;
74}
75static unsigned fpos_off(loff_t p)
76{
77 return p & 0xffffffff;
78}
79
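[The inverse packing helper, ceph_make_fpos(), used by ceph_readdir() below is not defined in this file (it lives elsewhere in this series, super.h in the final tree); from fpos_frag()/fpos_off() it must be equivalent to something like:

/* presumed inverse of fpos_frag()/fpos_off(); actual definition elsewhere */
static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
{
	return ((loff_t)frag << 32) | (loff_t)off;
}]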
80/*
81 * When possible, we try to satisfy a readdir by peeking at the
82 * dcache. We make this work by carefully ordering dentries on
83 * d_u.d_child when we initially get results back from the MDS, and
84 * falling back to a "normal" sync readdir if any dentries in the dir
85 * are dropped.
86 *
87 * I_COMPLETE indicates we have all dentries in the dir. It is
88 * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
89 * the MDS if/when the directory is modified).
90 */
91static int __dcache_readdir(struct file *filp,
92 void *dirent, filldir_t filldir)
93{
94 struct inode *inode = filp->f_dentry->d_inode;
95 struct ceph_file_info *fi = filp->private_data;
96 struct dentry *parent = filp->f_dentry;
97 struct inode *dir = parent->d_inode;
98 struct list_head *p;
99 struct dentry *dentry, *last;
100 struct ceph_dentry_info *di;
101 int err = 0;
102
103 /* claim ref on last dentry we returned */
104 last = fi->dentry;
105 fi->dentry = NULL;
106
107 dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
108 last);
109
110 spin_lock(&dcache_lock);
111
112 /* start at beginning? */
113 if (filp->f_pos == 2 || (last &&
114 filp->f_pos < ceph_dentry(last)->offset)) {
115 if (list_empty(&parent->d_subdirs))
116 goto out_unlock;
117 p = parent->d_subdirs.prev;
118 dout(" initial p %p/%p\n", p->prev, p->next);
119 } else {
120 p = last->d_u.d_child.prev;
121 }
122
123more:
124 dentry = list_entry(p, struct dentry, d_u.d_child);
125 di = ceph_dentry(dentry);
126 while (1) {
127 dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
128 parent->d_subdirs.prev, parent->d_subdirs.next);
129 if (p == &parent->d_subdirs) {
130 fi->at_end = 1;
131 goto out_unlock;
132 }
133 if (!d_unhashed(dentry) && dentry->d_inode &&
134 ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
135 ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
136 filp->f_pos <= di->offset)
137 break;
138 dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
139 dentry->d_name.len, dentry->d_name.name, di->offset,
140 filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
141 !dentry->d_inode ? " null" : "");
142 p = p->prev;
143 dentry = list_entry(p, struct dentry, d_u.d_child);
144 di = ceph_dentry(dentry);
145 }
146
147 atomic_inc(&dentry->d_count);
148 spin_unlock(&dcache_lock);
149 spin_unlock(&inode->i_lock);
150
151 dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
152 dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
153 filp->f_pos = di->offset;
154 err = filldir(dirent, dentry->d_name.name,
155 dentry->d_name.len, di->offset,
156 dentry->d_inode->i_ino,
157 dentry->d_inode->i_mode >> 12);
158
159 if (last) {
160 if (err < 0) {
161 /* remember our position */
162 fi->dentry = last;
163 fi->next_offset = di->offset;
164 } else {
165 dput(last);
166 }
167 last = NULL;
168 }
169
170 spin_lock(&inode->i_lock);
171 spin_lock(&dcache_lock);
172
173 if (err < 0)
174 goto out_unlock;
175
176 last = dentry;
177
178 p = p->prev;
179 filp->f_pos++;
180
181 /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
182 if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
183 goto more;
184 dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
185 err = -EAGAIN;
186
187out_unlock:
188 spin_unlock(&dcache_lock);
189
190 if (last) {
191 spin_unlock(&inode->i_lock);
192 dput(last);
193 spin_lock(&inode->i_lock);
194 }
195
196 return err;
197}
198
199/*
200 * make note of the last dentry we read, so we can
201 * continue at the same lexicographical point,
202 * regardless of what dir changes take place on the
203 * server.
204 */
205static int note_last_dentry(struct ceph_file_info *fi, const char *name,
206 int len)
207{
208 kfree(fi->last_name);
209 fi->last_name = kmalloc(len+1, GFP_NOFS);
210 if (!fi->last_name)
211 return -ENOMEM;
212 memcpy(fi->last_name, name, len);
213 fi->last_name[len] = 0;
214 dout("note_last_dentry '%s'\n", fi->last_name);
215 return 0;
216}
217
218static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
219{
220 struct ceph_file_info *fi = filp->private_data;
221 struct inode *inode = filp->f_dentry->d_inode;
222 struct ceph_inode_info *ci = ceph_inode(inode);
223 struct ceph_client *client = ceph_inode_to_client(inode);
224 struct ceph_mds_client *mdsc = &client->mdsc;
225 unsigned frag = fpos_frag(filp->f_pos);
226 int off = fpos_off(filp->f_pos);
227 int err;
228 u32 ftype;
229 struct ceph_mds_reply_info_parsed *rinfo;
230 const int max_entries = client->mount_args->max_readdir;
231
232 dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
233 if (fi->at_end)
234 return 0;
235
236 /* always start with . and .. */
237 if (filp->f_pos == 0) {
238 /* note dir version at start of readdir so we can tell
239 * if any dentries get dropped */
240 fi->dir_release_count = ci->i_release_count;
241
242 dout("readdir off 0 -> '.'\n");
243 if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
244 inode->i_ino, inode->i_mode >> 12) < 0)
245 return 0;
246 filp->f_pos = 1;
247 off = 1;
248 }
249 if (filp->f_pos == 1) {
250 dout("readdir off 1 -> '..'\n");
251 if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
252 filp->f_dentry->d_parent->d_inode->i_ino,
253 inode->i_mode >> 12) < 0)
254 return 0;
255 filp->f_pos = 2;
256 off = 2;
257 }
258
259 /* can we use the dcache? */
260 spin_lock(&inode->i_lock);
261 if ((filp->f_pos == 2 || fi->dentry) &&
262 !ceph_test_opt(client, NOASYNCREADDIR) &&
263 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
264 __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
265 err = __dcache_readdir(filp, dirent, filldir);
266 if (err != -EAGAIN) {
267 spin_unlock(&inode->i_lock);
268 return err;
269 }
270 }
271 spin_unlock(&inode->i_lock);
272 if (fi->dentry) {
273 err = note_last_dentry(fi, fi->dentry->d_name.name,
274 fi->dentry->d_name.len);
275 if (err)
276 return err;
277 dput(fi->dentry);
278 fi->dentry = NULL;
279 }
280
281 /* proceed with a normal readdir */
282
283more:
284 /* do we have the correct frag content buffered? */
285 if (fi->frag != frag || fi->last_readdir == NULL) {
286 struct ceph_mds_request *req;
287 int op = ceph_snap(inode) == CEPH_SNAPDIR ?
288 CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
289
290 /* discard old result, if any */
291 if (fi->last_readdir)
292 ceph_mdsc_put_request(fi->last_readdir);
293
294 /* requery frag tree, as the frag topology may have changed */
295 frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
296
297 dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
298 ceph_vinop(inode), frag, fi->last_name);
299 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
300 if (IS_ERR(req))
301 return PTR_ERR(req);
302 req->r_inode = igrab(inode);
303 req->r_dentry = dget(filp->f_dentry);
304 /* hints to request -> mds selection code */
305 req->r_direct_mode = USE_AUTH_MDS;
306 req->r_direct_hash = ceph_frag_value(frag);
307 req->r_direct_is_hash = true;
308 req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
309 req->r_readdir_offset = fi->next_offset;
310 req->r_args.readdir.frag = cpu_to_le32(frag);
311 req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
312 req->r_num_caps = max_entries;
313 err = ceph_mdsc_do_request(mdsc, NULL, req);
314 if (err < 0) {
315 ceph_mdsc_put_request(req);
316 return err;
317 }
318 dout("readdir got and parsed readdir result=%d"
319 " on frag %x, end=%d, complete=%d\n", err, frag,
320 (int)req->r_reply_info.dir_end,
321 (int)req->r_reply_info.dir_complete);
322
323 if (!req->r_did_prepopulate) {
324 dout("readdir !did_prepopulate\n");
325 fi->dir_release_count--; /* preclude I_COMPLETE */
326 }
327
328 /* note next offset and last dentry name */
329 fi->offset = fi->next_offset;
330 fi->last_readdir = req;
331
332 if (req->r_reply_info.dir_end) {
333 kfree(fi->last_name);
334 fi->last_name = NULL;
335 fi->next_offset = 0;
336 } else {
337 rinfo = &req->r_reply_info;
338 err = note_last_dentry(fi,
339 rinfo->dir_dname[rinfo->dir_nr-1],
340 rinfo->dir_dname_len[rinfo->dir_nr-1]);
341 if (err)
342 return err;
343 fi->next_offset += rinfo->dir_nr;
344 }
345 }
346
347 rinfo = &fi->last_readdir->r_reply_info;
348 dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
349 rinfo->dir_nr, off, fi->offset);
350 while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
351 u64 pos = ceph_make_fpos(frag, off);
352 struct ceph_mds_reply_inode *in =
353 rinfo->dir_in[off - fi->offset].in;
354 dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
355 off, off - fi->offset, rinfo->dir_nr, pos,
356 rinfo->dir_dname_len[off - fi->offset],
357 rinfo->dir_dname[off - fi->offset], in);
358 BUG_ON(!in);
359 ftype = le32_to_cpu(in->mode) >> 12;
360 if (filldir(dirent,
361 rinfo->dir_dname[off - fi->offset],
362 rinfo->dir_dname_len[off - fi->offset],
363 pos,
364 le64_to_cpu(in->ino),
365 ftype) < 0) {
366 dout("filldir stopping us...\n");
367 return 0;
368 }
369 off++;
370 filp->f_pos = pos + 1;
371 }
372
373 if (fi->last_name) {
374 ceph_mdsc_put_request(fi->last_readdir);
375 fi->last_readdir = NULL;
376 goto more;
377 }
378
379 /* more frags? */
380 if (!ceph_frag_is_rightmost(frag)) {
381 frag = ceph_frag_next(frag);
382 off = 0;
383 filp->f_pos = ceph_make_fpos(frag, off);
384 dout("readdir next frag is %x\n", frag);
385 goto more;
386 }
387 fi->at_end = 1;
388
389 /*
390 * if dir_release_count still matches the dir, no dentries
391 * were released during the whole readdir, and we should have
392 * the complete dir contents in our cache.
393 */
394 spin_lock(&inode->i_lock);
395 if (ci->i_release_count == fi->dir_release_count) {
396 dout(" marking %p complete\n", inode);
397 ci->i_ceph_flags |= CEPH_I_COMPLETE;
398 ci->i_max_offset = filp->f_pos;
399 }
400 spin_unlock(&inode->i_lock);
401
402 dout("readdir %p filp %p done.\n", inode, filp);
403 return 0;
404}
405
406static void reset_readdir(struct ceph_file_info *fi)
407{
408 if (fi->last_readdir) {
409 ceph_mdsc_put_request(fi->last_readdir);
410 fi->last_readdir = NULL;
411 }
412 kfree(fi->last_name);
413 fi->next_offset = 2; /* compensate for . and .. */
414 if (fi->dentry) {
415 dput(fi->dentry);
416 fi->dentry = NULL;
417 }
418 fi->at_end = 0;
419}
420
421static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
422{
423 struct ceph_file_info *fi = file->private_data;
424 struct inode *inode = file->f_mapping->host;
425 loff_t old_offset = offset;
426 loff_t retval;
427
428 mutex_lock(&inode->i_mutex);
429 switch (origin) {
430 case SEEK_END:
431 offset += inode->i_size + 2; /* FIXME */
432 break;
433 case SEEK_CUR:
434 offset += file->f_pos;
435 }
436 retval = -EINVAL;
437 if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
438 if (offset != file->f_pos) {
439 file->f_pos = offset;
440 file->f_version = 0;
441 fi->at_end = 0;
442 }
443 retval = offset;
444
445 /*
446 * discard buffered readdir content on seekdir(0), or
447 * seek to new frag, or seek prior to current chunk.
448 */
449 if (offset == 0 ||
450 fpos_frag(offset) != fpos_frag(old_offset) ||
451 fpos_off(offset) < fi->offset) {
452 dout("dir_llseek dropping %p content\n", file);
453 reset_readdir(fi);
454 }
455
456 /* preclude I_COMPLETE if we did a forward seek */
457 if (offset > old_offset)
458 fi->dir_release_count--;
459 }
460 mutex_unlock(&inode->i_mutex);
461 return retval;
462}
463
464/*
465 * Process result of a lookup/open request.
466 *
467 * Mainly, make sure we return the final req->r_dentry (if it already
468 * existed) in place of the original VFS-provided dentry when they
469 * differ.
470 *
471 * Gracefully handle the case where the MDS replies with -ENOENT and
472 * no trace (which it may do, at its discretion, e.g., if it doesn't
473 * care to issue a lease on the negative dentry).
474 */
475struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
476 struct dentry *dentry, int err)
477{
478 struct ceph_client *client = ceph_client(dentry->d_sb);
479 struct inode *parent = dentry->d_parent->d_inode;
480
481 /* .snap dir? */
482 if (err == -ENOENT &&
483 ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
484 strcmp(dentry->d_name.name,
485 client->mount_args->snapdir_name) == 0) {
486 struct inode *inode = ceph_get_snapdir(parent);
487 dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
488 dentry, dentry->d_name.len, dentry->d_name.name, inode);
489 d_add(dentry, inode);
490 err = 0;
491 }
492
493 if (err == -ENOENT) {
494 /* no trace? */
495 err = 0;
496 if (!req->r_reply_info.head->is_dentry) {
497 dout("ENOENT and no trace, dentry %p inode %p\n",
498 dentry, dentry->d_inode);
499 if (dentry->d_inode) {
500 d_drop(dentry);
501 err = -ENOENT;
502 } else {
503 d_add(dentry, NULL);
504 }
505 }
506 }
507 if (err)
508 dentry = ERR_PTR(err);
509 else if (dentry != req->r_dentry)
510 dentry = dget(req->r_dentry); /* we got spliced */
511 else
512 dentry = NULL;
513 return dentry;
514}
515
516static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
517{
518 return ceph_ino(inode) == CEPH_INO_ROOT &&
519 strncmp(dentry->d_name.name, ".ceph", 5) == 0;
520}
521
522/*
523 * Look up a single dir entry. If there is a lookup intent, inform
524 * the MDS so that it gets our 'caps wanted' value in a single op.
525 */
526static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
527 struct nameidata *nd)
528{
529 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
530 struct ceph_mds_client *mdsc = &client->mdsc;
531 struct ceph_mds_request *req;
532 int op;
533 int err;
534
535 dout("lookup %p dentry %p '%.*s'\n",
536 dir, dentry, dentry->d_name.len, dentry->d_name.name);
537
538 if (dentry->d_name.len > NAME_MAX)
539 return ERR_PTR(-ENAMETOOLONG);
540
541 err = ceph_init_dentry(dentry);
542 if (err < 0)
543 return ERR_PTR(err);
544
545 /* open (but not create!) intent? */
546 if (nd &&
547 (nd->flags & LOOKUP_OPEN) &&
548 (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
549 !(nd->intent.open.flags & O_CREAT)) {
550 int mode = nd->intent.open.create_mode & ~current->fs->umask;
551 return ceph_lookup_open(dir, dentry, nd, mode, 1);
552 }
553
554 /* can we conclude ENOENT locally? */
555 if (dentry->d_inode == NULL) {
556 struct ceph_inode_info *ci = ceph_inode(dir);
557 struct ceph_dentry_info *di = ceph_dentry(dentry);
558
559 spin_lock(&dir->i_lock);
560 dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
561 if (strncmp(dentry->d_name.name,
562 client->mount_args->snapdir_name,
563 dentry->d_name.len) &&
564 !is_root_ceph_dentry(dir, dentry) &&
565 (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
566 (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
567 di->offset = ci->i_max_offset++;
568 spin_unlock(&dir->i_lock);
569 dout(" dir %p complete, -ENOENT\n", dir);
570 d_add(dentry, NULL);
571 di->lease_shared_gen = ci->i_shared_gen;
572 return NULL;
573 }
574 spin_unlock(&dir->i_lock);
575 }
576
577 op = ceph_snap(dir) == CEPH_SNAPDIR ?
578 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
579 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
580 if (IS_ERR(req))
581 return ERR_CAST(req);
582 req->r_dentry = dget(dentry);
583 req->r_num_caps = 2;
584 /* we only need inode linkage */
585 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
586 req->r_locked_dir = dir;
587 err = ceph_mdsc_do_request(mdsc, NULL, req);
588 dentry = ceph_finish_lookup(req, dentry, err);
589 ceph_mdsc_put_request(req); /* will dput(dentry) */
590 dout("lookup result=%p\n", dentry);
591 return dentry;
592}
593
594/*
595 * If we do a create but get no trace back from the MDS, follow up with
596 * a lookup (the VFS expects us to link up the provided dentry).
597 */
598int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
599{
600 struct dentry *result = ceph_lookup(dir, dentry, NULL);
601
602 if (result && !IS_ERR(result)) {
603 /*
604 * We created the item, then did a lookup, and found
605 * it was already linked to another inode we already
606 * had in our cache (and thus got spliced). Link our
607 * dentry to that inode, but don't hash it, just in
608 * case the VFS wants to dereference it.
609 */
610 BUG_ON(!result->d_inode);
611 d_instantiate(dentry, result->d_inode);
612 return 0;
613 }
614 return PTR_ERR(result);
615}
616
617static int ceph_mknod(struct inode *dir, struct dentry *dentry,
618 int mode, dev_t rdev)
619{
620 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
621 struct ceph_mds_client *mdsc = &client->mdsc;
622 struct ceph_mds_request *req;
623 int err;
624
625 if (ceph_snap(dir) != CEPH_NOSNAP)
626 return -EROFS;
627
628 dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
629 dir, dentry, mode, rdev);
630 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
631 if (IS_ERR(req)) {
632 d_drop(dentry);
633 return PTR_ERR(req);
634 }
635 req->r_dentry = dget(dentry);
636 req->r_num_caps = 2;
637 req->r_locked_dir = dir;
638 req->r_args.mknod.mode = cpu_to_le32(mode);
639 req->r_args.mknod.rdev = cpu_to_le32(rdev);
640 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
641 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
642 err = ceph_mdsc_do_request(mdsc, dir, req);
643 if (!err && !req->r_reply_info.head->is_dentry)
644 err = ceph_handle_notrace_create(dir, dentry);
645 ceph_mdsc_put_request(req);
646 if (err)
647 d_drop(dentry);
648 return err;
649}
650
651static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
652 struct nameidata *nd)
653{
654 dout("create in dir %p dentry %p name '%.*s'\n",
655 dir, dentry, dentry->d_name.len, dentry->d_name.name);
656
657 if (ceph_snap(dir) != CEPH_NOSNAP)
658 return -EROFS;
659
660 if (nd) {
661 BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
662 dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
663 /* hrm, what should i do here if we get aliased? */
664 if (IS_ERR(dentry))
665 return PTR_ERR(dentry);
666 return 0;
667 }
668
669 /* fall back to mknod */
670 return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
671}
672
673static int ceph_symlink(struct inode *dir, struct dentry *dentry,
674 const char *dest)
675{
676 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
677 struct ceph_mds_client *mdsc = &client->mdsc;
678 struct ceph_mds_request *req;
679 int err;
680
681 if (ceph_snap(dir) != CEPH_NOSNAP)
682 return -EROFS;
683
684 dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
685 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
686 if (IS_ERR(req)) {
687 d_drop(dentry);
688 return PTR_ERR(req);
689 }
690 req->r_dentry = dget(dentry);
691 req->r_num_caps = 2;
692 req->r_path2 = kstrdup(dest, GFP_NOFS);
693 req->r_locked_dir = dir;
694 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
695 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
696 err = ceph_mdsc_do_request(mdsc, dir, req);
697 if (!err && !req->r_reply_info.head->is_dentry)
698 err = ceph_handle_notrace_create(dir, dentry);
699 ceph_mdsc_put_request(req);
700 if (err)
701 d_drop(dentry);
702 return err;
703}
704
705static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
706{
707 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
708 struct ceph_mds_client *mdsc = &client->mdsc;
709 struct ceph_mds_request *req;
710 int err = -EROFS;
711 int op;
712
713 if (ceph_snap(dir) == CEPH_SNAPDIR) {
714 /* mkdir .snap/foo is a MKSNAP */
715 op = CEPH_MDS_OP_MKSNAP;
716 dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
717 dentry->d_name.len, dentry->d_name.name, dentry);
718 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
719 dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
720 op = CEPH_MDS_OP_MKDIR;
721 } else {
722 goto out;
723 }
724 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
725 if (IS_ERR(req)) {
726 err = PTR_ERR(req);
727 goto out;
728 }
729
730 req->r_dentry = dget(dentry);
731 req->r_num_caps = 2;
732 req->r_locked_dir = dir;
733 req->r_args.mkdir.mode = cpu_to_le32(mode);
734 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
735 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
736 err = ceph_mdsc_do_request(mdsc, dir, req);
737 if (!err && !req->r_reply_info.head->is_dentry)
738 err = ceph_handle_notrace_create(dir, dentry);
739 ceph_mdsc_put_request(req);
740out:
741 if (err < 0)
742 d_drop(dentry);
743 return err;
744}
745
746static int ceph_link(struct dentry *old_dentry, struct inode *dir,
747 struct dentry *dentry)
748{
749 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
750 struct ceph_mds_client *mdsc = &client->mdsc;
751 struct ceph_mds_request *req;
752 int err;
753
754 if (ceph_snap(dir) != CEPH_NOSNAP)
755 return -EROFS;
756
757 dout("link in dir %p old_dentry %p dentry %p\n", dir,
758 old_dentry, dentry);
759 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
760 if (IS_ERR(req)) {
761 d_drop(dentry);
762 return PTR_ERR(req);
763 }
764 req->r_dentry = dget(dentry);
765 req->r_num_caps = 2;
766 req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
767 req->r_locked_dir = dir;
768 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
769 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
770 err = ceph_mdsc_do_request(mdsc, dir, req);
771 if (err)
772 d_drop(dentry);
773 else if (!req->r_reply_info.head->is_dentry)
774 d_instantiate(dentry, igrab(old_dentry->d_inode));
775 ceph_mdsc_put_request(req);
776 return err;
777}
778
779/*
780 * For a soon-to-be unlinked file, drop the LINK_SHARED/EXCL caps. If it
781 * looks like the link count will hit 0, drop any other caps (other
782 * than PIN) we don't specifically want (due to the file still being
783 * open).
784 */
785static int drop_caps_for_unlink(struct inode *inode)
786{
787 struct ceph_inode_info *ci = ceph_inode(inode);
788 int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
789
790 spin_lock(&inode->i_lock);
791 if (inode->i_nlink == 1) {
792 drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
793 ci->i_ceph_flags |= CEPH_I_NODELAY;
794 }
795 spin_unlock(&inode->i_lock);
796 return drop;
797}
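/*
 * Worked illustration (hypothetical cap values): with i_nlink == 1
 * and the file still open for read, __ceph_caps_wanted() might
 * return FILE_RD|FILE_CACHE; drop then covers LINK_SHARED|LINK_EXCL
 * plus every cap bit outside FILE_RD|FILE_CACHE|PIN, so only the
 * caps the open file still needs (and PIN) survive the unlink.
 */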
798
799/*
800 * rmdir and unlink differ only in the metadata op code
801 */
802static int ceph_unlink(struct inode *dir, struct dentry *dentry)
803{
804 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
805 struct ceph_mds_client *mdsc = &client->mdsc;
806 struct inode *inode = dentry->d_inode;
807 struct ceph_mds_request *req;
808 int err = -EROFS;
809 int op;
810
811 if (ceph_snap(dir) == CEPH_SNAPDIR) {
812 /* rmdir .snap/foo is RMSNAP */
813 dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
814 dentry->d_name.name, dentry);
815 op = CEPH_MDS_OP_RMSNAP;
816 } else if (ceph_snap(dir) == CEPH_NOSNAP) {
817 dout("unlink/rmdir dir %p dn %p inode %p\n",
818 dir, dentry, inode);
819 op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
820 CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
821 } else
822 goto out;
823 req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
824 if (IS_ERR(req)) {
825 err = PTR_ERR(req);
826 goto out;
827 }
828 req->r_dentry = dget(dentry);
829 req->r_num_caps = 2;
830 req->r_locked_dir = dir;
831 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
832 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
833 req->r_inode_drop = drop_caps_for_unlink(inode);
834 err = ceph_mdsc_do_request(mdsc, dir, req);
835 if (!err && !req->r_reply_info.head->is_dentry)
836 d_delete(dentry);
837 ceph_mdsc_put_request(req);
838out:
839 return err;
840}
841
842static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
843 struct inode *new_dir, struct dentry *new_dentry)
844{
845 struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
846 struct ceph_mds_client *mdsc = &client->mdsc;
847 struct ceph_mds_request *req;
848 int err;
849
850 if (ceph_snap(old_dir) != ceph_snap(new_dir))
851 return -EXDEV;
852 if (ceph_snap(old_dir) != CEPH_NOSNAP ||
853 ceph_snap(new_dir) != CEPH_NOSNAP)
854 return -EROFS;
855 dout("rename dir %p dentry %p to dir %p dentry %p\n",
856 old_dir, old_dentry, new_dir, new_dentry);
857 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);
858 if (IS_ERR(req))
859 return PTR_ERR(req);
860 req->r_dentry = dget(new_dentry);
861 req->r_num_caps = 2;
862 req->r_old_dentry = dget(old_dentry);
863 req->r_locked_dir = new_dir;
864 req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
865 req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
866 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
867 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
868 /* release LINK_RDCACHE on source inode (mds will lock it) */
869 req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
870 if (new_dentry->d_inode)
871 req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
872 err = ceph_mdsc_do_request(mdsc, old_dir, req);
873 if (!err && !req->r_reply_info.head->is_dentry) {
874 /*
875 * Normally d_move() is done by fill_trace (called by
876 * do_request, above). If there is no trace, we need
877 * to do it here.
878 */
879 d_move(old_dentry, new_dentry);
880 }
881 ceph_mdsc_put_request(req);
882 return err;
883}
884
885
886/*
887 * Check if the dentry lease is valid. If not, delete the lease. Try
888 * to renew if the lease is more than half up.
889 */
890static int dentry_lease_is_valid(struct dentry *dentry)
891{
892 struct ceph_dentry_info *di;
893 struct ceph_mds_session *s;
894 int valid = 0;
895 u32 gen;
896 unsigned long ttl;
897 struct ceph_mds_session *session = NULL;
898 struct inode *dir = NULL;
899 u32 seq = 0;
900
901 spin_lock(&dentry->d_lock);
902 di = ceph_dentry(dentry);
903 if (di && di->lease_session) {
904 s = di->lease_session;
905 spin_lock(&s->s_cap_lock);
906 gen = s->s_cap_gen;
907 ttl = s->s_cap_ttl;
908 spin_unlock(&s->s_cap_lock);
909
910 if (di->lease_gen == gen &&
911 time_before(jiffies, dentry->d_time) &&
912 time_before(jiffies, ttl)) {
913 valid = 1;
914 if (di->lease_renew_after &&
915 time_after(jiffies, di->lease_renew_after)) {
916 /* we should renew */
917 dir = dentry->d_parent->d_inode;
918 session = ceph_get_mds_session(s);
919 seq = di->lease_seq;
920 di->lease_renew_after = 0;
921 di->lease_renew_from = jiffies;
922 }
923 }
924 }
925 spin_unlock(&dentry->d_lock);
926
927 if (session) {
928 ceph_mdsc_lease_send_msg(session, dir, dentry,
929 CEPH_MDS_LEASE_RENEW, seq);
930 ceph_put_mds_session(session);
931 }
932 dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
933 return valid;
934}
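/*
 * Timing sketch of the half-up policy above (assuming the lease-grant
 * path elsewhere in this patch fills these fields in): for a lease
 * granted at jiffies time 'from' lasting 'duration',
 *
 *	dentry->d_time        = from + duration;
 *	di->lease_renew_after = from + duration / 2;
 *
 * so the renew message fires once the lease is more than half up but
 * before it expires.
 */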
935
936/*
937 * Check if directory-wide content lease/cap is valid.
938 */
939static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
940{
941 struct ceph_inode_info *ci = ceph_inode(dir);
942 struct ceph_dentry_info *di = ceph_dentry(dentry);
943 int valid = 0;
944
945 spin_lock(&dir->i_lock);
946 if (ci->i_shared_gen == di->lease_shared_gen)
947 valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
948 spin_unlock(&dir->i_lock);
949 dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
950 dir, (unsigned)ci->i_shared_gen, dentry,
951 (unsigned)di->lease_shared_gen, valid);
952 return valid;
953}
954
955/*
956 * Check if cached dentry can be trusted.
957 */
958static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
959{
960 struct inode *dir = dentry->d_parent->d_inode;
961
962 dout("d_revalidate %p '%.*s' inode %p\n", dentry,
963 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
964
965 /* always trust cached snapped dentries, snapdir dentry */
966 if (ceph_snap(dir) != CEPH_NOSNAP) {
967 dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
968 dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
969 goto out_touch;
970 }
971 if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
972 goto out_touch;
973
974 if (dentry_lease_is_valid(dentry) ||
975 dir_lease_is_valid(dir, dentry))
976 goto out_touch;
977
978 dout("d_revalidate %p invalid\n", dentry);
979 d_drop(dentry);
980 return 0;
981out_touch:
982 ceph_dentry_lru_touch(dentry);
983 return 1;
984}
985
986/*
987 * When a dentry is released, clear the dir I_COMPLETE if it was part
988 * of the current dir gen.
989 */
990static void ceph_dentry_release(struct dentry *dentry)
991{
992 struct ceph_dentry_info *di = ceph_dentry(dentry);
993 struct inode *parent_inode = dentry->d_parent->d_inode;
994
995 if (parent_inode) {
996 struct ceph_inode_info *ci = ceph_inode(parent_inode);
997
998 spin_lock(&parent_inode->i_lock);
999 if (ci->i_shared_gen == di->lease_shared_gen) {
1000 dout(" clearing %p complete (d_release)\n",
1001 parent_inode);
1002 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1003 ci->i_release_count++;
1004 }
1005 spin_unlock(&parent_inode->i_lock);
1006 }
1007 if (di) {
1008 ceph_dentry_lru_del(dentry);
1009 if (di->lease_session)
1010 ceph_put_mds_session(di->lease_session);
1011 kmem_cache_free(ceph_dentry_cachep, di);
1012 dentry->d_fsdata = NULL;
1013 }
1014}
1015
1016static int ceph_snapdir_d_revalidate(struct dentry *dentry,
1017 struct nameidata *nd)
1018{
1019 /*
1020 * Eventually, we'll want to revalidate snapped metadata
1021 * too... probably...
1022 */
1023 return 1;
1024}
1025
1026
1027
1028/*
1029 * read() on a dir. This weird interface hack only works if mounted
1030 * with '-o dirstat'.
1031 */
1032static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1033 loff_t *ppos)
1034{
1035 struct ceph_file_info *cf = file->private_data;
1036 struct inode *inode = file->f_dentry->d_inode;
1037 struct ceph_inode_info *ci = ceph_inode(inode);
1038 int left;
1039
1040 if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
1041 return -EISDIR;
1042
1043 if (!cf->dir_info) {
1044 cf->dir_info = kmalloc(1024, GFP_NOFS);
1045 if (!cf->dir_info)
1046 return -ENOMEM;
1047 cf->dir_info_len =
1048 sprintf(cf->dir_info,
1049 "entries: %20lld\n"
1050 " files: %20lld\n"
1051 " subdirs: %20lld\n"
1052 "rentries: %20lld\n"
1053 " rfiles: %20lld\n"
1054 " rsubdirs: %20lld\n"
1055 "rbytes: %20lld\n"
1056 "rctime: %10ld.%09ld\n",
1057 ci->i_files + ci->i_subdirs,
1058 ci->i_files,
1059 ci->i_subdirs,
1060 ci->i_rfiles + ci->i_rsubdirs,
1061 ci->i_rfiles,
1062 ci->i_rsubdirs,
1063 ci->i_rbytes,
1064 (long)ci->i_rctime.tv_sec,
1065 (long)ci->i_rctime.tv_nsec);
1066 }
1067
1068 if (*ppos >= cf->dir_info_len)
1069 return 0;
1070 size = min_t(unsigned, size, cf->dir_info_len-*ppos);
1071 left = copy_to_user(buf, cf->dir_info + *ppos, size);
1072 if (left == size)
1073 return -EFAULT;
1074 *ppos += (size - left);
1075 return size - left;
1076}
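/*
 * Usage note: on a '-o dirstat' mount, a plain read(2) on an open
 * directory fd returns the formatted stats built above; without the
 * option the usual -EISDIR is returned.
 */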
1077
1078/*
1079 * an fsync() on a dir will wait for any uncommitted directory
1080 * operations to commit.
1081 */
1082static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
1083 int datasync)
1084{
1085 struct inode *inode = dentry->d_inode;
1086 struct ceph_inode_info *ci = ceph_inode(inode);
1087 struct list_head *head = &ci->i_unsafe_dirops;
1088 struct ceph_mds_request *req;
1089 u64 last_tid;
1090 int ret = 0;
1091
1092 dout("dir_fsync %p\n", inode);
1093 spin_lock(&ci->i_unsafe_lock);
1094 if (list_empty(head))
1095 goto out;
1096
1097 req = list_entry(head->prev,
1098 struct ceph_mds_request, r_unsafe_dir_item);
1099 last_tid = req->r_tid;
1100
1101 do {
1102 ceph_mdsc_get_request(req);
1103 spin_unlock(&ci->i_unsafe_lock);
1104 dout("dir_fsync %p wait on tid %llu (until %llu)\n",
1105 inode, req->r_tid, last_tid);
1106 if (req->r_timeout) {
1107 ret = wait_for_completion_timeout(
1108 &req->r_safe_completion, req->r_timeout);
1109 if (ret > 0)
1110 ret = 0;
1111 else if (ret == 0)
1112 ret = -EIO; /* timed out */
1113 } else {
1114 wait_for_completion(&req->r_safe_completion);
1115 }
1116 spin_lock(&ci->i_unsafe_lock);
1117 ceph_mdsc_put_request(req);
1118
1119 if (ret || list_empty(head))
1120 break;
1121 req = list_entry(head->next,
1122 struct ceph_mds_request, r_unsafe_dir_item);
1123 } while (req->r_tid < last_tid);
1124out:
1125 spin_unlock(&ci->i_unsafe_lock);
1126 return ret;
1127}
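/*
 * Note on the loop above: last_tid snapshots the newest unsafe op at
 * entry, so the wait covers everything submitted before the fsync
 * without chasing operations queued afterwards.
 */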
1128
1129/*
1130 * We maintain a private dentry LRU.
1131 *
1132 * FIXME: this needs to be changed to a per-mds lru to be useful.
1133 */
1134void ceph_dentry_lru_add(struct dentry *dn)
1135{
1136 struct ceph_dentry_info *di = ceph_dentry(dn);
1137 struct ceph_mds_client *mdsc;
1138
1139 dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
1140 dn->d_name.len, dn->d_name.name);
1141 if (di) {
1142 mdsc = &ceph_client(dn->d_sb)->mdsc;
1143 spin_lock(&mdsc->dentry_lru_lock);
1144 list_add_tail(&di->lru, &mdsc->dentry_lru);
1145 mdsc->num_dentry++;
1146 spin_unlock(&mdsc->dentry_lru_lock);
1147 }
1148}
1149
1150void ceph_dentry_lru_touch(struct dentry *dn)
1151{
1152 struct ceph_dentry_info *di = ceph_dentry(dn);
1153 struct ceph_mds_client *mdsc;
1154
1155 dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
1156 dn->d_name.len, dn->d_name.name);
1157 if (di) {
1158 mdsc = &ceph_client(dn->d_sb)->mdsc;
1159 spin_lock(&mdsc->dentry_lru_lock);
1160 list_move_tail(&di->lru, &mdsc->dentry_lru);
1161 spin_unlock(&mdsc->dentry_lru_lock);
1162 }
1163}
1164
1165void ceph_dentry_lru_del(struct dentry *dn)
1166{
1167 struct ceph_dentry_info *di = ceph_dentry(dn);
1168 struct ceph_mds_client *mdsc;
1169
1170 dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
1171 dn->d_name.len, dn->d_name.name);
1172 if (di) {
1173 mdsc = &ceph_client(dn->d_sb)->mdsc;
1174 spin_lock(&mdsc->dentry_lru_lock);
1175 list_del_init(&di->lru);
1176 mdsc->num_dentry--;
1177 spin_unlock(&mdsc->dentry_lru_lock);
1178 }
1179}
1180
1181const struct file_operations ceph_dir_fops = {
1182 .read = ceph_read_dir,
1183 .readdir = ceph_readdir,
1184 .llseek = ceph_dir_llseek,
1185 .open = ceph_open,
1186 .release = ceph_release,
1187 .unlocked_ioctl = ceph_ioctl,
1188 .fsync = ceph_dir_fsync,
1189};
1190
1191const struct inode_operations ceph_dir_iops = {
1192 .lookup = ceph_lookup,
1193 .permission = ceph_permission,
1194 .getattr = ceph_getattr,
1195 .setattr = ceph_setattr,
1196 .setxattr = ceph_setxattr,
1197 .getxattr = ceph_getxattr,
1198 .listxattr = ceph_listxattr,
1199 .removexattr = ceph_removexattr,
1200 .mknod = ceph_mknod,
1201 .symlink = ceph_symlink,
1202 .mkdir = ceph_mkdir,
1203 .link = ceph_link,
1204 .unlink = ceph_unlink,
1205 .rmdir = ceph_unlink,
1206 .rename = ceph_rename,
1207 .create = ceph_create,
1208};
1209
1210struct dentry_operations ceph_dentry_ops = {
1211 .d_revalidate = ceph_d_revalidate,
1212 .d_release = ceph_dentry_release,
1213};
1214
1215struct dentry_operations ceph_snapdir_dentry_ops = {
1216 .d_revalidate = ceph_snapdir_d_revalidate,
1217};
1218
1219struct dentry_operations ceph_snap_dentry_ops = {
1220};
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
new file mode 100644
index 000000000000..fc68e39cbad6
--- /dev/null
+++ b/fs/ceph/export.c
@@ -0,0 +1,223 @@
1#include "ceph_debug.h"
2
3#include <linux/exportfs.h>
4#include <asm/unaligned.h>
5
6#include "super.h"
7
8/*
9 * NFS export support
10 *
11 * NFS re-export of a ceph mount is, at present, only semireliable.
12 * The basic issue is that the Ceph architecture doesn't lend itself
13 * well to generating filehandles that will remain valid forever.
14 *
15 * So, we do our best. If you're lucky, your inode will be in the
16 * client's cache. If it's not, and you have a connectable fh, then
17 * the MDS server may be able to find it for you. Otherwise, you get
18 * ESTALE.
19 *
20 * There are ways to make this more reliable, but in the non-connectable
21 * fh case, it won't ever work perfectly, and in the connectable case,
22 * some changes are needed on the MDS side to work better.
23 */
24
25/*
26 * Basic fh
27 */
28struct ceph_nfs_fh {
29 u64 ino;
30} __attribute__ ((packed));
31
32/*
33 * Larger 'connectable' fh that includes parent ino and name hash.
34 * Use this whenever possible, as it works more reliably.
35 */
36struct ceph_nfs_confh {
37 u64 ino, parent_ino;
38 u32 parent_name_hash;
39} __attribute__ ((packed));
40
41static int ceph_encode_fh(struct dentry *dentry, u32 *rawfh, int *max_len,
42 int connectable)
43{
44 struct ceph_nfs_fh *fh = (void *)rawfh;
45 struct ceph_nfs_confh *cfh = (void *)rawfh;
46 struct dentry *parent = dentry->d_parent;
47 struct inode *inode = dentry->d_inode;
48 int type;
49
50 /* don't re-export snaps */
51 if (ceph_snap(inode) != CEPH_NOSNAP)
52 return -EINVAL;
53
54 if (*max_len >= sizeof(*cfh)) {
55 dout("encode_fh %p connectable\n", dentry);
56 cfh->ino = ceph_ino(dentry->d_inode);
57 cfh->parent_ino = ceph_ino(parent->d_inode);
58 cfh->parent_name_hash = parent->d_name.hash;
59 *max_len = sizeof(*cfh);
60 type = 2;
61 } else if (*max_len > sizeof(*fh)) {
62 if (connectable)
63 return -ENOSPC;
64 dout("encode_fh %p\n", dentry);
65 fh->ino = ceph_ino(dentry->d_inode);
66 *max_len = sizeof(*fh);
67 type = 1;
68 } else {
69 return -ENOSPC;
70 }
71 return type;
72}
73
74/*
75 * convert regular fh to dentry
76 *
77 * FIXME: we should try harder by querying the mds for the ino.
78 */
79static struct dentry *__fh_to_dentry(struct super_block *sb,
80 struct ceph_nfs_fh *fh)
81{
82 struct inode *inode;
83 struct dentry *dentry;
84 struct ceph_vino vino;
85 int err;
86
87 dout("__fh_to_dentry %llx\n", fh->ino);
88 vino.ino = fh->ino;
89 vino.snap = CEPH_NOSNAP;
90 inode = ceph_find_inode(sb, vino);
91 if (!inode)
92 return ERR_PTR(-ESTALE);
93
94 dentry = d_obtain_alias(inode);
95 if (IS_ERR(dentry)) {
96 /* d_obtain_alias() drops the inode ref on failure */
97 pr_err("fh_to_dentry %llx -- error %ld\n",
98 fh->ino, PTR_ERR(dentry));
99 return ERR_CAST(dentry);
100 }
101 err = ceph_init_dentry(dentry);
102
103 if (err < 0) {
104 iput(inode);
105 return ERR_PTR(err);
106 }
107 dout("__fh_to_dentry %llx %p dentry %p\n", fh->ino, inode, dentry);
108 return dentry;
109}
110
111/*
112 * convert connectable fh to dentry
113 */
114static struct dentry *__cfh_to_dentry(struct super_block *sb,
115 struct ceph_nfs_confh *cfh)
116{
117 struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
118 struct inode *inode;
119 struct dentry *dentry;
120 struct ceph_vino vino;
121 int err;
122
123 dout("__cfh_to_dentry %llx (%llx/%x)\n",
124 cfh->ino, cfh->parent_ino, cfh->parent_name_hash);
125
126 vino.ino = cfh->ino;
127 vino.snap = CEPH_NOSNAP;
128 inode = ceph_find_inode(sb, vino);
129 if (!inode) {
130 struct ceph_mds_request *req;
131
132 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
133 USE_ANY_MDS);
134 if (IS_ERR(req))
135 return ERR_CAST(req);
136
137 req->r_ino1 = vino;
138 req->r_ino2.ino = cfh->parent_ino;
139 req->r_ino2.snap = CEPH_NOSNAP;
140 req->r_path2 = kmalloc(16, GFP_NOFS);
141 snprintf(req->r_path2, 16, "%d", cfh->parent_name_hash);
142 req->r_num_caps = 1;
143 err = ceph_mdsc_do_request(mdsc, NULL, req);
144 ceph_mdsc_put_request(req);
145 inode = ceph_find_inode(sb, vino);
146 if (!inode)
147 return ERR_PTR(err ? err : -ESTALE);
148 }
149
150 dentry = d_obtain_alias(inode);
151 if (IS_ERR(dentry)) {
152 /* d_obtain_alias() drops the inode ref on failure */
153 pr_err("cfh_to_dentry %llx -- error %ld\n",
154 cfh->ino, PTR_ERR(dentry));
155 return ERR_CAST(dentry);
156 }
157 err = ceph_init_dentry(dentry);
158 if (err < 0) {
159 iput(inode);
160 return ERR_PTR(err);
161 }
162 dout("__cfh_to_dentry %llx %p dentry %p\n", cfh->ino, inode, dentry);
163 return dentry;
164}
165
166static struct dentry *ceph_fh_to_dentry(struct super_block *sb, struct fid *fid,
167 int fh_len, int fh_type)
168{
169 if (fh_type == 1)
170 return __fh_to_dentry(sb, (struct ceph_nfs_fh *)fid->raw);
171 else
172 return __cfh_to_dentry(sb, (struct ceph_nfs_confh *)fid->raw);
173}
174
175/*
176 * get parent, if possible.
177 *
178 * FIXME: we could do better by querying the mds to discover the
179 * parent.
180 */
181static struct dentry *ceph_fh_to_parent(struct super_block *sb,
182 struct fid *fid,
183 int fh_len, int fh_type)
184{
185 struct ceph_nfs_confh *cfh = (void *)fid->raw;
186 struct ceph_vino vino;
187 struct inode *inode;
188 struct dentry *dentry;
189 int err;
190
191 if (fh_type == 1)
192 return ERR_PTR(-ESTALE);
193
194 pr_debug("fh_to_parent %llx/%d\n", cfh->parent_ino,
195 cfh->parent_name_hash);
196
197 vino.ino = cfh->parent_ino;
198 vino.snap = CEPH_NOSNAP;
199 inode = ceph_find_inode(sb, vino);
200 if (!inode)
201 return ERR_PTR(-ESTALE);
202
203 dentry = d_obtain_alias(inode);
204 if (IS_ERR(dentry)) {
205 /* d_obtain_alias() drops the inode ref on failure */
206 pr_err("fh_to_parent %llx -- error %ld\n",
207 cfh->parent_ino, PTR_ERR(dentry));
208 return ERR_CAST(dentry);
209 }
210 err = ceph_init_dentry(dentry);
211 if (err < 0) {
212 iput(inode);
213 return ERR_PTR(err);
214 }
215 dout("fh_to_parent %llx %p dentry %p\n", cfh->parent_ino, inode, dentry);
216 return dentry;
217}
218
219const struct export_operations ceph_export_ops = {
220 .encode_fh = ceph_encode_fh,
221 .fh_to_dentry = ceph_fh_to_dentry,
222 .fh_to_parent = ceph_fh_to_parent,
223};
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
new file mode 100644
index 000000000000..5d2af8464f6a
--- /dev/null
+++ b/fs/ceph/file.c
@@ -0,0 +1,937 @@
1#include "ceph_debug.h"
2
3#include <linux/sched.h>
4#include <linux/file.h>
5#include <linux/namei.h>
6#include <linux/writeback.h>
7
8#include "super.h"
9#include "mds_client.h"
10
11/*
12 * Ceph file operations
13 *
14 * Implement basic open/close functionality, and implement
15 * read/write.
16 *
17 * We implement three modes of file I/O:
18 * - buffered uses the generic_file_aio_{read,write} helpers
19 *
20 * - synchronous is used when there is multi-client read/write
21 * sharing, avoids the page cache, and synchronously waits for an
22 * ack from the OSD.
23 *
24 * - direct io takes the variant of the sync path that references
25 * user pages directly.
26 *
27 * fsync() flushes and waits on dirty pages, but just queues metadata
28 * for writeback: since the MDS can recover size and mtime there is no
29 * need to wait for MDS acknowledgement.
30 */
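/*
 * A condensed sketch (hypothetical helper; the real checks are
 * inlined in ceph_aio_read()/ceph_aio_write() below) of how the mode
 * is chosen: sync I/O whenever the needed CACHE (read) or BUFFER
 * (write) cap is missing, O_DIRECT is set, or the mount is
 * synchronous; otherwise the generic buffered helpers run.
 */
static inline int ceph_want_sync_io(struct inode *inode, struct file *file,
				    int got, int cap)
{
	return (got & cap) == 0 ||			/* no CACHE/BUFFER cap */
	       (file->f_flags & O_DIRECT) ||		/* O_DIRECT requested */
	       (inode->i_sb->s_flags & MS_SYNCHRONOUS);	/* sync mount */
}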
31
32
33/*
34 * Prepare an open request. Preallocate ceph_cap to avoid an
35 * inopportune ENOMEM later.
36 */
37static struct ceph_mds_request *
38prepare_open_request(struct super_block *sb, int flags, int create_mode)
39{
40 struct ceph_client *client = ceph_sb_to_client(sb);
41 struct ceph_mds_client *mdsc = &client->mdsc;
42 struct ceph_mds_request *req;
43 int want_auth = USE_ANY_MDS;
44 int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN;
45
46 if (flags & (O_WRONLY|O_RDWR|O_CREAT|O_TRUNC))
47 want_auth = USE_AUTH_MDS;
48
49 req = ceph_mdsc_create_request(mdsc, op, want_auth);
50 if (IS_ERR(req))
51 goto out;
52 req->r_fmode = ceph_flags_to_mode(flags);
53 req->r_args.open.flags = cpu_to_le32(flags);
54 req->r_args.open.mode = cpu_to_le32(create_mode);
55 req->r_args.open.preferred = cpu_to_le32(-1);
56out:
57 return req;
58}
59
60/*
61 * initialize private struct file data.
62 * if we fail, clean up by dropping fmode reference on the ceph_inode
63 */
64static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
65{
66 struct ceph_file_info *cf;
67 int ret = 0;
68
69 switch (inode->i_mode & S_IFMT) {
70 case S_IFREG:
71 case S_IFDIR:
72 dout("init_file %p %p 0%o (regular)\n", inode, file,
73 inode->i_mode);
74 cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO);
75 if (cf == NULL) {
76 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
77 return -ENOMEM;
78 }
79 cf->fmode = fmode;
80 cf->next_offset = 2;
81 file->private_data = cf;
82 BUG_ON(inode->i_fop->release != ceph_release);
83 break;
84
85 case S_IFLNK:
86 dout("init_file %p %p 0%o (symlink)\n", inode, file,
87 inode->i_mode);
88 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
89 break;
90
91 default:
92 dout("init_file %p %p 0%o (special)\n", inode, file,
93 inode->i_mode);
94 /*
95 * we need to drop the open ref now, since we don't
96 * have .release set to ceph_release.
97 */
98 ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
99 BUG_ON(inode->i_fop->release == ceph_release);
100
101 /* call the proper open fop */
102 ret = inode->i_fop->open(inode, file);
103 }
104 return ret;
105}
106
107/*
108 * If the filp already has private_data, that means the file was
109 * already opened by intent during lookup, and we do nothing.
110 *
111 * If we already have the requisite capabilities, we can satisfy
112 * the open request locally (no need to request new caps from the
113 * MDS). We do, however, need to inform the MDS (asynchronously)
114 * if our wanted caps set expands.
115 */
116int ceph_open(struct inode *inode, struct file *file)
117{
118 struct ceph_inode_info *ci = ceph_inode(inode);
119 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
120 struct ceph_mds_client *mdsc = &client->mdsc;
121 struct ceph_mds_request *req;
122 struct ceph_file_info *cf = file->private_data;
123 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
124 int err;
125 int flags, fmode, wanted;
126
127 if (cf) {
128 dout("open file %p is already opened\n", file);
129 return 0;
130 }
131
132 /* filter out O_CREAT|O_EXCL; vfs did that already. yuck. */
133 flags = file->f_flags & ~(O_CREAT|O_EXCL);
134 if (S_ISDIR(inode->i_mode))
135 flags = O_DIRECTORY; /* mds likes to know */
136
137 dout("open inode %p ino %llx.%llx file %p flags %d (%d)\n", inode,
138 ceph_vinop(inode), file, flags, file->f_flags);
139 fmode = ceph_flags_to_mode(flags);
140 wanted = ceph_caps_for_mode(fmode);
141
142 /* snapped files are read-only */
143 if (ceph_snap(inode) != CEPH_NOSNAP && (file->f_mode & FMODE_WRITE))
144 return -EROFS;
145
146 /* trivially open snapdir */
147 if (ceph_snap(inode) == CEPH_SNAPDIR) {
148 spin_lock(&inode->i_lock);
149 __ceph_get_fmode(ci, fmode);
150 spin_unlock(&inode->i_lock);
151 return ceph_init_file(inode, file, fmode);
152 }
153
154 /*
155 * No need to block if we have any caps. Update wanted set
156 * asynchronously.
157 */
158 spin_lock(&inode->i_lock);
159 if (__ceph_is_any_real_caps(ci)) {
160 int mds_wanted = __ceph_caps_mds_wanted(ci);
161 int issued = __ceph_caps_issued(ci, NULL);
162
163 dout("open %p fmode %d want %s issued %s using existing\n",
164 inode, fmode, ceph_cap_string(wanted),
165 ceph_cap_string(issued));
166 __ceph_get_fmode(ci, fmode);
167 spin_unlock(&inode->i_lock);
168
169 /* adjust wanted? */
170 if ((issued & wanted) != wanted &&
171 (mds_wanted & wanted) != wanted &&
172 ceph_snap(inode) != CEPH_SNAPDIR)
173 ceph_check_caps(ci, 0, NULL);
174
175 return ceph_init_file(inode, file, fmode);
176 } else if (ceph_snap(inode) != CEPH_NOSNAP &&
177 (ci->i_snap_caps & wanted) == wanted) {
178 __ceph_get_fmode(ci, fmode);
179 spin_unlock(&inode->i_lock);
180 return ceph_init_file(inode, file, fmode);
181 }
182 spin_unlock(&inode->i_lock);
183
184 dout("open fmode %d wants %s\n", fmode, ceph_cap_string(wanted));
185 req = prepare_open_request(inode->i_sb, flags, 0);
186 if (IS_ERR(req)) {
187 err = PTR_ERR(req);
188 goto out;
189 }
190 req->r_inode = igrab(inode);
191 req->r_num_caps = 1;
192 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
193 if (!err)
194 err = ceph_init_file(inode, file, req->r_fmode);
195 ceph_mdsc_put_request(req);
196 dout("open result=%d on %llx.%llx\n", err, ceph_vinop(inode));
197out:
198 return err;
199}
200
201
202/*
203 * Do a lookup + open with a single request.
204 *
205 * If this succeeds, but some subsequent check in the vfs
206 * may_open() fails, the struct *file gets cleaned up (i.e.
207 * ceph_release gets called). So fear not!
208 */
209/*
210 * flags
211 * path_lookup_open -> LOOKUP_OPEN
212 * path_lookup_create -> LOOKUP_OPEN|LOOKUP_CREATE
213 */
214struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
215 struct nameidata *nd, int mode,
216 int locked_dir)
217{
218 struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
219 struct ceph_mds_client *mdsc = &client->mdsc;
220 struct file *file = nd->intent.open.file;
221 struct inode *parent_inode = get_dentry_parent_inode(file->f_dentry);
222 struct ceph_mds_request *req;
223 int err;
224 int flags = nd->intent.open.flags - 1; /* silly vfs! */
225
226 dout("ceph_lookup_open dentry %p '%.*s' flags %d mode 0%o\n",
227 dentry, dentry->d_name.len, dentry->d_name.name, flags, mode);
228
229 /* do the open */
230 req = prepare_open_request(dir->i_sb, flags, mode);
231 if (IS_ERR(req))
232 return ERR_CAST(req);
233 req->r_dentry = dget(dentry);
234 req->r_num_caps = 2;
235 if (flags & O_CREAT) {
236 req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
237 req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
238 }
239 req->r_locked_dir = dir; /* caller holds dir->i_mutex */
240 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
241 dentry = ceph_finish_lookup(req, dentry, err);
242 if (!err && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
243 err = ceph_handle_notrace_create(dir, dentry);
244 if (!err)
245 err = ceph_init_file(req->r_dentry->d_inode, file,
246 req->r_fmode);
247 ceph_mdsc_put_request(req);
248 dout("ceph_lookup_open result=%p\n", dentry);
249 return dentry;
250}
251
252int ceph_release(struct inode *inode, struct file *file)
253{
254 struct ceph_inode_info *ci = ceph_inode(inode);
255 struct ceph_file_info *cf = file->private_data;
256
257 dout("release inode %p file %p\n", inode, file);
258 ceph_put_fmode(ci, cf->fmode);
259 if (cf->last_readdir)
260 ceph_mdsc_put_request(cf->last_readdir);
261 kfree(cf->last_name);
262 kfree(cf->dir_info);
263 dput(cf->dentry);
264 kmem_cache_free(ceph_file_cachep, cf);
265
266 /* wake up anyone waiting for caps on this inode */
267 wake_up(&ci->i_cap_wq);
268 return 0;
269}
270
271/*
272 * build a vector of user pages
273 */
274static struct page **get_direct_page_vector(const char __user *data,
275 int num_pages,
276 loff_t off, size_t len)
277{
278 struct page **pages;
279 int rc;
280
281 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
282 if (!pages)
283 return ERR_PTR(-ENOMEM);
284
285 down_read(&current->mm->mmap_sem);
286 rc = get_user_pages(current, current->mm, (unsigned long)data,
287 num_pages, 0, 0, pages, NULL);
288 up_read(&current->mm->mmap_sem);
289 if (rc < 0)
290 goto fail;
291 return pages;
292
293fail:
294 kfree(pages);
295 return ERR_PTR(rc);
296}
297
298static void put_page_vector(struct page **pages, int num_pages)
299{
300 int i;
301
302 for (i = 0; i < num_pages; i++)
303 put_page(pages[i]);
304 kfree(pages);
305}
306
307void ceph_release_page_vector(struct page **pages, int num_pages)
308{
309 int i;
310
311 for (i = 0; i < num_pages; i++)
312 __free_pages(pages[i], 0);
313 kfree(pages);
314}
315
316/*
317 * allocate a vector of new pages
318 */
319static struct page **alloc_page_vector(int num_pages)
320{
321 struct page **pages;
322 int i;
323
324 pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
325 if (!pages)
326 return ERR_PTR(-ENOMEM);
327 for (i = 0; i < num_pages; i++) {
328 pages[i] = alloc_page(GFP_NOFS);
329 if (pages[i] == NULL) {
330 ceph_release_page_vector(pages, i);
331 return ERR_PTR(-ENOMEM);
332 }
333 }
334 return pages;
335}
336
337/*
338 * copy user data into a page vector
339 */
340static int copy_user_to_page_vector(struct page **pages,
341 const char __user *data,
342 loff_t off, size_t len)
343{
344 int i = 0;
345 int po = off & ~PAGE_CACHE_MASK;
346 int left = len;
347 int l, bad;
348
349 while (left > 0) {
350 l = min_t(int, PAGE_CACHE_SIZE-po, left);
351 bad = copy_from_user(page_address(pages[i]) + po, data, l);
352 if (bad == l)
353 return -EFAULT;
354 data += l - bad;
355 left -= l - bad;
356 po += l - bad;
357 if (po == PAGE_CACHE_SIZE) {
358 po = 0;
359 i++;
360 }
361 }
362 return len;
363}
364
365/*
366 * copy data from a page vector into a user buffer
367 */
368static int copy_page_vector_to_user(struct page **pages, char __user *data,
369 loff_t off, size_t len)
370{
371 int i = 0;
372 int po = off & ~PAGE_CACHE_MASK;
373 int left = len;
374 int l, bad;
375
376 while (left > 0) {
377 l = min_t(int, left, PAGE_CACHE_SIZE-po);
378 bad = copy_to_user(data, page_address(pages[i]) + po, l);
379 if (bad == l)
380 return -EFAULT;
381 data += l - bad;
382 left -= l - bad;
383 if (po) {
384 po += l - bad;
385 if (po == PAGE_CACHE_SIZE)
386 po = 0;
387 }
388 i++;
389 }
390 return len;
391}
392
393/*
394 * Zero an extent within a page vector. Offset is relative to the
395 * start of the first page.
396 */
397static void zero_page_vector_range(int off, int len, struct page **pages)
398{
399 int i = off >> PAGE_CACHE_SHIFT;
400
401 off &= ~PAGE_CACHE_MASK;
402
403 dout("zero_page_vector_page %u~%u\n", off, len);
404
405 /* leading partial page? */
406 if (off) {
407 int end = min((int)PAGE_CACHE_SIZE, off + len);
408 dout("zeroing %d %p head from %d\n", i, pages[i],
409 (int)off);
410 zero_user_segment(pages[i], off, end);
411 len -= (end - off);
412 i++;
413 }
414 while (len >= PAGE_CACHE_SIZE) {
415 dout("zeroing %d %p len=%d\n", i, pages[i], len);
416 zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
417 len -= PAGE_CACHE_SIZE;
418 i++;
419 }
420 /* trailing partial page? */
421 if (len) {
422 dout("zeroing %d %p tail to %d\n", i, pages[i], (int)len);
423 zero_user_segment(pages[i], 0, len);
424 }
425}
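/*
 * Worked example (hypothetical offsets, 4K pages): off=1000, len=8000
 * zeroes [1000,4096) of page 0, all of page 1, and [0,808) of page 2.
 */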
426
427
428/*
429 * Read a range of bytes striped over one or more objects, iterating
430 * over each object the range covers. (Not atomic, but good enough for now.)
431 *
432 * If we get a short result from the OSD, check against i_size; we need to
433 * only return a short read to the caller if we hit EOF.
434 */
435static int striped_read(struct inode *inode,
436 u64 off, u64 len,
437 struct page **pages, int num_pages,
438 int *checkeof)
439{
440 struct ceph_client *client = ceph_inode_to_client(inode);
441 struct ceph_inode_info *ci = ceph_inode(inode);
442 u64 pos, this_len;
443 int page_off = off & ~PAGE_CACHE_MASK; /* first byte's offset in page */
444 int left, pages_left;
445 int read;
446 struct page **page_pos;
447 int ret;
448 bool hit_stripe, was_short;
449
450 /*
451 * we may need to do multiple reads. not atomic, unfortunately.
452 */
453 pos = off;
454 left = len;
455 page_pos = pages;
456 pages_left = num_pages;
457 read = 0;
458
459more:
460 this_len = left;
461 ret = ceph_osdc_readpages(&client->osdc, ceph_vino(inode),
462 &ci->i_layout, pos, &this_len,
463 ci->i_truncate_seq,
464 ci->i_truncate_size,
465 page_pos, pages_left);
466 hit_stripe = this_len < left;
467 was_short = ret >= 0 && ret < this_len;
468 if (ret == -ENOENT)
469 ret = 0;
470 dout("striped_read %llu~%u (read %u) got %d%s%s\n", pos, left, read,
471 ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
472
473 if (ret > 0) {
474 int didpages =
475 ((pos & ~PAGE_CACHE_MASK) + ret) >> PAGE_CACHE_SHIFT;
476
477 if (read < pos - off) {
478 dout(" zero gap %llu to %llu\n", off + read, pos);
479 zero_page_vector_range(page_off + read,
480 pos - off - read, pages);
481 }
482 pos += ret;
483 read = pos - off;
484 left -= ret;
485 page_pos += didpages;
486 pages_left -= didpages;
487
488 /* hit stripe? */
489 if (left && hit_stripe)
490 goto more;
491 }
492
493 if (was_short) {
494 /* was original extent fully inside i_size? */
495 if (pos + left <= inode->i_size) {
496 dout("zero tail\n");
497 zero_page_vector_range(page_off + read, len - read,
498 pages);
499 read = len;
500 goto out;
501 }
502
503 /* check i_size */
504 *checkeof = 1;
505 }
506
507out:
508 if (ret >= 0)
509 ret = read;
510 dout("striped_read returns %d\n", ret);
511 return ret;
512}
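/*
 * The sync paths below size their page vectors with calc_pages_for()
 * (the real helper lives in the osd_client headers added by this
 * patch); a minimal sketch of the assumed calculation: the number of
 * PAGE_CACHE_SIZE pages spanned by [off, off+len).
 */
static inline int calc_pages_for_sketch(u64 off, u64 len)
{
	return ((off + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
		(off >> PAGE_CACHE_SHIFT);
}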
513
514/*
515 * Completely synchronous read and write methods. Direct from __user
516 * buffer to osd, or directly to user pages (if O_DIRECT).
517 *
518 * If the read spans object boundary, just do multiple reads.
519 */
520static ssize_t ceph_sync_read(struct file *file, char __user *data,
521 unsigned len, loff_t *poff, int *checkeof)
522{
523 struct inode *inode = file->f_dentry->d_inode;
524 struct page **pages;
525 u64 off = *poff;
526 int num_pages = calc_pages_for(off, len);
527 int ret;
528
529 dout("sync_read on file %p %llu~%u %s\n", file, off, len,
530 (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
531
532 if (file->f_flags & O_DIRECT) {
533 pages = get_direct_page_vector(data, num_pages, off, len);
534
535 /*
536 * flush any page cache pages in this range. this
537 * will make concurrent normal and O_DIRECT io slow,
538 * but it will at least behave sensibly when they are
539 * in sequence.
540 */
541 } else {
542 pages = alloc_page_vector(num_pages);
543 }
544 if (IS_ERR(pages))
545 return PTR_ERR(pages);
546
547 ret = filemap_write_and_wait(inode->i_mapping);
548 if (ret < 0)
549 goto done;
550
551 ret = striped_read(inode, off, len, pages, num_pages, checkeof);
552
553 if (ret >= 0 && (file->f_flags & O_DIRECT) == 0)
554 ret = copy_page_vector_to_user(pages, data, off, ret);
555 if (ret >= 0)
556 *poff = off + ret;
557
558done:
559 if (file->f_flags & O_DIRECT)
560 put_page_vector(pages, num_pages);
561 else
562 ceph_release_page_vector(pages, num_pages);
563 dout("sync_read result %d\n", ret);
564 return ret;
565}
566
567/*
568 * Write commit callback, called if we requested both an ACK and
569 * ONDISK commit reply from the OSD.
570 */
571static void sync_write_commit(struct ceph_osd_request *req,
572 struct ceph_msg *msg)
573{
574 struct ceph_inode_info *ci = ceph_inode(req->r_inode);
575
576 dout("sync_write_commit %p tid %llu\n", req, req->r_tid);
577 spin_lock(&ci->i_unsafe_lock);
578 list_del_init(&req->r_unsafe_item);
579 spin_unlock(&ci->i_unsafe_lock);
580 ceph_put_cap_refs(ci, CEPH_CAP_FILE_WR);
581}
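/*
 * Flow note: ceph_sync_write() below queues such a request on
 * i_unsafe_writes after starting it; this callback unhooks the
 * request and drops the FILE_WR cap ref once the OSD reports the
 * data safely on disk.
 */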
582
583/*
584 * Synchronous write, straight from __user pointer or user pages (if
585 * O_DIRECT).
586 *
587 * If write spans object boundary, just do multiple writes. (For a
588 * correct atomic write, we should e.g. take write locks on all
589 * objects, rollback on failure, etc.)
590 */
591static ssize_t ceph_sync_write(struct file *file, const char __user *data,
592 size_t left, loff_t *offset)
593{
594 struct inode *inode = file->f_dentry->d_inode;
595 struct ceph_inode_info *ci = ceph_inode(inode);
596 struct ceph_client *client = ceph_inode_to_client(inode);
597 struct ceph_osd_request *req;
598 struct page **pages;
599 int num_pages;
600 long long unsigned pos;
601 u64 len;
602 int written = 0;
603 int flags;
604 int do_sync = 0;
605 int check_caps = 0;
606 int ret;
607 struct timespec mtime = CURRENT_TIME;
608
609 if (ceph_snap(file->f_dentry->d_inode) != CEPH_NOSNAP)
610 return -EROFS;
611
612 dout("sync_write on file %p %lld~%u %s\n", file, *offset,
613 (unsigned)left, (file->f_flags & O_DIRECT) ? "O_DIRECT" : "");
614
615 if (file->f_flags & O_APPEND)
616 pos = i_size_read(inode);
617 else
618 pos = *offset;
619
620 ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + left);
621 if (ret < 0)
622 return ret;
623
624 ret = invalidate_inode_pages2_range(inode->i_mapping,
625 pos >> PAGE_CACHE_SHIFT,
626 (pos + left) >> PAGE_CACHE_SHIFT);
627 if (ret < 0)
628 dout("invalidate_inode_pages2_range returned %d\n", ret);
629
630 flags = CEPH_OSD_FLAG_ORDERSNAP |
631 CEPH_OSD_FLAG_ONDISK |
632 CEPH_OSD_FLAG_WRITE;
633 if ((file->f_flags & (O_SYNC|O_DIRECT)) == 0)
634 flags |= CEPH_OSD_FLAG_ACK;
635 else
636 do_sync = 1;
637
638 /*
639 * we may need to do multiple writes here if we span an object
640 * boundary. this isn't atomic, unfortunately. :(
641 */
642more:
643 len = left;
644 req = ceph_osdc_new_request(&client->osdc, &ci->i_layout,
645 ceph_vino(inode), pos, &len,
646 CEPH_OSD_OP_WRITE, flags,
647 ci->i_snap_realm->cached_context,
648 do_sync,
649 ci->i_truncate_seq, ci->i_truncate_size,
650 &mtime, false, 2);
651 if (IS_ERR(req))
652 return PTR_ERR(req);
653
654 num_pages = calc_pages_for(pos, len);
655
656 if (file->f_flags & O_DIRECT) {
657 pages = get_direct_page_vector(data, num_pages, pos, len);
658 if (IS_ERR(pages)) {
659 ret = PTR_ERR(pages);
660 goto out;
661 }
662
663 /*
664 * throw out any page cache pages in this range. this
665 * may block.
666 */
667 truncate_inode_pages_range(inode->i_mapping, pos, pos+len);
668 } else {
669 pages = alloc_page_vector(num_pages);
670 if (IS_ERR(pages)) {
671 ret = PTR_ERR(pages);
672 goto out;
673 }
674 ret = copy_user_to_page_vector(pages, data, pos, len);
675 if (ret < 0) {
676 ceph_release_page_vector(pages, num_pages);
677 goto out;
678 }
679
680 if ((file->f_flags & O_SYNC) == 0) {
681 /* get a second commit callback */
682 req->r_safe_callback = sync_write_commit;
683 req->r_own_pages = 1;
684 }
685 }
686 req->r_pages = pages;
687 req->r_num_pages = num_pages;
688 req->r_inode = inode;
689
690 ret = ceph_osdc_start_request(&client->osdc, req, false);
691 if (!ret) {
692 if (req->r_safe_callback) {
693 /*
694 * Add to inode unsafe list only after we
695 * start_request so that a tid has been assigned.
696 */
697 spin_lock(&ci->i_unsafe_lock);
698 list_add_tail(&req->r_unsafe_item, &ci->i_unsafe_writes);
699 spin_unlock(&ci->i_unsafe_lock);
700 ceph_get_cap_refs(ci, CEPH_CAP_FILE_WR);
701 }
702 ret = ceph_osdc_wait_request(&client->osdc, req);
703 }
704
705 if (file->f_flags & O_DIRECT)
706 put_page_vector(pages, num_pages);
707 else if (file->f_flags & O_SYNC)
708 ceph_release_page_vector(pages, num_pages);
709
710out:
711 ceph_osdc_put_request(req);
712 if (ret == 0) {
713 pos += len;
714 written += len;
715 left -= len;
716 if (left)
717 goto more;
718
719 ret = written;
720 *offset = pos;
721 if (pos > i_size_read(inode))
722 check_caps = ceph_inode_set_size(inode, pos);
723 if (check_caps)
724 ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY,
725 NULL);
726 }
727 return ret;
728}
729
730/*
731 * Wrap generic_file_aio_read with checks for cap bits on the inode.
732 * Atomically grab references, so that those bits are not released
733 * back to the MDS mid-read.
734 *
735 * Hmm, the sync read case isn't actually async... should it be?
736 */
737static ssize_t ceph_aio_read(struct kiocb *iocb, const struct iovec *iov,
738 unsigned long nr_segs, loff_t pos)
739{
740 struct file *filp = iocb->ki_filp;
741 loff_t *ppos = &iocb->ki_pos;
742 size_t len = iov->iov_len;
743 struct inode *inode = filp->f_dentry->d_inode;
744 struct ceph_inode_info *ci = ceph_inode(inode);
745 void *base = iov->iov_base;
746 ssize_t ret;
747 int got = 0;
748 int checkeof = 0, read = 0;
749
750 dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n",
751 inode, ceph_vinop(inode), pos, (unsigned)len, inode);
752again:
753 __ceph_do_pending_vmtruncate(inode);
754 ret = ceph_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
755 &got, -1);
756 if (ret < 0)
757 goto out;
758 dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
759 inode, ceph_vinop(inode), pos, (unsigned)len,
760 ceph_cap_string(got));
761
762 if ((got & CEPH_CAP_FILE_CACHE) == 0 ||
763 (iocb->ki_filp->f_flags & O_DIRECT) ||
764 (inode->i_sb->s_flags & MS_SYNCHRONOUS))
765 /* hmm, this isn't really async... */
766 ret = ceph_sync_read(filp, base, len, ppos, &checkeof);
767 else
768 ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
769
770out:
771 dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
772 inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
773 ceph_put_cap_refs(ci, got);
774
775 if (checkeof && ret >= 0) {
776 int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
777
778 /* hit EOF or hole? */
779 if (statret == 0 && *ppos < inode->i_size) {
780 dout("aio_read sync_read hit hole, reading more\n");
781 read += ret;
782 base += ret;
783 len -= ret;
784 checkeof = 0;
785 goto again;
786 }
787 }
788 if (ret >= 0)
789 ret += read;
790
791 return ret;
792}
793
794/*
795 * Take cap references to avoid releasing caps to MDS mid-write.
796 *
797 * If we are synchronous, and write with an old snap context, the OSD
798 * may return EOLDSNAPC. In that case, retry the write, _after_
799 * dropping our cap refs and allowing the pending snap to logically
800 * complete _before_ this write occurs.
801 *
802 * If we are near ENOSPC, write synchronously.
803 */
804static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
805 unsigned long nr_segs, loff_t pos)
806{
807 struct file *file = iocb->ki_filp;
808 struct inode *inode = file->f_dentry->d_inode;
809 struct ceph_inode_info *ci = ceph_inode(inode);
810 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
811 loff_t endoff = pos + iov->iov_len;
812 int got = 0;
813 int ret, err;
814
815 if (ceph_snap(inode) != CEPH_NOSNAP)
816 return -EROFS;
817
818retry_snap:
819 if (ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL))
820 return -ENOSPC;
821 __ceph_do_pending_vmtruncate(inode);
822 dout("aio_write %p %llx.%llx %llu~%u getting caps. i_size %llu\n",
823 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
824 inode->i_size);
825 ret = ceph_get_caps(ci, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER,
826 &got, endoff);
827 if (ret < 0)
828 goto out;
829
830 dout("aio_write %p %llx.%llx %llu~%u got cap refs on %s\n",
831 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
832 ceph_cap_string(got));
833
834 if ((got & CEPH_CAP_FILE_BUFFER) == 0 ||
835 (iocb->ki_filp->f_flags & O_DIRECT) ||
836 (inode->i_sb->s_flags & MS_SYNCHRONOUS)) {
837 ret = ceph_sync_write(file, iov->iov_base, iov->iov_len,
838 &iocb->ki_pos);
839 } else {
840 ret = generic_file_aio_write(iocb, iov, nr_segs, pos);
841
842 if ((ret >= 0 || ret == -EIOCBQUEUED) &&
843 ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host)
844 || ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_NEARFULL))) {
845 err = vfs_fsync_range(file, file->f_path.dentry,
846 pos, pos + ret - 1, 1);
847 if (err < 0)
848 ret = err;
849 }
850 }
851 if (ret >= 0) {
852 spin_lock(&inode->i_lock);
853 __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR);
854 spin_unlock(&inode->i_lock);
855 }
856
857out:
858 dout("aio_write %p %llx.%llx %llu~%u dropping cap refs on %s\n",
859 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len,
860 ceph_cap_string(got));
861 ceph_put_cap_refs(ci, got);
862
863 if (ret == -EOLDSNAPC) {
864 dout("aio_write %p %llx.%llx %llu~%u got EOLDSNAPC, retrying\n",
865 inode, ceph_vinop(inode), pos, (unsigned)iov->iov_len);
866 goto retry_snap;
867 }
868
869 return ret;
870}
871
872/*
873 * llseek. be sure to verify file size on SEEK_END.
874 */
875static loff_t ceph_llseek(struct file *file, loff_t offset, int origin)
876{
877 struct inode *inode = file->f_mapping->host;
878 int ret;
879
880 mutex_lock(&inode->i_mutex);
881 __ceph_do_pending_vmtruncate(inode);
882 switch (origin) {
883 case SEEK_END:
884 ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE);
885 if (ret < 0) {
886 offset = ret;
887 goto out;
888 }
889 offset += inode->i_size;
890 break;
891 case SEEK_CUR:
892 /*
893 * Here we special-case the lseek(fd, 0, SEEK_CUR)
894 * position-querying operation. Avoid rewriting the "same"
895 * f_pos value back to the file because a concurrent read(),
896 * write() or lseek() might have altered it
897 */
898 if (offset == 0) {
899 offset = file->f_pos;
900 goto out;
901 }
902 offset += file->f_pos;
903 break;
904 }
905
906 if (offset < 0 || offset > inode->i_sb->s_maxbytes) {
907 offset = -EINVAL;
908 goto out;
909 }
910
911 /* Special lock needed here? */
912 if (offset != file->f_pos) {
913 file->f_pos = offset;
914 file->f_version = 0;
915 }
916
917out:
918 mutex_unlock(&inode->i_mutex);
919 return offset;
920}
921
922const struct file_operations ceph_file_fops = {
923 .open = ceph_open,
924 .release = ceph_release,
925 .llseek = ceph_llseek,
926 .read = do_sync_read,
927 .write = do_sync_write,
928 .aio_read = ceph_aio_read,
929 .aio_write = ceph_aio_write,
930 .mmap = ceph_mmap,
931 .fsync = ceph_fsync,
932 .splice_read = generic_file_splice_read,
933 .splice_write = generic_file_splice_write,
934 .unlocked_ioctl = ceph_ioctl,
935 .compat_ioctl = ceph_ioctl,
936};
937
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
new file mode 100644
index 000000000000..7abe1aed819b
--- /dev/null
+++ b/fs/ceph/inode.c
@@ -0,0 +1,1750 @@
1#include "ceph_debug.h"
2
3#include <linux/module.h>
4#include <linux/fs.h>
5#include <linux/smp_lock.h>
6#include <linux/slab.h>
7#include <linux/string.h>
8#include <linux/uaccess.h>
9#include <linux/kernel.h>
10#include <linux/namei.h>
11#include <linux/writeback.h>
12#include <linux/vmalloc.h>
13#include <linux/pagevec.h>
14
15#include "super.h"
16#include "decode.h"
17
18/*
19 * Ceph inode operations
20 *
21 * Implement basic inode helpers (get, alloc) and inode ops (getattr,
22 * setattr, etc.), xattr helpers, and helpers for assimilating
23 * metadata returned by the MDS into our cache.
24 *
25 * Also define helpers for doing asynchronous writeback, invalidation,
26 * and truncation for the benefit of those who can't afford to block
27 * (typically because they are in the message handler path).
28 */
29
30static const struct inode_operations ceph_symlink_iops;
31
32static void ceph_invalidate_work(struct work_struct *work);
33static void ceph_writeback_work(struct work_struct *work);
34static void ceph_vmtruncate_work(struct work_struct *work);
35
36/*
37 * find or create an inode, given the ceph ino number
38 */
39struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
40{
41 struct inode *inode;
42 ino_t t = ceph_vino_to_ino(vino);
43
44 inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
45 if (inode == NULL)
46 return ERR_PTR(-ENOMEM);
47 if (inode->i_state & I_NEW) {
48 dout("get_inode created new inode %p %llx.%llx ino %llx\n",
49 inode, ceph_vinop(inode), (u64)inode->i_ino);
50 unlock_new_inode(inode);
51 }
52
53 dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
54 vino.snap, inode);
55 return inode;
56}
57
58/*
59 * get/construct the snapdir inode for a given directory
60 */
61struct inode *ceph_get_snapdir(struct inode *parent)
62{
63 struct ceph_vino vino = {
64 .ino = ceph_ino(parent),
65 .snap = CEPH_SNAPDIR,
66 };
67 struct inode *inode = ceph_get_inode(parent->i_sb, vino);
68 struct ceph_inode_info *ci = ceph_inode(inode);
69
70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode))
72 return ERR_CAST(inode);
73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid;
76 inode->i_op = &ceph_dir_iops;
77 inode->i_fop = &ceph_dir_fops;
78 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
79 ci->i_rbytes = 0;
80 return inode;
81}
82
83const struct inode_operations ceph_file_iops = {
84 .permission = ceph_permission,
85 .setattr = ceph_setattr,
86 .getattr = ceph_getattr,
87 .setxattr = ceph_setxattr,
88 .getxattr = ceph_getxattr,
89 .listxattr = ceph_listxattr,
90 .removexattr = ceph_removexattr,
91};
92
93
94/*
95 * We use a 'frag tree' to keep track of the MDS's directory fragments
96 * for a given inode (usually there is just a single fragment). We
97 * need to know when a child frag is delegated to a new MDS, or when
98 * it is flagged as replicated, so we can direct our requests
99 * accordingly.
100 */
101
102/*
103 * find/create a frag in the tree
104 */
105static struct ceph_inode_frag *__get_or_create_frag(struct ceph_inode_info *ci,
106 u32 f)
107{
108 struct rb_node **p;
109 struct rb_node *parent = NULL;
110 struct ceph_inode_frag *frag;
111 int c;
112
113 p = &ci->i_fragtree.rb_node;
114 while (*p) {
115 parent = *p;
116 frag = rb_entry(parent, struct ceph_inode_frag, node);
117 c = ceph_frag_compare(f, frag->frag);
118 if (c < 0)
119 p = &(*p)->rb_left;
120 else if (c > 0)
121 p = &(*p)->rb_right;
122 else
123 return frag;
124 }
125
126 frag = kmalloc(sizeof(*frag), GFP_NOFS);
127 if (!frag) {
128 pr_err("__get_or_create_frag ENOMEM on %p %llx.%llx "
129 "frag %x\n", &ci->vfs_inode,
130 ceph_vinop(&ci->vfs_inode), f);
131 return ERR_PTR(-ENOMEM);
132 }
133 frag->frag = f;
134 frag->split_by = 0;
135 frag->mds = -1;
136 frag->ndist = 0;
137
138 rb_link_node(&frag->node, parent, p);
139 rb_insert_color(&frag->node, &ci->i_fragtree);
140
141 dout("get_or_create_frag added %llx.%llx frag %x\n",
142 ceph_vinop(&ci->vfs_inode), f);
143 return frag;
144}
145
146/*
147 * find a specific frag @f
148 */
149struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci, u32 f)
150{
151 struct rb_node *n = ci->i_fragtree.rb_node;
152
153 while (n) {
154 struct ceph_inode_frag *frag =
155 rb_entry(n, struct ceph_inode_frag, node);
156 int c = ceph_frag_compare(f, frag->frag);
157 if (c < 0)
158 n = n->rb_left;
159 else if (c > 0)
160 n = n->rb_right;
161 else
162 return frag;
163 }
164 return NULL;
165}
166
167/*
168 * Choose frag containing the given value @v. If @pfrag is
169 * specified, copy the frag delegation info to the caller if
170 * it is present.
171 */
172u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
173 struct ceph_inode_frag *pfrag,
174 int *found)
175{
176 u32 t = ceph_frag_make(0, 0);
177 struct ceph_inode_frag *frag;
178 unsigned nway, i;
179 u32 n;
180
181 if (found)
182 *found = 0;
183
184 mutex_lock(&ci->i_fragtree_mutex);
185 while (1) {
186 WARN_ON(!ceph_frag_contains_value(t, v));
187 frag = __ceph_find_frag(ci, t);
188 if (!frag)
189 break; /* t is a leaf */
190 if (frag->split_by == 0) {
191 if (pfrag)
192 memcpy(pfrag, frag, sizeof(*pfrag));
193 if (found)
194 *found = 1;
195 break;
196 }
197
198 /* choose child */
199 nway = 1 << frag->split_by;
200 dout("choose_frag(%x) %x splits by %d (%d ways)\n", v, t,
201 frag->split_by, nway);
202 for (i = 0; i < nway; i++) {
203 n = ceph_frag_make_child(t, frag->split_by, i);
204 if (ceph_frag_contains_value(n, v)) {
205 t = n;
206 break;
207 }
208 }
209 BUG_ON(i == nway);
210 }
211 dout("choose_frag(%x) = %x\n", v, t);
212
213 mutex_unlock(&ci->i_fragtree_mutex);
214 return t;
215}
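
A hypothetical caller, routing a directory operation by dentry name under the assumption that the directory hashes names with ceph_str_hash() (see ceph_hash.h); the real MDS-selection logic lives in mds_client.c:

	static int pick_mds_sketch(struct ceph_inode_info *ci,
				   const char *name, int len)
	{
		struct ceph_inode_frag frag;
		int found;
		u32 h = ceph_str_hash(CEPH_STR_HASH_RJENKINS, name, len);

		ceph_choose_frag(ci, h, &frag, &found);
		if (found && frag.ndist > 0)
			return frag.dist[0];	/* frag is replicated */
		if (found && frag.mds >= 0)
			return frag.mds;	/* explicit delegation */
		return -1;			/* fall back to the auth MDS */
	}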
216
217/*
218 * Process dirfrag (delegation) info from the mds. Include leaf
219 * fragment in tree ONLY if ndist > 0. Otherwise, only
220 * branches/splits are included in i_fragtree.
221 */
222static int ceph_fill_dirfrag(struct inode *inode,
223 struct ceph_mds_reply_dirfrag *dirinfo)
224{
225 struct ceph_inode_info *ci = ceph_inode(inode);
226 struct ceph_inode_frag *frag;
227 u32 id = le32_to_cpu(dirinfo->frag);
228 int mds = le32_to_cpu(dirinfo->auth);
229 int ndist = le32_to_cpu(dirinfo->ndist);
230 int i;
231 int err = 0;
232
233 mutex_lock(&ci->i_fragtree_mutex);
234 if (ndist == 0) {
235 /* no delegation info needed. */
236 frag = __ceph_find_frag(ci, id);
237 if (!frag)
238 goto out;
239 if (frag->split_by == 0) {
240 /* tree leaf, remove */
241 dout("fill_dirfrag removed %llx.%llx frag %x"
242 " (no ref)\n", ceph_vinop(inode), id);
243 rb_erase(&frag->node, &ci->i_fragtree);
244 kfree(frag);
245 } else {
246 /* tree branch, keep and clear */
247 dout("fill_dirfrag cleared %llx.%llx frag %x"
248 " referral\n", ceph_vinop(inode), id);
249 frag->mds = -1;
250 frag->ndist = 0;
251 }
252 goto out;
253 }
254
255
256 /* find/add this frag to store mds delegation info */
257 frag = __get_or_create_frag(ci, id);
258 if (IS_ERR(frag)) {
259 /* this is not the end of the world; we can continue
260 with bad/inaccurate delegation info */
261 pr_err("fill_dirfrag ENOMEM on mds ref %llx.%llx fg %x\n",
262 ceph_vinop(inode), le32_to_cpu(dirinfo->frag));
263 err = -ENOMEM;
264 goto out;
265 }
266
267 frag->mds = mds;
268 frag->ndist = min_t(u32, ndist, CEPH_MAX_DIRFRAG_REP);
269 for (i = 0; i < frag->ndist; i++)
270 frag->dist[i] = le32_to_cpu(dirinfo->dist[i]);
271 dout("fill_dirfrag %llx.%llx frag %x ndist=%d\n",
272 ceph_vinop(inode), frag->frag, frag->ndist);
273
274out:
275 mutex_unlock(&ci->i_fragtree_mutex);
276 return err;
277}
278
279
280/*
281 * initialize a newly allocated inode.
282 */
283struct inode *ceph_alloc_inode(struct super_block *sb)
284{
285 struct ceph_inode_info *ci;
286 int i;
287
288 ci = kmem_cache_alloc(ceph_inode_cachep, GFP_NOFS);
289 if (!ci)
290 return NULL;
291
292 dout("alloc_inode %p\n", &ci->vfs_inode);
293
294 ci->i_version = 0;
295 ci->i_time_warp_seq = 0;
296 ci->i_ceph_flags = 0;
297 ci->i_release_count = 0;
298 ci->i_symlink = NULL;
299
300 ci->i_fragtree = RB_ROOT;
301 mutex_init(&ci->i_fragtree_mutex);
302
303 ci->i_xattrs.blob = NULL;
304 ci->i_xattrs.prealloc_blob = NULL;
305 ci->i_xattrs.dirty = false;
306 ci->i_xattrs.index = RB_ROOT;
307 ci->i_xattrs.count = 0;
308 ci->i_xattrs.names_size = 0;
309 ci->i_xattrs.vals_size = 0;
310 ci->i_xattrs.version = 0;
311 ci->i_xattrs.index_version = 0;
312
313 ci->i_caps = RB_ROOT;
314 ci->i_auth_cap = NULL;
315 ci->i_dirty_caps = 0;
316 ci->i_flushing_caps = 0;
317 INIT_LIST_HEAD(&ci->i_dirty_item);
318 INIT_LIST_HEAD(&ci->i_flushing_item);
319 ci->i_cap_flush_seq = 0;
320 ci->i_cap_flush_last_tid = 0;
321 memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
322 init_waitqueue_head(&ci->i_cap_wq);
323 ci->i_hold_caps_min = 0;
324 ci->i_hold_caps_max = 0;
325 INIT_LIST_HEAD(&ci->i_cap_delay_list);
326 ci->i_cap_exporting_mds = 0;
327 ci->i_cap_exporting_mseq = 0;
328 ci->i_cap_exporting_issued = 0;
329 INIT_LIST_HEAD(&ci->i_cap_snaps);
330 ci->i_head_snapc = NULL;
331 ci->i_snap_caps = 0;
332
333 for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
334 ci->i_nr_by_mode[i] = 0;
335
336 ci->i_truncate_seq = 0;
337 ci->i_truncate_size = 0;
338 ci->i_truncate_pending = 0;
339
340 ci->i_max_size = 0;
341 ci->i_reported_size = 0;
342 ci->i_wanted_max_size = 0;
343 ci->i_requested_max_size = 0;
344
345 ci->i_pin_ref = 0;
346 ci->i_rd_ref = 0;
347 ci->i_rdcache_ref = 0;
348 ci->i_wr_ref = 0;
349 ci->i_wrbuffer_ref = 0;
350 ci->i_wrbuffer_ref_head = 0;
351 ci->i_shared_gen = 0;
352 ci->i_rdcache_gen = 0;
353 ci->i_rdcache_revoking = 0;
354
355 INIT_LIST_HEAD(&ci->i_unsafe_writes);
356 INIT_LIST_HEAD(&ci->i_unsafe_dirops);
357 spin_lock_init(&ci->i_unsafe_lock);
358
359 ci->i_snap_realm = NULL;
360 INIT_LIST_HEAD(&ci->i_snap_realm_item);
361 INIT_LIST_HEAD(&ci->i_snap_flush_item);
362
363 INIT_WORK(&ci->i_wb_work, ceph_writeback_work);
364 INIT_WORK(&ci->i_pg_inv_work, ceph_invalidate_work);
365
366 INIT_WORK(&ci->i_vmtruncate_work, ceph_vmtruncate_work);
367
368 return &ci->vfs_inode;
369}
370
371void ceph_destroy_inode(struct inode *inode)
372{
373 struct ceph_inode_info *ci = ceph_inode(inode);
374 struct ceph_inode_frag *frag;
375 struct rb_node *n;
376
377 dout("destroy_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode));
378
379 ceph_queue_caps_release(inode);
380
381 kfree(ci->i_symlink);
382 while ((n = rb_first(&ci->i_fragtree)) != NULL) {
383 frag = rb_entry(n, struct ceph_inode_frag, node);
384 rb_erase(n, &ci->i_fragtree);
385 kfree(frag);
386 }
387
388 __ceph_destroy_xattrs(ci);
389 if (ci->i_xattrs.blob)
390 ceph_buffer_put(ci->i_xattrs.blob);
391 if (ci->i_xattrs.prealloc_blob)
392 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
393
394 kmem_cache_free(ceph_inode_cachep, ci);
395}
396
397
398/*
399 * Helpers to fill in size, ctime, mtime, and atime. We have to be
400 * careful because either the client or MDS may have more up to date
401 * info, depending on which capabilities are held, and whether
402 * time_warp_seq or truncate_seq have increased. (Ordinarily, mtime
403 * and size are monotonically increasing, except when utimes() or
404 * truncate() increments the corresponding _seq values.)
405 */
406int ceph_fill_file_size(struct inode *inode, int issued,
407 u32 truncate_seq, u64 truncate_size, u64 size)
408{
409 struct ceph_inode_info *ci = ceph_inode(inode);
410 int queue_trunc = 0;
411
412 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
413 (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
414 dout("size %lld -> %llu\n", inode->i_size, size);
415 inode->i_size = size;
416 inode->i_blocks = (size + (1<<9) - 1) >> 9;
417 ci->i_reported_size = size;
418 if (truncate_seq != ci->i_truncate_seq) {
419 dout("truncate_seq %u -> %u\n",
420 ci->i_truncate_seq, truncate_seq);
421 ci->i_truncate_seq = truncate_seq;
422 /*
423			 * If we hold caps implying cached file data, or if
424			 * the file is mmapped or still wanted (i.e. open),
425			 * queue an async truncate so stale pages are
426			 * dropped under the new truncate_seq.
427 */
428 if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_RD|
429 CEPH_CAP_FILE_WR|CEPH_CAP_FILE_BUFFER|
430 CEPH_CAP_FILE_EXCL)) ||
431 mapping_mapped(inode->i_mapping) ||
432 __ceph_caps_file_wanted(ci)) {
433 ci->i_truncate_pending++;
434 queue_trunc = 1;
435 }
436 }
437 }
438 if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) >= 0 &&
439 ci->i_truncate_size != truncate_size) {
440 dout("truncate_size %lld -> %llu\n", ci->i_truncate_size,
441 truncate_size);
442 ci->i_truncate_size = truncate_size;
443 }
444 return queue_trunc;
445}
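
The ordering tests above survive u32 wraparound because ceph_seq_cmp() compares sequence numbers as a signed 32-bit difference; a sketch of that comparison (the real helper lives in the ceph headers):

	static inline int seq_cmp_sketch(u32 a, u32 b)
	{
		return (s32)a - (s32)b;		/* > 0 iff a is newer than b */
	}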
446
447void ceph_fill_file_time(struct inode *inode, int issued,
448 u64 time_warp_seq, struct timespec *ctime,
449 struct timespec *mtime, struct timespec *atime)
450{
451 struct ceph_inode_info *ci = ceph_inode(inode);
452 int warn = 0;
453
454 if (issued & (CEPH_CAP_FILE_EXCL|
455 CEPH_CAP_FILE_WR|
456 CEPH_CAP_FILE_BUFFER)) {
457 if (timespec_compare(ctime, &inode->i_ctime) > 0) {
458 dout("ctime %ld.%09ld -> %ld.%09ld inc w/ cap\n",
459 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
460 ctime->tv_sec, ctime->tv_nsec);
461 inode->i_ctime = *ctime;
462 }
463 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) > 0) {
464 /* the MDS did a utimes() */
465 dout("mtime %ld.%09ld -> %ld.%09ld "
466 "tw %d -> %d\n",
467 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
468 mtime->tv_sec, mtime->tv_nsec,
469 ci->i_time_warp_seq, (int)time_warp_seq);
470
471 inode->i_mtime = *mtime;
472 inode->i_atime = *atime;
473 ci->i_time_warp_seq = time_warp_seq;
474 } else if (time_warp_seq == ci->i_time_warp_seq) {
475 /* nobody did utimes(); take the max */
476 if (timespec_compare(mtime, &inode->i_mtime) > 0) {
477 dout("mtime %ld.%09ld -> %ld.%09ld inc\n",
478 inode->i_mtime.tv_sec,
479 inode->i_mtime.tv_nsec,
480 mtime->tv_sec, mtime->tv_nsec);
481 inode->i_mtime = *mtime;
482 }
483 if (timespec_compare(atime, &inode->i_atime) > 0) {
484 dout("atime %ld.%09ld -> %ld.%09ld inc\n",
485 inode->i_atime.tv_sec,
486 inode->i_atime.tv_nsec,
487 atime->tv_sec, atime->tv_nsec);
488 inode->i_atime = *atime;
489 }
490 } else if (issued & CEPH_CAP_FILE_EXCL) {
491 /* we did a utimes(); ignore mds values */
492 } else {
493 warn = 1;
494 }
495 } else {
496 /* we have no write caps; whatever the MDS says is true */
497 if (ceph_seq_cmp(time_warp_seq, ci->i_time_warp_seq) >= 0) {
498 inode->i_ctime = *ctime;
499 inode->i_mtime = *mtime;
500 inode->i_atime = *atime;
501 ci->i_time_warp_seq = time_warp_seq;
502 } else {
503 warn = 1;
504 }
505 }
506 if (warn) /* time_warp_seq shouldn't go backwards */
507 dout("%p mds time_warp_seq %llu < %u\n",
508 inode, time_warp_seq, ci->i_time_warp_seq);
509}
510
511/*
512 * Populate an inode based on info from mds. May be called on new or
513 * existing inodes.
514 */
515static int fill_inode(struct inode *inode,
516 struct ceph_mds_reply_info_in *iinfo,
517 struct ceph_mds_reply_dirfrag *dirinfo,
518 struct ceph_mds_session *session,
519 unsigned long ttl_from, int cap_fmode,
520 struct ceph_cap_reservation *caps_reservation)
521{
522 struct ceph_mds_reply_inode *info = iinfo->in;
523 struct ceph_inode_info *ci = ceph_inode(inode);
524 int i;
525 int issued, implemented;
526 struct timespec mtime, atime, ctime;
527 u32 nsplits;
528 struct ceph_buffer *xattr_blob = NULL;
529 int err = 0;
530 int queue_trunc = 0;
531
532 dout("fill_inode %p ino %llx.%llx v %llu had %llu\n",
533 inode, ceph_vinop(inode), le64_to_cpu(info->version),
534 ci->i_version);
535
536 /*
537 * prealloc xattr data, if it looks like we'll need it. only
538 * if len > 4 (meaning there are actually xattrs; the first 4
539 * bytes are the xattr count).
540 */
541 if (iinfo->xattr_len > 4) {
542 xattr_blob = ceph_buffer_new(iinfo->xattr_len, GFP_NOFS);
543 if (!xattr_blob)
544 pr_err("fill_inode ENOMEM xattr blob %d bytes\n",
545 iinfo->xattr_len);
546 }
547
548 spin_lock(&inode->i_lock);
549
550 /*
551 * provided version will be odd if inode value is projected,
552	 * even if stable. skip the update if we have newer info
553	 * (e.g., due to inode info racing from multiple MDSs), or if
554 * we are getting projected (unstable) inode info.
555 */
556 if (le64_to_cpu(info->version) > 0 &&
557 (ci->i_version & ~1) > le64_to_cpu(info->version))
558 goto no_change;
559
560 issued = __ceph_caps_issued(ci, &implemented);
561 issued |= implemented | __ceph_caps_dirty(ci);
562
563 /* update inode */
564 ci->i_version = le64_to_cpu(info->version);
565 inode->i_version++;
566 inode->i_rdev = le32_to_cpu(info->rdev);
567
568 if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
569 inode->i_mode = le32_to_cpu(info->mode);
570 inode->i_uid = le32_to_cpu(info->uid);
571 inode->i_gid = le32_to_cpu(info->gid);
572 dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
573 inode->i_uid, inode->i_gid);
574 }
575
576 if ((issued & CEPH_CAP_LINK_EXCL) == 0)
577 inode->i_nlink = le32_to_cpu(info->nlink);
578
579 /* be careful with mtime, atime, size */
580 ceph_decode_timespec(&atime, &info->atime);
581 ceph_decode_timespec(&mtime, &info->mtime);
582 ceph_decode_timespec(&ctime, &info->ctime);
583 queue_trunc = ceph_fill_file_size(inode, issued,
584 le32_to_cpu(info->truncate_seq),
585 le64_to_cpu(info->truncate_size),
586 le64_to_cpu(info->size));
587 ceph_fill_file_time(inode, issued,
588 le32_to_cpu(info->time_warp_seq),
589 &ctime, &mtime, &atime);
590
591 ci->i_max_size = le64_to_cpu(info->max_size);
592 ci->i_layout = info->layout;
593 inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1;
594
595 /* xattrs */
596 /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */
597 if ((issued & CEPH_CAP_XATTR_EXCL) == 0 &&
598 le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) {
599 if (ci->i_xattrs.blob)
600 ceph_buffer_put(ci->i_xattrs.blob);
601 ci->i_xattrs.blob = xattr_blob;
602 if (xattr_blob)
603 memcpy(ci->i_xattrs.blob->vec.iov_base,
604 iinfo->xattr_data, iinfo->xattr_len);
605 ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
606 }
607
608 inode->i_mapping->a_ops = &ceph_aops;
609 inode->i_mapping->backing_dev_info =
610 &ceph_client(inode->i_sb)->backing_dev_info;
611
612 switch (inode->i_mode & S_IFMT) {
613 case S_IFIFO:
614 case S_IFBLK:
615 case S_IFCHR:
616 case S_IFSOCK:
617 init_special_inode(inode, inode->i_mode, inode->i_rdev);
618 inode->i_op = &ceph_file_iops;
619 break;
620 case S_IFREG:
621 inode->i_op = &ceph_file_iops;
622 inode->i_fop = &ceph_file_fops;
623 break;
624 case S_IFLNK:
625 inode->i_op = &ceph_symlink_iops;
626 if (!ci->i_symlink) {
627 int symlen = iinfo->symlink_len;
628 char *sym;
629
630 BUG_ON(symlen != inode->i_size);
631 spin_unlock(&inode->i_lock);
632
633 err = -ENOMEM;
634 sym = kmalloc(symlen+1, GFP_NOFS);
635 if (!sym)
636 goto out;
637 memcpy(sym, iinfo->symlink, symlen);
638 sym[symlen] = 0;
639
640 spin_lock(&inode->i_lock);
641 if (!ci->i_symlink)
642 ci->i_symlink = sym;
643 else
644 kfree(sym); /* lost a race */
645 }
646 break;
647 case S_IFDIR:
648 inode->i_op = &ceph_dir_iops;
649 inode->i_fop = &ceph_dir_fops;
650
651 ci->i_files = le64_to_cpu(info->files);
652 ci->i_subdirs = le64_to_cpu(info->subdirs);
653 ci->i_rbytes = le64_to_cpu(info->rbytes);
654 ci->i_rfiles = le64_to_cpu(info->rfiles);
655 ci->i_rsubdirs = le64_to_cpu(info->rsubdirs);
656 ceph_decode_timespec(&ci->i_rctime, &info->rctime);
657
658 /* set dir completion flag? */
659 if (ci->i_files == 0 && ci->i_subdirs == 0 &&
660 ceph_snap(inode) == CEPH_NOSNAP &&
661 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
662 dout(" marking %p complete (empty)\n", inode);
663 ci->i_ceph_flags |= CEPH_I_COMPLETE;
664 ci->i_max_offset = 2;
665 }
666
667 /* it may be better to set st_size in getattr instead? */
668 if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
669 inode->i_size = ci->i_rbytes;
670 break;
671 default:
672 pr_err("fill_inode %llx.%llx BAD mode 0%o\n",
673 ceph_vinop(inode), inode->i_mode);
674 }
675
676no_change:
677 spin_unlock(&inode->i_lock);
678
679 /* queue truncate if we saw i_size decrease */
680 if (queue_trunc)
681 ceph_queue_vmtruncate(inode);
682
683 /* populate frag tree */
684 /* FIXME: move me up, if/when version reflects fragtree changes */
685 nsplits = le32_to_cpu(info->fragtree.nsplits);
686 mutex_lock(&ci->i_fragtree_mutex);
687 for (i = 0; i < nsplits; i++) {
688 u32 id = le32_to_cpu(info->fragtree.splits[i].frag);
689 struct ceph_inode_frag *frag = __get_or_create_frag(ci, id);
690
691 if (IS_ERR(frag))
692 continue;
693 frag->split_by = le32_to_cpu(info->fragtree.splits[i].by);
694 dout(" frag %x split by %d\n", frag->frag, frag->split_by);
695 }
696 mutex_unlock(&ci->i_fragtree_mutex);
697
698 /* were we issued a capability? */
699 if (info->cap.caps) {
700 if (ceph_snap(inode) == CEPH_NOSNAP) {
701 ceph_add_cap(inode, session,
702 le64_to_cpu(info->cap.cap_id),
703 cap_fmode,
704 le32_to_cpu(info->cap.caps),
705 le32_to_cpu(info->cap.wanted),
706 le32_to_cpu(info->cap.seq),
707 le32_to_cpu(info->cap.mseq),
708 le64_to_cpu(info->cap.realm),
709 info->cap.flags,
710 caps_reservation);
711 } else {
712 spin_lock(&inode->i_lock);
713 dout(" %p got snap_caps %s\n", inode,
714 ceph_cap_string(le32_to_cpu(info->cap.caps)));
715 ci->i_snap_caps |= le32_to_cpu(info->cap.caps);
716 if (cap_fmode >= 0)
717 __ceph_get_fmode(ci, cap_fmode);
718 spin_unlock(&inode->i_lock);
719 }
720 }
721
722 /* update delegation info? */
723 if (dirinfo)
724 ceph_fill_dirfrag(inode, dirinfo);
725
726 err = 0;
727
728out:
729 if (xattr_blob)
730 ceph_buffer_put(xattr_blob);
731 return err;
732}
733
734/*
735 * caller should hold session s_mutex.
736 */
737static void update_dentry_lease(struct dentry *dentry,
738 struct ceph_mds_reply_lease *lease,
739 struct ceph_mds_session *session,
740 unsigned long from_time)
741{
742 struct ceph_dentry_info *di = ceph_dentry(dentry);
743	unsigned long duration = le32_to_cpu(lease->duration_ms);
744	unsigned long ttl = from_time + (duration * HZ) / 1000;
745	unsigned long half_ttl = from_time + (duration * HZ / 2) / 1000;
746 struct inode *dir;
747
748 /* only track leases on regular dentries */
749 if (dentry->d_op != &ceph_dentry_ops)
750 return;
751
752 spin_lock(&dentry->d_lock);
753 dout("update_dentry_lease %p mask %d duration %lu ms ttl %lu\n",
754 dentry, le16_to_cpu(lease->mask), duration, ttl);
755
756 /* make lease_rdcache_gen match directory */
757 dir = dentry->d_parent->d_inode;
758 di->lease_shared_gen = ceph_inode(dir)->i_shared_gen;
759
760 if (lease->mask == 0)
761 goto out_unlock;
762
763 if (di->lease_gen == session->s_cap_gen &&
764 time_before(ttl, dentry->d_time))
765 goto out_unlock; /* we already have a newer lease. */
766
767 if (di->lease_session && di->lease_session != session)
768 goto out_unlock;
769
770 ceph_dentry_lru_touch(dentry);
771
772 if (!di->lease_session)
773 di->lease_session = ceph_get_mds_session(session);
774 di->lease_gen = session->s_cap_gen;
775 di->lease_seq = le32_to_cpu(lease->seq);
776 di->lease_renew_after = half_ttl;
777 di->lease_renew_from = 0;
778 dentry->d_time = ttl;
779out_unlock:
780 spin_unlock(&dentry->d_lock);
781 return;
782}
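
To put hypothetical numbers on the ttl arithmetic above: with HZ = 250 and a 30000 ms lease stamped at from_time = j, ttl works out to j + 7500 jiffies and half_ttl to j + 3750 jiffies, so lease renewal kicks in halfway through the lease's lifetime.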
783
784/*
785 * splice a dentry to an inode.
786 * caller must hold directory i_mutex for this to be safe.
787 *
788 * we will only rehash the resulting dentry if @prehash is
789 * true; @prehash will be set to false (for the benefit of
790 * the caller) if we fail.
791 */
792static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
793 bool *prehash)
794{
795 struct dentry *realdn;
796
797 /* dn must be unhashed */
798 if (!d_unhashed(dn))
799 d_drop(dn);
800 realdn = d_materialise_unique(dn, in);
801 if (IS_ERR(realdn)) {
802 pr_err("splice_dentry error %p inode %p ino %llx.%llx\n",
803 dn, in, ceph_vinop(in));
804 if (prehash)
805 *prehash = false; /* don't rehash on error */
806 dn = realdn; /* note realdn contains the error */
807 goto out;
808 } else if (realdn) {
809 dout("dn %p (%d) spliced with %p (%d) "
810 "inode %p ino %llx.%llx\n",
811 dn, atomic_read(&dn->d_count),
812 realdn, atomic_read(&realdn->d_count),
813 realdn->d_inode, ceph_vinop(realdn->d_inode));
814 dput(dn);
815 dn = realdn;
816 } else {
817 BUG_ON(!ceph_dentry(dn));
818
819 dout("dn %p attached to %p ino %llx.%llx\n",
820 dn, dn->d_inode, ceph_vinop(dn->d_inode));
821 }
822 if ((!prehash || *prehash) && d_unhashed(dn))
823 d_rehash(dn);
824out:
825 return dn;
826}
827
828/*
829 * Set dentry's directory position based on the current dir's max, and
830 * order it in d_subdirs, so that dcache_readdir behaves.
831 */
832static void ceph_set_dentry_offset(struct dentry *dn)
833{
834 struct dentry *dir = dn->d_parent;
835 struct inode *inode = dn->d_parent->d_inode;
836 struct ceph_dentry_info *di;
837
838 BUG_ON(!inode);
839
840 di = ceph_dentry(dn);
841
842 spin_lock(&inode->i_lock);
843 di->offset = ceph_inode(inode)->i_max_offset++;
844 spin_unlock(&inode->i_lock);
845
846 spin_lock(&dcache_lock);
847 spin_lock(&dn->d_lock);
848 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
849 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
850 dn->d_u.d_child.prev, dn->d_u.d_child.next);
851 spin_unlock(&dn->d_lock);
852 spin_unlock(&dcache_lock);
853}
854
855/*
856 * Incorporate results into the local cache. This is either just
857 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
858 * after a lookup).
859 *
860 * A reply may contain:
861 *   a directory inode along with a dentry,
862 *   and/or a target inode.
863 *
864 * Called with snap_rwsem (read).
865 */
866int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
867 struct ceph_mds_session *session)
868{
869 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
870 struct inode *in = NULL;
871 struct ceph_mds_reply_inode *ininfo;
872 struct ceph_vino vino;
873 int i = 0;
874 int err = 0;
875
876 dout("fill_trace %p is_dentry %d is_target %d\n", req,
877 rinfo->head->is_dentry, rinfo->head->is_target);
878
879#if 0
880 /*
881 * Debugging hook:
882 *
883 * If we resend completed ops to a recovering mds, we get no
884 * trace. Since that is very rare, pretend this is the case
885 * to ensure the 'no trace' handlers in the callers behave.
886 *
887 * Fill in inodes unconditionally to avoid breaking cap
888 * invariants.
889 */
890 if (rinfo->head->op & CEPH_MDS_OP_WRITE) {
891 pr_info("fill_trace faking empty trace on %lld %s\n",
892 req->r_tid, ceph_mds_op_name(rinfo->head->op));
893 if (rinfo->head->is_dentry) {
894 rinfo->head->is_dentry = 0;
895 err = fill_inode(req->r_locked_dir,
896 &rinfo->diri, rinfo->dirfrag,
897					 session, req->r_request_started, -1, &req->r_caps_reservation);
898 }
899 if (rinfo->head->is_target) {
900 rinfo->head->is_target = 0;
901 ininfo = rinfo->targeti.in;
902 vino.ino = le64_to_cpu(ininfo->ino);
903 vino.snap = le64_to_cpu(ininfo->snapid);
904 in = ceph_get_inode(sb, vino);
905 err = fill_inode(in, &rinfo->targeti, NULL,
906 session, req->r_request_started,
907					 req->r_fmode, &req->r_caps_reservation);
908 iput(in);
909 }
910 }
911#endif
912
913 if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
914 dout("fill_trace reply is empty!\n");
915 if (rinfo->head->result == 0 && req->r_locked_dir) {
916 struct ceph_inode_info *ci =
917 ceph_inode(req->r_locked_dir);
918 dout(" clearing %p complete (empty trace)\n",
919 req->r_locked_dir);
920 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
921 ci->i_release_count++;
922 }
923 return 0;
924 }
925
926 if (rinfo->head->is_dentry) {
927 struct inode *dir = req->r_locked_dir;
928
929 err = fill_inode(dir, &rinfo->diri, rinfo->dirfrag,
930 session, req->r_request_started, -1,
931 &req->r_caps_reservation);
932 if (err < 0)
933 return err;
934 }
935
936 if (rinfo->head->is_dentry && !req->r_aborted) {
937 /*
938 * lookup link rename : null -> possibly existing inode
939 * mknod symlink mkdir : null -> new inode
940 * unlink : linked -> null
941 */
942 struct inode *dir = req->r_locked_dir;
943 struct dentry *dn = req->r_dentry;
944 bool have_dir_cap, have_lease;
945
946 BUG_ON(!dn);
947 BUG_ON(!dir);
948 BUG_ON(dn->d_parent->d_inode != dir);
949 BUG_ON(ceph_ino(dir) !=
950 le64_to_cpu(rinfo->diri.in->ino));
951 BUG_ON(ceph_snap(dir) !=
952 le64_to_cpu(rinfo->diri.in->snapid));
953
954 /* do we have a lease on the whole dir? */
955 have_dir_cap =
956 (le32_to_cpu(rinfo->diri.in->cap.caps) &
957 CEPH_CAP_FILE_SHARED);
958
959 /* do we have a dn lease? */
960 have_lease = have_dir_cap ||
961 (le16_to_cpu(rinfo->dlease->mask) &
962 CEPH_LOCK_DN);
963
964 if (!have_lease)
965 dout("fill_trace no dentry lease or dir cap\n");
966
967 /* rename? */
968 if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
969 dout(" src %p '%.*s' dst %p '%.*s'\n",
970 req->r_old_dentry,
971 req->r_old_dentry->d_name.len,
972 req->r_old_dentry->d_name.name,
973 dn, dn->d_name.len, dn->d_name.name);
974 dout("fill_trace doing d_move %p -> %p\n",
975 req->r_old_dentry, dn);
976 d_move(req->r_old_dentry, dn);
977 dout(" src %p '%.*s' dst %p '%.*s'\n",
978 req->r_old_dentry,
979 req->r_old_dentry->d_name.len,
980 req->r_old_dentry->d_name.name,
981 dn, dn->d_name.len, dn->d_name.name);
982 /* ensure target dentry is invalidated, despite
983 rehashing bug in vfs_rename_dir */
984 dn->d_time = jiffies;
985 ceph_dentry(dn)->lease_shared_gen = 0;
986 /* take overwritten dentry's readdir offset */
987 ceph_dentry(req->r_old_dentry)->offset =
988 ceph_dentry(dn)->offset;
989 dn = req->r_old_dentry; /* use old_dentry */
990 in = dn->d_inode;
991 }
992
993 /* null dentry? */
994 if (!rinfo->head->is_target) {
995 dout("fill_trace null dentry\n");
996 if (dn->d_inode) {
997 dout("d_delete %p\n", dn);
998 d_delete(dn);
999 } else {
1000 dout("d_instantiate %p NULL\n", dn);
1001 d_instantiate(dn, NULL);
1002 if (have_lease && d_unhashed(dn))
1003 d_rehash(dn);
1004 update_dentry_lease(dn, rinfo->dlease,
1005 session,
1006 req->r_request_started);
1007 }
1008 goto done;
1009 }
1010
1011 /* attach proper inode */
1012 ininfo = rinfo->targeti.in;
1013 vino.ino = le64_to_cpu(ininfo->ino);
1014 vino.snap = le64_to_cpu(ininfo->snapid);
1015 if (!dn->d_inode) {
1016 in = ceph_get_inode(sb, vino);
1017 if (IS_ERR(in)) {
1018 pr_err("fill_trace bad get_inode "
1019 "%llx.%llx\n", vino.ino, vino.snap);
1020 err = PTR_ERR(in);
1021 d_delete(dn);
1022 goto done;
1023 }
1024 dn = splice_dentry(dn, in, &have_lease);
1025 if (IS_ERR(dn)) {
1026 err = PTR_ERR(dn);
1027 goto done;
1028 }
1029 req->r_dentry = dn; /* may have spliced */
1030 ceph_set_dentry_offset(dn);
1031 igrab(in);
1032 } else if (ceph_ino(in) == vino.ino &&
1033 ceph_snap(in) == vino.snap) {
1034 igrab(in);
1035 } else {
1036 dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
1037 dn, in, ceph_ino(in), ceph_snap(in),
1038 vino.ino, vino.snap);
1039 have_lease = false;
1040 in = NULL;
1041 }
1042
1043 if (have_lease)
1044 update_dentry_lease(dn, rinfo->dlease, session,
1045 req->r_request_started);
1046 dout(" final dn %p\n", dn);
1047 i++;
1048 } else if (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
1049 req->r_op == CEPH_MDS_OP_MKSNAP) {
1050 struct dentry *dn = req->r_dentry;
1051
1052 /* fill out a snapdir LOOKUPSNAP dentry */
1053 BUG_ON(!dn);
1054 BUG_ON(!req->r_locked_dir);
1055 BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
1056 ininfo = rinfo->targeti.in;
1057 vino.ino = le64_to_cpu(ininfo->ino);
1058 vino.snap = le64_to_cpu(ininfo->snapid);
1059 in = ceph_get_inode(sb, vino);
1060 if (IS_ERR(in)) {
1061 pr_err("fill_inode get_inode badness %llx.%llx\n",
1062 vino.ino, vino.snap);
1063 err = PTR_ERR(in);
1064 d_delete(dn);
1065 goto done;
1066 }
1067 dout(" linking snapped dir %p to dn %p\n", in, dn);
1068 dn = splice_dentry(dn, in, NULL);
1069 if (IS_ERR(dn)) {
1070 err = PTR_ERR(dn);
1071 goto done;
1072 }
1073 ceph_set_dentry_offset(dn);
1074 req->r_dentry = dn; /* may have spliced */
1075 igrab(in);
1076 rinfo->head->is_dentry = 1; /* fool notrace handlers */
1077 }
1078
1079 if (rinfo->head->is_target) {
1080 vino.ino = le64_to_cpu(rinfo->targeti.in->ino);
1081 vino.snap = le64_to_cpu(rinfo->targeti.in->snapid);
1082
1083 if (in == NULL || ceph_ino(in) != vino.ino ||
1084 ceph_snap(in) != vino.snap) {
1085 in = ceph_get_inode(sb, vino);
1086 if (IS_ERR(in)) {
1087 err = PTR_ERR(in);
1088 goto done;
1089 }
1090 }
1091 req->r_target_inode = in;
1092
1093 err = fill_inode(in,
1094 &rinfo->targeti, NULL,
1095 session, req->r_request_started,
1096 (le32_to_cpu(rinfo->head->result) == 0) ?
1097 req->r_fmode : -1,
1098 &req->r_caps_reservation);
1099 if (err < 0) {
1100 pr_err("fill_inode badness %p %llx.%llx\n",
1101 in, ceph_vinop(in));
1102 goto done;
1103 }
1104 }
1105
1106done:
1107 dout("fill_trace done err=%d\n", err);
1108 return err;
1109}
1110
1111/*
1112 * Prepopulate our cache with readdir results, leases, etc.
1113 */
1114int ceph_readdir_prepopulate(struct ceph_mds_request *req,
1115 struct ceph_mds_session *session)
1116{
1117 struct dentry *parent = req->r_dentry;
1118 struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
1119 struct qstr dname;
1120 struct dentry *dn;
1121 struct inode *in;
1122 int err = 0, i;
1123 struct inode *snapdir = NULL;
1124 struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
1125 u64 frag = le32_to_cpu(rhead->args.readdir.frag);
1126 struct ceph_dentry_info *di;
1127
1128 if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
1129 snapdir = ceph_get_snapdir(parent->d_inode);
1130 parent = d_find_alias(snapdir);
1131 dout("readdir_prepopulate %d items under SNAPDIR dn %p\n",
1132 rinfo->dir_nr, parent);
1133 } else {
1134 dout("readdir_prepopulate %d items under dn %p\n",
1135 rinfo->dir_nr, parent);
1136 if (rinfo->dir_dir)
1137 ceph_fill_dirfrag(parent->d_inode, rinfo->dir_dir);
1138 }
1139
1140 for (i = 0; i < rinfo->dir_nr; i++) {
1141 struct ceph_vino vino;
1142
1143 dname.name = rinfo->dir_dname[i];
1144 dname.len = rinfo->dir_dname_len[i];
1145 dname.hash = full_name_hash(dname.name, dname.len);
1146
1147 vino.ino = le64_to_cpu(rinfo->dir_in[i].in->ino);
1148 vino.snap = le64_to_cpu(rinfo->dir_in[i].in->snapid);
1149
1150retry_lookup:
1151 dn = d_lookup(parent, &dname);
1152 dout("d_lookup on parent=%p name=%.*s got %p\n",
1153 parent, dname.len, dname.name, dn);
1154
1155 if (!dn) {
1156 dn = d_alloc(parent, &dname);
1157 dout("d_alloc %p '%.*s' = %p\n", parent,
1158 dname.len, dname.name, dn);
1159 if (dn == NULL) {
1160 dout("d_alloc badness\n");
1161 err = -ENOMEM;
1162 goto out;
1163 }
1164 err = ceph_init_dentry(dn);
1165 if (err < 0)
1166 goto out;
1167 } else if (dn->d_inode &&
1168 (ceph_ino(dn->d_inode) != vino.ino ||
1169 ceph_snap(dn->d_inode) != vino.snap)) {
1170 dout(" dn %p points to wrong inode %p\n",
1171 dn, dn->d_inode);
1172 d_delete(dn);
1173 dput(dn);
1174 goto retry_lookup;
1175 } else {
1176 /* reorder parent's d_subdirs */
1177 spin_lock(&dcache_lock);
1178 spin_lock(&dn->d_lock);
1179 list_move(&dn->d_u.d_child, &parent->d_subdirs);
1180 spin_unlock(&dn->d_lock);
1181 spin_unlock(&dcache_lock);
1182 }
1183
1184 di = dn->d_fsdata;
1185 di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
1186
1187 /* inode */
1188 if (dn->d_inode) {
1189 in = dn->d_inode;
1190 } else {
1191 in = ceph_get_inode(parent->d_sb, vino);
1192			if (IS_ERR(in)) {
1193 dout("new_inode badness\n");
1194 d_delete(dn);
1195 dput(dn);
1196				err = PTR_ERR(in);
1197 goto out;
1198 }
1199 dn = splice_dentry(dn, in, NULL);
1200 }
1201
1202 if (fill_inode(in, &rinfo->dir_in[i], NULL, session,
1203 req->r_request_started, -1,
1204 &req->r_caps_reservation) < 0) {
1205 pr_err("fill_inode badness on %p\n", in);
1206 dput(dn);
1207 continue;
1208 }
1209 update_dentry_lease(dn, rinfo->dir_dlease[i],
1210 req->r_session, req->r_request_started);
1211 dput(dn);
1212 }
1213 req->r_did_prepopulate = true;
1214
1215out:
1216 if (snapdir) {
1217 iput(snapdir);
1218 dput(parent);
1219 }
1220 dout("readdir_prepopulate done\n");
1221 return err;
1222}
1223
1224int ceph_inode_set_size(struct inode *inode, loff_t size)
1225{
1226 struct ceph_inode_info *ci = ceph_inode(inode);
1227 int ret = 0;
1228
1229 spin_lock(&inode->i_lock);
1230 dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
1231 inode->i_size = size;
1232 inode->i_blocks = (size + (1 << 9) - 1) >> 9;
1233
1234 /* tell the MDS if we are approaching max_size */
1235 if ((size << 1) >= ci->i_max_size &&
1236 (ci->i_reported_size << 1) < ci->i_max_size)
1237 ret = 1;
1238
1239 spin_unlock(&inode->i_lock);
1240 return ret;
1241}
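
Concretely, with a hypothetical i_max_size of 8 MB and i_reported_size of 1 MB, a write extending i_size past 4 MB makes (size << 1) >= i_max_size while (i_reported_size << 1) < i_max_size, so this returns 1 and the caller reports the new size to the MDS to request more room; once i_reported_size catches up, the check goes quiet again.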
1242
1243/*
1244 * Write back inode data in a worker thread. (This can't be done
1245 * in the message handler context.)
1246 */
1247void ceph_queue_writeback(struct inode *inode)
1248{
1249 if (queue_work(ceph_inode_to_client(inode)->wb_wq,
1250 &ceph_inode(inode)->i_wb_work)) {
1251 dout("ceph_queue_writeback %p\n", inode);
1252 igrab(inode);
1253 } else {
1254 dout("ceph_queue_writeback %p failed\n", inode);
1255 }
1256}
1257
1258static void ceph_writeback_work(struct work_struct *work)
1259{
1260 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1261 i_wb_work);
1262 struct inode *inode = &ci->vfs_inode;
1263
1264 dout("writeback %p\n", inode);
1265 filemap_fdatawrite(&inode->i_data);
1266 iput(inode);
1267}
1268
1269/*
1270 * queue an async invalidation
1271 */
1272void ceph_queue_invalidate(struct inode *inode)
1273{
1274 if (queue_work(ceph_inode_to_client(inode)->pg_inv_wq,
1275 &ceph_inode(inode)->i_pg_inv_work)) {
1276 dout("ceph_queue_invalidate %p\n", inode);
1277 igrab(inode);
1278 } else {
1279 dout("ceph_queue_invalidate %p failed\n", inode);
1280 }
1281}
1282
1283/*
1284 * invalidate any pages that are not dirty or under writeback. this
1285 * includes pages that are clean and mapped.
1286 */
1287static void ceph_invalidate_nondirty_pages(struct address_space *mapping)
1288{
1289 struct pagevec pvec;
1290 pgoff_t next = 0;
1291 int i;
1292
1293 pagevec_init(&pvec, 0);
1294 while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
1295 for (i = 0; i < pagevec_count(&pvec); i++) {
1296 struct page *page = pvec.pages[i];
1297 pgoff_t index;
1298 int skip_page =
1299 (PageDirty(page) || PageWriteback(page));
1300
1301 if (!skip_page)
1302 skip_page = !trylock_page(page);
1303
1304 /*
1305 * We really shouldn't be looking at the ->index of an
1306 * unlocked page. But we're not allowed to lock these
1307 * pages. So we rely upon nobody altering the ->index
1308 * of this (pinned-by-us) page.
1309 */
1310 index = page->index;
1311 if (index > next)
1312 next = index;
1313 next++;
1314
1315 if (skip_page)
1316 continue;
1317
1318 generic_error_remove_page(mapping, page);
1319 unlock_page(page);
1320 }
1321 pagevec_release(&pvec);
1322 cond_resched();
1323 }
1324}
1325
1326/*
1327 * Invalidate inode pages in a worker thread. (This can't be done
1328 * in the message handler context.)
1329 */
1330static void ceph_invalidate_work(struct work_struct *work)
1331{
1332 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1333 i_pg_inv_work);
1334 struct inode *inode = &ci->vfs_inode;
1335 u32 orig_gen;
1336 int check = 0;
1337
1338 spin_lock(&inode->i_lock);
1339 dout("invalidate_pages %p gen %d revoking %d\n", inode,
1340 ci->i_rdcache_gen, ci->i_rdcache_revoking);
1341 if (ci->i_rdcache_gen == 0 ||
1342 ci->i_rdcache_revoking != ci->i_rdcache_gen) {
1343 BUG_ON(ci->i_rdcache_revoking > ci->i_rdcache_gen);
1344 /* nevermind! */
1345 ci->i_rdcache_revoking = 0;
1346 spin_unlock(&inode->i_lock);
1347 goto out;
1348 }
1349 orig_gen = ci->i_rdcache_gen;
1350 spin_unlock(&inode->i_lock);
1351
1352 ceph_invalidate_nondirty_pages(inode->i_mapping);
1353
1354 spin_lock(&inode->i_lock);
1355 if (orig_gen == ci->i_rdcache_gen) {
1356 dout("invalidate_pages %p gen %d successful\n", inode,
1357 ci->i_rdcache_gen);
1358 ci->i_rdcache_gen = 0;
1359 ci->i_rdcache_revoking = 0;
1360 check = 1;
1361 } else {
1362 dout("invalidate_pages %p gen %d raced, gen now %d\n",
1363 inode, orig_gen, ci->i_rdcache_gen);
1364 }
1365 spin_unlock(&inode->i_lock);
1366
1367 if (check)
1368 ceph_check_caps(ci, 0, NULL);
1369out:
1370 iput(inode);
1371}
1372
1373
1374/*
1375 * called by trunc_wq; take i_mutex ourselves
1376 *
1377 * We also truncate in a separate thread.
1378 */
1379static void ceph_vmtruncate_work(struct work_struct *work)
1380{
1381 struct ceph_inode_info *ci = container_of(work, struct ceph_inode_info,
1382 i_vmtruncate_work);
1383 struct inode *inode = &ci->vfs_inode;
1384
1385 dout("vmtruncate_work %p\n", inode);
1386 mutex_lock(&inode->i_mutex);
1387 __ceph_do_pending_vmtruncate(inode);
1388 mutex_unlock(&inode->i_mutex);
1389 iput(inode);
1390}
1391
1392/*
1393 * Queue an async vmtruncate. If we fail to queue work, we will handle
1394 * the truncation the next time we call __ceph_do_pending_vmtruncate.
1395 */
1396void ceph_queue_vmtruncate(struct inode *inode)
1397{
1398 struct ceph_inode_info *ci = ceph_inode(inode);
1399
1400 if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
1401 &ci->i_vmtruncate_work)) {
1402 dout("ceph_queue_vmtruncate %p\n", inode);
1403 igrab(inode);
1404 } else {
1405 dout("ceph_queue_vmtruncate %p failed, pending=%d\n",
1406 inode, ci->i_truncate_pending);
1407 }
1408}
1409
1410/*
1411 * called with i_mutex held.
1412 *
1413 * Make sure any pending truncation is applied before doing anything
1414 * that may depend on it.
1415 */
1416void __ceph_do_pending_vmtruncate(struct inode *inode)
1417{
1418 struct ceph_inode_info *ci = ceph_inode(inode);
1419 u64 to;
1420 int wrbuffer_refs, wake = 0;
1421
1422retry:
1423 spin_lock(&inode->i_lock);
1424 if (ci->i_truncate_pending == 0) {
1425 dout("__do_pending_vmtruncate %p none pending\n", inode);
1426 spin_unlock(&inode->i_lock);
1427 return;
1428 }
1429
1430 /*
1431 * make sure any dirty snapped pages are flushed before we
1432 * possibly truncate them.. so write AND block!
1433 */
1434 if (ci->i_wrbuffer_ref_head < ci->i_wrbuffer_ref) {
1435 dout("__do_pending_vmtruncate %p flushing snaps first\n",
1436 inode);
1437 spin_unlock(&inode->i_lock);
1438 filemap_write_and_wait_range(&inode->i_data, 0,
1439 inode->i_sb->s_maxbytes);
1440 goto retry;
1441 }
1442
1443 to = ci->i_truncate_size;
1444 wrbuffer_refs = ci->i_wrbuffer_ref;
1445 dout("__do_pending_vmtruncate %p (%d) to %lld\n", inode,
1446 ci->i_truncate_pending, to);
1447 spin_unlock(&inode->i_lock);
1448
1449 truncate_inode_pages(inode->i_mapping, to);
1450
1451 spin_lock(&inode->i_lock);
1452 ci->i_truncate_pending--;
1453 if (ci->i_truncate_pending == 0)
1454 wake = 1;
1455 spin_unlock(&inode->i_lock);
1456
1457 if (wrbuffer_refs == 0)
1458 ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
1459 if (wake)
1460 wake_up(&ci->i_cap_wq);
1461}
1462
1463
1464/*
1465 * symlinks
1466 */
1467static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd)
1468{
1469 struct ceph_inode_info *ci = ceph_inode(dentry->d_inode);
1470 nd_set_link(nd, ci->i_symlink);
1471 return NULL;
1472}
1473
1474static const struct inode_operations ceph_symlink_iops = {
1475 .readlink = generic_readlink,
1476 .follow_link = ceph_sym_follow_link,
1477};
1478
1479/*
1480 * setattr
1481 */
1482int ceph_setattr(struct dentry *dentry, struct iattr *attr)
1483{
1484 struct inode *inode = dentry->d_inode;
1485 struct ceph_inode_info *ci = ceph_inode(inode);
1486 struct inode *parent_inode = dentry->d_parent->d_inode;
1487 const unsigned int ia_valid = attr->ia_valid;
1488 struct ceph_mds_request *req;
1489 struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
1490 int issued;
1491 int release = 0, dirtied = 0;
1492 int mask = 0;
1493 int err = 0;
1494
1495 if (ceph_snap(inode) != CEPH_NOSNAP)
1496 return -EROFS;
1497
1498 __ceph_do_pending_vmtruncate(inode);
1499
1500 err = inode_change_ok(inode, attr);
1501 if (err != 0)
1502 return err;
1503
1504 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR,
1505 USE_AUTH_MDS);
1506 if (IS_ERR(req))
1507 return PTR_ERR(req);
1508
1509 spin_lock(&inode->i_lock);
1510 issued = __ceph_caps_issued(ci, NULL);
1511 dout("setattr %p issued %s\n", inode, ceph_cap_string(issued));
1512
1513 if (ia_valid & ATTR_UID) {
1514 dout("setattr %p uid %d -> %d\n", inode,
1515 inode->i_uid, attr->ia_uid);
1516 if (issued & CEPH_CAP_AUTH_EXCL) {
1517 inode->i_uid = attr->ia_uid;
1518 dirtied |= CEPH_CAP_AUTH_EXCL;
1519 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1520 attr->ia_uid != inode->i_uid) {
1521 req->r_args.setattr.uid = cpu_to_le32(attr->ia_uid);
1522 mask |= CEPH_SETATTR_UID;
1523 release |= CEPH_CAP_AUTH_SHARED;
1524 }
1525 }
1526 if (ia_valid & ATTR_GID) {
1527 dout("setattr %p gid %d -> %d\n", inode,
1528 inode->i_gid, attr->ia_gid);
1529 if (issued & CEPH_CAP_AUTH_EXCL) {
1530 inode->i_gid = attr->ia_gid;
1531 dirtied |= CEPH_CAP_AUTH_EXCL;
1532 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1533 attr->ia_gid != inode->i_gid) {
1534 req->r_args.setattr.gid = cpu_to_le32(attr->ia_gid);
1535 mask |= CEPH_SETATTR_GID;
1536 release |= CEPH_CAP_AUTH_SHARED;
1537 }
1538 }
1539 if (ia_valid & ATTR_MODE) {
1540 dout("setattr %p mode 0%o -> 0%o\n", inode, inode->i_mode,
1541 attr->ia_mode);
1542 if (issued & CEPH_CAP_AUTH_EXCL) {
1543 inode->i_mode = attr->ia_mode;
1544 dirtied |= CEPH_CAP_AUTH_EXCL;
1545 } else if ((issued & CEPH_CAP_AUTH_SHARED) == 0 ||
1546 attr->ia_mode != inode->i_mode) {
1547 req->r_args.setattr.mode = cpu_to_le32(attr->ia_mode);
1548 mask |= CEPH_SETATTR_MODE;
1549 release |= CEPH_CAP_AUTH_SHARED;
1550 }
1551 }
1552
1553 if (ia_valid & ATTR_ATIME) {
1554 dout("setattr %p atime %ld.%ld -> %ld.%ld\n", inode,
1555 inode->i_atime.tv_sec, inode->i_atime.tv_nsec,
1556 attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec);
1557 if (issued & CEPH_CAP_FILE_EXCL) {
1558 ci->i_time_warp_seq++;
1559 inode->i_atime = attr->ia_atime;
1560 dirtied |= CEPH_CAP_FILE_EXCL;
1561 } else if ((issued & CEPH_CAP_FILE_WR) &&
1562 timespec_compare(&inode->i_atime,
1563 &attr->ia_atime) < 0) {
1564 inode->i_atime = attr->ia_atime;
1565 dirtied |= CEPH_CAP_FILE_WR;
1566 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1567 !timespec_equal(&inode->i_atime, &attr->ia_atime)) {
1568 ceph_encode_timespec(&req->r_args.setattr.atime,
1569 &attr->ia_atime);
1570 mask |= CEPH_SETATTR_ATIME;
1571 release |= CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_RD |
1572 CEPH_CAP_FILE_WR;
1573 }
1574 }
1575 if (ia_valid & ATTR_MTIME) {
1576 dout("setattr %p mtime %ld.%ld -> %ld.%ld\n", inode,
1577 inode->i_mtime.tv_sec, inode->i_mtime.tv_nsec,
1578 attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec);
1579 if (issued & CEPH_CAP_FILE_EXCL) {
1580 ci->i_time_warp_seq++;
1581 inode->i_mtime = attr->ia_mtime;
1582 dirtied |= CEPH_CAP_FILE_EXCL;
1583 } else if ((issued & CEPH_CAP_FILE_WR) &&
1584 timespec_compare(&inode->i_mtime,
1585 &attr->ia_mtime) < 0) {
1586 inode->i_mtime = attr->ia_mtime;
1587 dirtied |= CEPH_CAP_FILE_WR;
1588 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1589 !timespec_equal(&inode->i_mtime, &attr->ia_mtime)) {
1590 ceph_encode_timespec(&req->r_args.setattr.mtime,
1591 &attr->ia_mtime);
1592 mask |= CEPH_SETATTR_MTIME;
1593 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1594 CEPH_CAP_FILE_WR;
1595 }
1596 }
1597 if (ia_valid & ATTR_SIZE) {
1598 dout("setattr %p size %lld -> %lld\n", inode,
1599 inode->i_size, attr->ia_size);
1600 if (attr->ia_size > inode->i_sb->s_maxbytes) {
1601 err = -EINVAL;
1602 goto out;
1603 }
1604 if ((issued & CEPH_CAP_FILE_EXCL) &&
1605 attr->ia_size > inode->i_size) {
1606 inode->i_size = attr->ia_size;
1607 inode->i_blocks =
1608 (attr->ia_size + (1 << 9) - 1) >> 9;
1609 inode->i_ctime = attr->ia_ctime;
1610 ci->i_reported_size = attr->ia_size;
1611 dirtied |= CEPH_CAP_FILE_EXCL;
1612 } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 ||
1613 attr->ia_size != inode->i_size) {
1614 req->r_args.setattr.size = cpu_to_le64(attr->ia_size);
1615 req->r_args.setattr.old_size =
1616 cpu_to_le64(inode->i_size);
1617 mask |= CEPH_SETATTR_SIZE;
1618 release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_RD |
1619 CEPH_CAP_FILE_WR;
1620 }
1621 }
1622
1623 /* these do nothing */
1624 if (ia_valid & ATTR_CTIME) {
1625 bool only = (ia_valid & (ATTR_SIZE|ATTR_MTIME|ATTR_ATIME|
1626 ATTR_MODE|ATTR_UID|ATTR_GID)) == 0;
1627 dout("setattr %p ctime %ld.%ld -> %ld.%ld (%s)\n", inode,
1628 inode->i_ctime.tv_sec, inode->i_ctime.tv_nsec,
1629 attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec,
1630 only ? "ctime only" : "ignored");
1631 inode->i_ctime = attr->ia_ctime;
1632 if (only) {
1633 /*
1634			 * if the kernel wants to dirty ctime but nothing else,
1635			 * we need to choose a cap to dirty under, or do
1636			 * an almost-no-op setattr
1637 */
1638 if (issued & CEPH_CAP_AUTH_EXCL)
1639 dirtied |= CEPH_CAP_AUTH_EXCL;
1640 else if (issued & CEPH_CAP_FILE_EXCL)
1641 dirtied |= CEPH_CAP_FILE_EXCL;
1642 else if (issued & CEPH_CAP_XATTR_EXCL)
1643 dirtied |= CEPH_CAP_XATTR_EXCL;
1644 else
1645 mask |= CEPH_SETATTR_CTIME;
1646 }
1647 }
1648 if (ia_valid & ATTR_FILE)
1649 dout("setattr %p ATTR_FILE ... hrm!\n", inode);
1650
1651 if (dirtied) {
1652 __ceph_mark_dirty_caps(ci, dirtied);
1653 inode->i_ctime = CURRENT_TIME;
1654 }
1655
1656 release &= issued;
1657 spin_unlock(&inode->i_lock);
1658
1659 if (mask) {
1660 req->r_inode = igrab(inode);
1661 req->r_inode_drop = release;
1662 req->r_args.setattr.mask = cpu_to_le32(mask);
1663 req->r_num_caps = 1;
1664 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
1665 }
1666 dout("setattr %p result=%d (%s locally, %d remote)\n", inode, err,
1667 ceph_cap_string(dirtied), mask);
1668
1669 ceph_mdsc_put_request(req);
1670 __ceph_do_pending_vmtruncate(inode);
1671 return err;
1672out:
1673 spin_unlock(&inode->i_lock);
1674 ceph_mdsc_put_request(req);
1675 return err;
1676}
1677
1678/*
1679 * Verify that we have a lease on the given mask. If not,
1680 * do a getattr against an mds.
1681 */
1682int ceph_do_getattr(struct inode *inode, int mask)
1683{
1684 struct ceph_client *client = ceph_sb_to_client(inode->i_sb);
1685 struct ceph_mds_client *mdsc = &client->mdsc;
1686 struct ceph_mds_request *req;
1687 int err;
1688
1689 if (ceph_snap(inode) == CEPH_SNAPDIR) {
1690 dout("do_getattr inode %p SNAPDIR\n", inode);
1691 return 0;
1692 }
1693
1694 dout("do_getattr inode %p mask %s\n", inode, ceph_cap_string(mask));
1695 if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1))
1696 return 0;
1697
1698 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
1699 if (IS_ERR(req))
1700 return PTR_ERR(req);
1701 req->r_inode = igrab(inode);
1702 req->r_num_caps = 1;
1703 req->r_args.getattr.mask = cpu_to_le32(mask);
1704 err = ceph_mdsc_do_request(mdsc, NULL, req);
1705 ceph_mdsc_put_request(req);
1706 dout("do_getattr result=%d\n", err);
1707 return err;
1708}
1709
1710
1711/*
1712 * Check inode permissions. We verify we have a valid value for
1713 * the AUTH cap, then call the generic handler.
1714 */
1715int ceph_permission(struct inode *inode, int mask)
1716{
1717 int err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED);
1718
1719 if (!err)
1720 err = generic_permission(inode, mask, NULL);
1721 return err;
1722}
1723
1724/*
1725 * Get all attributes. Hopefully someday we'll have a statlite()
1726 * and can limit the fields we require to be accurate.
1727 */
1728int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
1729 struct kstat *stat)
1730{
1731 struct inode *inode = dentry->d_inode;
1732 struct ceph_inode_info *ci = ceph_inode(inode);
1733 int err;
1734
1735 err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL);
1736 if (!err) {
1737 generic_fillattr(inode, stat);
1738 stat->ino = inode->i_ino;
1739 if (ceph_snap(inode) != CEPH_NOSNAP)
1740 stat->dev = ceph_snap(inode);
1741 else
1742 stat->dev = 0;
1743 if (S_ISDIR(inode->i_mode)) {
1744 stat->size = ci->i_rbytes;
1745 stat->blocks = 0;
1746 stat->blksize = 65536;
1747 }
1748 }
1749 return err;
1750}
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 000000000000..8a5bcae62846
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
1#include <linux/in.h>
2
3#include "ioctl.h"
4#include "super.h"
5#include "ceph_debug.h"
6
7
8/*
9 * ioctls
10 */
11
12/*
13 * get and set the file layout
14 */
15static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
16{
17 struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
18 struct ceph_ioctl_layout l;
19 int err;
20
21 err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
22 if (!err) {
23 l.stripe_unit = ceph_file_layout_su(ci->i_layout);
24 l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
25 l.object_size = ceph_file_layout_object_size(ci->i_layout);
26 l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
27 l.preferred_osd =
28 (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
29 if (copy_to_user(arg, &l, sizeof(l)))
30 return -EFAULT;
31 }
32
33 return err;
34}
35
36static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
37{
38 struct inode *inode = file->f_dentry->d_inode;
39 struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
40 struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
41 struct ceph_mds_request *req;
42 struct ceph_ioctl_layout l;
43 int err, i;
44
45 /* copy and validate */
46 if (copy_from_user(&l, arg, sizeof(l)))
47 return -EFAULT;
48
49 if ((l.object_size & ~PAGE_MASK) ||
50 (l.stripe_unit & ~PAGE_MASK) ||
51 !l.stripe_unit ||
52 (l.object_size &&
53 (unsigned)l.object_size % (unsigned)l.stripe_unit))
54 return -EINVAL;
55
56 /* make sure it's a valid data pool */
57 if (l.data_pool > 0) {
58 mutex_lock(&mdsc->mutex);
59 err = -EINVAL;
60 for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
61 if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
62 err = 0;
63 break;
64 }
65 mutex_unlock(&mdsc->mutex);
66 if (err)
67 return err;
68 }
69
70 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
71 USE_AUTH_MDS);
72 if (IS_ERR(req))
73 return PTR_ERR(req);
74 req->r_inode = igrab(inode);
75 req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
76
77 req->r_args.setlayout.layout.fl_stripe_unit =
78 cpu_to_le32(l.stripe_unit);
79 req->r_args.setlayout.layout.fl_stripe_count =
80 cpu_to_le32(l.stripe_count);
81 req->r_args.setlayout.layout.fl_object_size =
82 cpu_to_le32(l.object_size);
83 req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
84 req->r_args.setlayout.layout.fl_pg_preferred =
85 cpu_to_le32(l.preferred_osd);
86
87 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
88 ceph_mdsc_put_request(req);
89 return err;
90}
91
92/*
93 * Return object name, size/offset information, and location (OSD
94 * number, network address) for a given file offset.
95 */
96static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
97{
98 struct ceph_ioctl_dataloc dl;
99 struct inode *inode = file->f_dentry->d_inode;
100 struct ceph_inode_info *ci = ceph_inode(inode);
101 struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
102 u64 len = 1, olen;
103 u64 tmp;
104 struct ceph_object_layout ol;
105 struct ceph_pg pgid;
106
107 /* copy and validate */
108 if (copy_from_user(&dl, arg, sizeof(dl)))
109 return -EFAULT;
110
111 down_read(&osdc->map_sem);
112 ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
113 &dl.object_no, &dl.object_offset, &olen);
114 dl.file_offset -= dl.object_offset;
115 dl.object_size = ceph_file_layout_object_size(ci->i_layout);
116 dl.block_size = ceph_file_layout_su(ci->i_layout);
117
118 /* block_offset = object_offset % block_size */
119 tmp = dl.object_offset;
120 dl.block_offset = do_div(tmp, dl.block_size);
121
122 snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
123 ceph_ino(inode), dl.object_no);
124 ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
125 osdc->osdmap);
126
127 pgid = ol.ol_pgid;
128 dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
129 if (dl.osd >= 0) {
130 struct ceph_entity_addr *a =
131 ceph_osd_addr(osdc->osdmap, dl.osd);
132 if (a)
133 memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
134 } else {
135 memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
136 }
137 up_read(&osdc->map_sem);
138
139 /* send result back to user */
140 if (copy_to_user(arg, &dl, sizeof(dl)))
141 return -EFAULT;
142
143 return 0;
144}
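
do_div() divides its 64-bit first argument in place and returns the remainder, so the block_offset computation above is object_offset % block_size. With a hypothetical 4 MB stripe unit (block_size = 4194304) and object_offset = 6291456, block_offset comes out as 2097152, i.e. 2 MB into the object's second block.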
145
146long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
147{
148 dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
149 switch (cmd) {
150 case CEPH_IOC_GET_LAYOUT:
151 return ceph_ioctl_get_layout(file, (void __user *)arg);
152
153 case CEPH_IOC_SET_LAYOUT:
154 return ceph_ioctl_set_layout(file, (void __user *)arg);
155
156 case CEPH_IOC_GET_DATALOC:
157 return ceph_ioctl_get_dataloc(file, (void __user *)arg);
158 }
159 return -ENOTTY;
160}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 000000000000..25e4f1a9d059
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
1#ifndef FS_CEPH_IOCTL_H
2#define FS_CEPH_IOCTL_H
3
4#include <linux/ioctl.h>
5#include <linux/types.h>
6
7#define CEPH_IOCTL_MAGIC 0x97
8
9/* just use u64 to align sanely on all archs */
10struct ceph_ioctl_layout {
11 __u64 stripe_unit, stripe_count, object_size;
12 __u64 data_pool;
13 __s64 preferred_osd;
14};
15
16#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
17 struct ceph_ioctl_layout)
18#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
19 struct ceph_ioctl_layout)
20
21/*
22 * Extract identity, address of the OSD and object storing a given
23 * file offset.
24 */
25struct ceph_ioctl_dataloc {
26 __u64 file_offset; /* in+out: file offset */
27 __u64 object_offset; /* out: offset in object */
28 __u64 object_no; /* out: object # */
29 __u64 object_size; /* out: object size */
30 char object_name[64]; /* out: object name */
31 __u64 block_offset; /* out: offset in block */
32 __u64 block_size; /* out: block length */
33 __s64 osd; /* out: osd # */
34 struct sockaddr_storage osd_addr; /* out: osd address */
35};
36
37#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
38 struct ceph_ioctl_dataloc)
39
40#endif
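
A hypothetical userspace caller of the layout ioctl, run against a file on a mounted ceph tree (error handling trimmed to the minimum):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include "ioctl.h"		/* the definitions above */

	int main(int argc, char **argv)
	{
		struct ceph_ioctl_layout l;
		int fd = open(argv[1], O_RDONLY);

		if (fd < 0 || ioctl(fd, CEPH_IOC_GET_LAYOUT, &l) < 0)
			return 1;
		printf("su %llu sc %llu os %llu pool %llu\n",
		       (unsigned long long)l.stripe_unit,
		       (unsigned long long)l.stripe_count,
		       (unsigned long long)l.object_size,
		       (unsigned long long)l.data_pool);
		close(fd);
		return 0;
	}
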
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
new file mode 100644
index 000000000000..a2600101ec22
--- /dev/null
+++ b/fs/ceph/mds_client.c
@@ -0,0 +1,3021 @@
1#include "ceph_debug.h"
2
3#include <linux/wait.h>
4#include <linux/sched.h>
5
6#include "mds_client.h"
7#include "mon_client.h"
8#include "super.h"
9#include "messenger.h"
10#include "decode.h"
11#include "auth.h"
12#include "pagelist.h"
13
14/*
15 * A cluster of MDS (metadata server) daemons is responsible for
16 * managing the file system namespace (the directory hierarchy and
17 * inodes) and for coordinating shared access to storage. Metadata is
18 * partitioned hierarchically across a number of servers, and that
19 * partition varies over time as the cluster adjusts the distribution
20 * in order to balance load.
21 *
22 * The MDS client is primarily responsible for managing synchronous
23 * metadata requests for operations like open, unlink, and so forth.
24 * If there is an MDS failure, we find out about it when we (possibly
25 * request and) receive a new MDS map, and can resubmit affected
26 * requests.
27 *
28 * For the most part, though, we take advantage of a lossless
29 * communications channel to the MDS, and do not need to worry about
30 * timing out or resubmitting requests.
31 *
32 * We maintain a stateful "session" with each MDS we interact with.
33 * Within each session, we send periodic heartbeat messages to ensure
34 * any capabilities or leases we have been issued remain valid. If
35 * the session times out and goes stale, our leases and capabilities
36 * are no longer valid.
37 */
38
39static void __wake_requests(struct ceph_mds_client *mdsc,
40 struct list_head *head);
41
42static const struct ceph_connection_operations mds_con_ops;
43
44
45/*
46 * mds reply parsing
47 */
48
49/*
50 * parse individual inode info
51 */
52static int parse_reply_info_in(void **p, void *end,
53 struct ceph_mds_reply_info_in *info)
54{
55 int err = -EIO;
56
57 info->in = *p;
58 *p += sizeof(struct ceph_mds_reply_inode) +
59 sizeof(*info->in->fragtree.splits) *
60 le32_to_cpu(info->in->fragtree.nsplits);
61
62 ceph_decode_32_safe(p, end, info->symlink_len, bad);
63 ceph_decode_need(p, end, info->symlink_len, bad);
64 info->symlink = *p;
65 *p += info->symlink_len;
66
67 ceph_decode_32_safe(p, end, info->xattr_len, bad);
68 ceph_decode_need(p, end, info->xattr_len, bad);
69 info->xattr_data = *p;
70 *p += info->xattr_len;
71 return 0;
72bad:
73 return err;
74}
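
The *_safe decode helpers used above all follow a bounds-check-then-advance pattern; a sketch of the core check, assuming the canonical macros live in decode.h:

	/* bail out to the 'bad' label unless n more bytes remain in [*p, end) */
	#define decode_need_sketch(p, end, n, bad)			\
		do {							\
			if (unlikely(*(p) + (n) > (end)))		\
				goto bad;				\
		} while (0)

ceph_decode_32_safe() builds on this: verify 4 bytes remain, le32-decode the value, then advance *p past it.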
75
76/*
77 * parse a normal reply, which may contain a (dir+)dentry and/or a
78 * target inode.
79 */
80static int parse_reply_info_trace(void **p, void *end,
81 struct ceph_mds_reply_info_parsed *info)
82{
83 int err;
84
85 if (info->head->is_dentry) {
86 err = parse_reply_info_in(p, end, &info->diri);
87 if (err < 0)
88 goto out_bad;
89
90 if (unlikely(*p + sizeof(*info->dirfrag) > end))
91 goto bad;
92 info->dirfrag = *p;
93 *p += sizeof(*info->dirfrag) +
94 sizeof(u32)*le32_to_cpu(info->dirfrag->ndist);
95 if (unlikely(*p > end))
96 goto bad;
97
98 ceph_decode_32_safe(p, end, info->dname_len, bad);
99 ceph_decode_need(p, end, info->dname_len, bad);
100 info->dname = *p;
101 *p += info->dname_len;
102 info->dlease = *p;
103 *p += sizeof(*info->dlease);
104 }
105
106 if (info->head->is_target) {
107 err = parse_reply_info_in(p, end, &info->targeti);
108 if (err < 0)
109 goto out_bad;
110 }
111
112 if (unlikely(*p != end))
113 goto bad;
114 return 0;
115
116bad:
117 err = -EIO;
118out_bad:
119 pr_err("problem parsing mds trace %d\n", err);
120 return err;
121}
122
123/*
124 * parse readdir results
125 */
126static int parse_reply_info_dir(void **p, void *end,
127 struct ceph_mds_reply_info_parsed *info)
128{
129 u32 num, i = 0;
130 int err;
131
132 info->dir_dir = *p;
133 if (*p + sizeof(*info->dir_dir) > end)
134 goto bad;
135 *p += sizeof(*info->dir_dir) +
136 sizeof(u32)*le32_to_cpu(info->dir_dir->ndist);
137 if (*p > end)
138 goto bad;
139
140 ceph_decode_need(p, end, sizeof(num) + 2, bad);
141 num = ceph_decode_32(p);
142 info->dir_end = ceph_decode_8(p);
143 info->dir_complete = ceph_decode_8(p);
144 if (num == 0)
145 goto done;
146
147 /* alloc large array */
148 info->dir_nr = num;
149 info->dir_in = kcalloc(num, sizeof(*info->dir_in) +
150 sizeof(*info->dir_dname) +
151 sizeof(*info->dir_dname_len) +
152 sizeof(*info->dir_dlease),
153 GFP_NOFS);
154 if (info->dir_in == NULL) {
155 err = -ENOMEM;
156 goto out_bad;
157 }
158 info->dir_dname = (void *)(info->dir_in + num);
159 info->dir_dname_len = (void *)(info->dir_dname + num);
160 info->dir_dlease = (void *)(info->dir_dname_len + num);
161
162 while (num) {
163 /* dentry */
164 ceph_decode_need(p, end, sizeof(u32)*2, bad);
165 info->dir_dname_len[i] = ceph_decode_32(p);
166 ceph_decode_need(p, end, info->dir_dname_len[i], bad);
167 info->dir_dname[i] = *p;
168 *p += info->dir_dname_len[i];
169 dout("parsed dir dname '%.*s'\n", info->dir_dname_len[i],
170 info->dir_dname[i]);
171 info->dir_dlease[i] = *p;
172 *p += sizeof(struct ceph_mds_reply_lease);
173
174 /* inode */
175 err = parse_reply_info_in(p, end, &info->dir_in[i]);
176 if (err < 0)
177 goto out_bad;
178 i++;
179 num--;
180 }
181
182done:
183 if (*p != end)
184 goto bad;
185 return 0;
186
187bad:
188 err = -EIO;
189out_bad:
190 pr_err("problem parsing dir contents %d\n", err);
191 return err;
192}
193
194/*
195 * parse entire mds reply
196 */
197static int parse_reply_info(struct ceph_msg *msg,
198 struct ceph_mds_reply_info_parsed *info)
199{
200 void *p, *end;
201 u32 len;
202 int err;
203
204 info->head = msg->front.iov_base;
205 p = msg->front.iov_base + sizeof(struct ceph_mds_reply_head);
206 end = p + msg->front.iov_len - sizeof(struct ceph_mds_reply_head);
207
208 /* trace */
209 ceph_decode_32_safe(&p, end, len, bad);
210 if (len > 0) {
211 err = parse_reply_info_trace(&p, p+len, info);
212 if (err < 0)
213 goto out_bad;
214 }
215
216 /* dir content */
217 ceph_decode_32_safe(&p, end, len, bad);
218 if (len > 0) {
219 err = parse_reply_info_dir(&p, p+len, info);
220 if (err < 0)
221 goto out_bad;
222 }
223
224 /* snap blob */
225 ceph_decode_32_safe(&p, end, len, bad);
226 info->snapblob_len = len;
227 info->snapblob = p;
228 p += len;
229
230 if (p != end)
231 goto bad;
232 return 0;
233
234bad:
235 err = -EIO;
236out_bad:
237 pr_err("mds parse_reply err %d\n", err);
238 return err;
239}
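/*
 * Editorial aside (not in the original patch): the ceph_decode_*_safe
 * helpers from decode.h bounds-check the buffer before reading and
 * jump to the supplied label on truncation, which is why each parser
 * above ends in a "bad:" label.  A minimal sketch of the idiom:
 */
#if 0
static int example_decode_u32_pair(void **p, void *end, u32 *a, u32 *b)
{
	ceph_decode_32_safe(p, end, *a, bad);	/* fails if < 4 bytes remain */
	ceph_decode_32_safe(p, end, *b, bad);
	return 0;
bad:
	return -EIO;	/* message was shorter than it claimed */
}
#endif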
240
241static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info)
242{
243 kfree(info->dir_in);
244}
245
246
247/*
248 * sessions
249 */
250static const char *session_state_name(int s)
251{
252 switch (s) {
253 case CEPH_MDS_SESSION_NEW: return "new";
254 case CEPH_MDS_SESSION_OPENING: return "opening";
255 case CEPH_MDS_SESSION_OPEN: return "open";
256 case CEPH_MDS_SESSION_HUNG: return "hung";
257 case CEPH_MDS_SESSION_CLOSING: return "closing";
258 case CEPH_MDS_SESSION_RESTARTING: return "restarting";
259 case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
260 default: return "???";
261 }
262}
263
264static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
265{
266 if (atomic_inc_not_zero(&s->s_ref)) {
267 dout("mdsc get_session %p %d -> %d\n", s,
268 atomic_read(&s->s_ref)-1, atomic_read(&s->s_ref));
269 return s;
270 } else {
271 dout("mdsc get_session %p 0 -- FAIL", s);
272 return NULL;
273 }
274}
275
276void ceph_put_mds_session(struct ceph_mds_session *s)
277{
278 dout("mdsc put_session %p %d -> %d\n", s,
279 atomic_read(&s->s_ref), atomic_read(&s->s_ref)-1);
280 if (atomic_dec_and_test(&s->s_ref)) {
281 if (s->s_authorizer)
282 s->s_mdsc->client->monc.auth->ops->destroy_authorizer(
283 s->s_mdsc->client->monc.auth, s->s_authorizer);
284 kfree(s);
285 }
286}
287
288/*
289 * called under mdsc->mutex
290 */
291struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
292 int mds)
293{
294 struct ceph_mds_session *session;
295
296 if (mds >= mdsc->max_sessions || mdsc->sessions[mds] == NULL)
297 return NULL;
298 session = mdsc->sessions[mds];
299 dout("lookup_mds_session %p %d\n", session,
300 atomic_read(&session->s_ref));
301 get_session(session);
302 return session;
303}
304
305static bool __have_session(struct ceph_mds_client *mdsc, int mds)
306{
307 if (mds >= mdsc->max_sessions)
308 return false;
309 return mdsc->sessions[mds];
310}
311
312static int __verify_registered_session(struct ceph_mds_client *mdsc,
313 struct ceph_mds_session *s)
314{
315 if (s->s_mds >= mdsc->max_sessions ||
316 mdsc->sessions[s->s_mds] != s)
317 return -ENOENT;
318 return 0;
319}
320
321/*
322 * create+register a new session for given mds.
323 * called under mdsc->mutex.
324 */
325static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
326 int mds)
327{
328 struct ceph_mds_session *s;
329
330 s = kzalloc(sizeof(*s), GFP_NOFS);
 if (!s)
 return ERR_PTR(-ENOMEM);
331 s->s_mdsc = mdsc;
332 s->s_mds = mds;
333 s->s_state = CEPH_MDS_SESSION_NEW;
334 s->s_ttl = 0;
335 s->s_seq = 0;
336 mutex_init(&s->s_mutex);
337
338 ceph_con_init(mdsc->client->msgr, &s->s_con);
339 s->s_con.private = s;
340 s->s_con.ops = &mds_con_ops;
341 s->s_con.peer_name.type = CEPH_ENTITY_TYPE_MDS;
342 s->s_con.peer_name.num = cpu_to_le64(mds);
343
344 spin_lock_init(&s->s_cap_lock);
345 s->s_cap_gen = 0;
346 s->s_cap_ttl = 0;
347 s->s_renew_requested = 0;
348 s->s_renew_seq = 0;
349 INIT_LIST_HEAD(&s->s_caps);
350 s->s_nr_caps = 0;
351 s->s_trim_caps = 0;
352 atomic_set(&s->s_ref, 1);
353 INIT_LIST_HEAD(&s->s_waiting);
354 INIT_LIST_HEAD(&s->s_unsafe);
355 s->s_num_cap_releases = 0;
356 s->s_cap_iterator = NULL;
357 INIT_LIST_HEAD(&s->s_cap_releases);
358 INIT_LIST_HEAD(&s->s_cap_releases_done);
359 INIT_LIST_HEAD(&s->s_cap_flushing);
360 INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
361
362 dout("register_session mds%d\n", mds);
363 if (mds >= mdsc->max_sessions) {
364 int newmax = 1 << get_count_order(mds+1);
365 struct ceph_mds_session **sa;
366
367 dout("register_session realloc to %d\n", newmax);
368 sa = kcalloc(newmax, sizeof(void *), GFP_NOFS);
369 if (sa == NULL)
370 goto fail_realloc;
371 if (mdsc->sessions) {
372 memcpy(sa, mdsc->sessions,
373 mdsc->max_sessions * sizeof(void *));
374 kfree(mdsc->sessions);
375 }
376 mdsc->sessions = sa;
377 mdsc->max_sessions = newmax;
378 }
379 mdsc->sessions[mds] = s;
380 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
381
382 ceph_con_open(&s->s_con, ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
383
384 return s;
385
386fail_realloc:
387 kfree(s);
388 return ERR_PTR(-ENOMEM);
389}
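/*
 * Editorial note: the sessions array grows to the next power of two
 * that can hold the new mds rank; e.g. registering mds5 against a
 * 4-entry array reallocates to 1 << get_count_order(6) == 8 slots.
 */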
390
391/*
392 * called under mdsc->mutex
393 */
394static void __unregister_session(struct ceph_mds_client *mdsc,
395 struct ceph_mds_session *s)
396{
397 dout("__unregister_session mds%d %p\n", s->s_mds, s);
398 BUG_ON(mdsc->sessions[s->s_mds] != s);
399 mdsc->sessions[s->s_mds] = NULL;
400 ceph_con_close(&s->s_con);
401 ceph_put_mds_session(s);
402}
403
404/*
405 * drop session refs in request.
406 *
407 * should be last request ref, or hold mdsc->mutex
408 */
409static void put_request_session(struct ceph_mds_request *req)
410{
411 if (req->r_session) {
412 ceph_put_mds_session(req->r_session);
413 req->r_session = NULL;
414 }
415}
416
417void ceph_mdsc_release_request(struct kref *kref)
418{
419 struct ceph_mds_request *req = container_of(kref,
420 struct ceph_mds_request,
421 r_kref);
422 if (req->r_request)
423 ceph_msg_put(req->r_request);
424 if (req->r_reply) {
425 ceph_msg_put(req->r_reply);
426 destroy_reply_info(&req->r_reply_info);
427 }
428 if (req->r_inode) {
429 ceph_put_cap_refs(ceph_inode(req->r_inode),
430 CEPH_CAP_PIN);
431 iput(req->r_inode);
432 }
433 if (req->r_locked_dir)
434 ceph_put_cap_refs(ceph_inode(req->r_locked_dir),
435 CEPH_CAP_PIN);
436 if (req->r_target_inode)
437 iput(req->r_target_inode);
438 if (req->r_dentry)
439 dput(req->r_dentry);
440 if (req->r_old_dentry) {
441 ceph_put_cap_refs(
442 ceph_inode(req->r_old_dentry->d_parent->d_inode),
443 CEPH_CAP_PIN);
444 dput(req->r_old_dentry);
445 }
446 kfree(req->r_path1);
447 kfree(req->r_path2);
448 put_request_session(req);
449 ceph_unreserve_caps(&req->r_caps_reservation);
450 kfree(req);
451}
452
453/*
454 * lookup request, bump ref if found.
455 *
456 * called under mdsc->mutex.
457 */
458static struct ceph_mds_request *__lookup_request(struct ceph_mds_client *mdsc,
459 u64 tid)
460{
461 struct ceph_mds_request *req;
462 struct rb_node *n = mdsc->request_tree.rb_node;
463
464 while (n) {
465 req = rb_entry(n, struct ceph_mds_request, r_node);
466 if (tid < req->r_tid)
467 n = n->rb_left;
468 else if (tid > req->r_tid)
469 n = n->rb_right;
470 else {
471 ceph_mdsc_get_request(req);
472 return req;
473 }
474 }
475 return NULL;
476}
477
478static void __insert_request(struct ceph_mds_client *mdsc,
479 struct ceph_mds_request *new)
480{
481 struct rb_node **p = &mdsc->request_tree.rb_node;
482 struct rb_node *parent = NULL;
483 struct ceph_mds_request *req = NULL;
484
485 while (*p) {
486 parent = *p;
487 req = rb_entry(parent, struct ceph_mds_request, r_node);
488 if (new->r_tid < req->r_tid)
489 p = &(*p)->rb_left;
490 else if (new->r_tid > req->r_tid)
491 p = &(*p)->rb_right;
492 else
493 BUG();
494 }
495
496 rb_link_node(&new->r_node, parent, p);
497 rb_insert_color(&new->r_node, &mdsc->request_tree);
498}
499
500/*
501 * Register an in-flight request, and assign a tid. Link to the
502 * directory we are modifying (if any).
503 *
504 * Called under mdsc->mutex.
505 */
506static void __register_request(struct ceph_mds_client *mdsc,
507 struct ceph_mds_request *req,
508 struct inode *dir)
509{
510 req->r_tid = ++mdsc->last_tid;
511 if (req->r_num_caps)
512 ceph_reserve_caps(&req->r_caps_reservation, req->r_num_caps);
513 dout("__register_request %p tid %lld\n", req, req->r_tid);
514 ceph_mdsc_get_request(req);
515 __insert_request(mdsc, req);
516
517 if (dir) {
518 struct ceph_inode_info *ci = ceph_inode(dir);
519
520 spin_lock(&ci->i_unsafe_lock);
521 req->r_unsafe_dir = dir;
522 list_add_tail(&req->r_unsafe_dir_item, &ci->i_unsafe_dirops);
523 spin_unlock(&ci->i_unsafe_lock);
524 }
525}
526
527static void __unregister_request(struct ceph_mds_client *mdsc,
528 struct ceph_mds_request *req)
529{
530 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
531 rb_erase(&req->r_node, &mdsc->request_tree);
532 ceph_mdsc_put_request(req);
533
534 if (req->r_unsafe_dir) {
535 struct ceph_inode_info *ci = ceph_inode(req->r_unsafe_dir);
536
537 spin_lock(&ci->i_unsafe_lock);
538 list_del_init(&req->r_unsafe_dir_item);
539 spin_unlock(&ci->i_unsafe_lock);
540 }
541}
542
543/*
544 * Choose mds to send request to next. If there is a hint set in the
545 * request (e.g., due to a prior forward hint from the mds), use that.
546 * Otherwise, consult frag tree and/or caps to identify the
547 * appropriate mds. If all else fails, choose randomly.
548 *
549 * Called under mdsc->mutex.
550 */
551static int __choose_mds(struct ceph_mds_client *mdsc,
552 struct ceph_mds_request *req)
553{
554 struct inode *inode;
555 struct ceph_inode_info *ci;
556 struct ceph_cap *cap;
557 int mode = req->r_direct_mode;
558 int mds = -1;
559 u32 hash = req->r_direct_hash;
560 bool is_hash = req->r_direct_is_hash;
561
562 /*
563 * is there a specific mds we should try? ignore hint if we have
564 * no session and the mds is not up (active or recovering).
565 */
566 if (req->r_resend_mds >= 0 &&
567 (__have_session(mdsc, req->r_resend_mds) ||
568 ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
569 dout("choose_mds using resend_mds mds%d\n",
570 req->r_resend_mds);
571 return req->r_resend_mds;
572 }
573
574 if (mode == USE_RANDOM_MDS)
575 goto random;
576
577 inode = NULL;
578 if (req->r_inode) {
579 inode = req->r_inode;
580 } else if (req->r_dentry) {
581 if (req->r_dentry->d_inode) {
582 inode = req->r_dentry->d_inode;
583 } else {
584 inode = req->r_dentry->d_parent->d_inode;
585 hash = req->r_dentry->d_name.hash;
586 is_hash = true;
587 }
588 }
589 dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
590 (int)hash, mode);
591 if (!inode)
592 goto random;
593 ci = ceph_inode(inode);
594
595 if (is_hash && S_ISDIR(inode->i_mode)) {
596 struct ceph_inode_frag frag;
597 int found;
598
599 ceph_choose_frag(ci, hash, &frag, &found);
600 if (found) {
601 if (mode == USE_ANY_MDS && frag.ndist > 0) {
602 u8 r;
603
604 /* choose a random replica */
605 get_random_bytes(&r, 1);
606 r %= frag.ndist;
607 mds = frag.dist[r];
608 dout("choose_mds %p %llx.%llx "
609 "frag %u mds%d (%d/%d)\n",
610 inode, ceph_vinop(inode),
611 frag.frag, frag.mds,
612 (int)r, frag.ndist);
613 return mds;
614 }
615
616 /* since this file/dir wasn't known to be
617 * replicated, we want to look for the
618 * authoritative mds. */
619 mode = USE_AUTH_MDS;
620 if (frag.mds >= 0) {
621 /* choose auth mds */
622 mds = frag.mds;
623 dout("choose_mds %p %llx.%llx "
624 "frag %u mds%d (auth)\n",
625 inode, ceph_vinop(inode), frag.frag, mds);
626 return mds;
627 }
628 }
629 }
630
631 spin_lock(&inode->i_lock);
632 cap = NULL;
633 if (mode == USE_AUTH_MDS)
634 cap = ci->i_auth_cap;
635 if (!cap && !RB_EMPTY_ROOT(&ci->i_caps))
636 cap = rb_entry(rb_first(&ci->i_caps), struct ceph_cap, ci_node);
637 if (!cap) {
638 spin_unlock(&inode->i_lock);
639 goto random;
640 }
641 mds = cap->session->s_mds;
642 dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
643 inode, ceph_vinop(inode), mds,
644 cap == ci->i_auth_cap ? "auth " : "", cap);
645 spin_unlock(&inode->i_lock);
646 return mds;
647
648random:
649 mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
650 dout("choose_mds chose random mds%d\n", mds);
651 return mds;
652}
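/*
 * Editorial example (not part of the original patch): a caller that
 * knows an operation must go to the authoritative MDS can steer
 * __choose_mds() via the request's direct mode before submitting:
 *
 *	req->r_direct_mode = USE_AUTH_MDS;
 *	err = ceph_mdsc_do_request(mdsc, dir, req);
 *
 * USE_ANY_MDS allows a random replica when the fragtree says the
 * dirfrag is replicated; USE_RANDOM_MDS skips the cap/frag
 * consultation entirely.
 */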
653
654
655/*
656 * session messages
657 */
658static struct ceph_msg *create_session_msg(u32 op, u64 seq)
659{
660 struct ceph_msg *msg;
661 struct ceph_mds_session_head *h;
662
663 msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
664 if (IS_ERR(msg)) {
665 pr_err("create_session_msg ENOMEM creating msg\n");
666 return ERR_PTR(PTR_ERR(msg));
667 }
668 h = msg->front.iov_base;
669 h->op = cpu_to_le32(op);
670 h->seq = cpu_to_le64(seq);
671 return msg;
672}
673
674/*
675 * send session open request.
676 *
677 * called under mdsc->mutex
678 */
679static int __open_session(struct ceph_mds_client *mdsc,
680 struct ceph_mds_session *session)
681{
682 struct ceph_msg *msg;
683 int mstate;
684 int mds = session->s_mds;
685 int err = 0;
686
687 /* wait for mds to go active? */
688 mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
689 dout("open_session to mds%d (%s)\n", mds,
690 ceph_mds_state_name(mstate));
691 session->s_state = CEPH_MDS_SESSION_OPENING;
692 session->s_renew_requested = jiffies;
693
694 /* send connect message */
695 msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
696 if (IS_ERR(msg)) {
697 err = PTR_ERR(msg);
698 goto out;
699 }
700 ceph_con_send(&session->s_con, msg);
701
702out:
703 return err;
704}
705
706/*
707 * session caps
708 */
709
710/*
711 * Free preallocated cap messages assigned to this session
712 */
713static void cleanup_cap_releases(struct ceph_mds_session *session)
714{
715 struct ceph_msg *msg;
716
717 spin_lock(&session->s_cap_lock);
718 while (!list_empty(&session->s_cap_releases)) {
719 msg = list_first_entry(&session->s_cap_releases,
720 struct ceph_msg, list_head);
721 list_del_init(&msg->list_head);
722 ceph_msg_put(msg);
723 }
724 while (!list_empty(&session->s_cap_releases_done)) {
725 msg = list_first_entry(&session->s_cap_releases_done,
726 struct ceph_msg, list_head);
727 list_del_init(&msg->list_head);
728 ceph_msg_put(msg);
729 }
730 spin_unlock(&session->s_cap_lock);
731}
732
733/*
734 * Helper to safely iterate over all caps associated with a session.
735 *
736 * caller must hold session s_mutex
737 */
738static int iterate_session_caps(struct ceph_mds_session *session,
739 int (*cb)(struct inode *, struct ceph_cap *,
740 void *), void *arg)
741{
742 struct list_head *p;
743 struct ceph_cap *cap;
744 struct inode *inode, *last_inode = NULL;
745 struct ceph_cap *old_cap = NULL;
746 int ret;
747
748 dout("iterate_session_caps %p mds%d\n", session, session->s_mds);
749 spin_lock(&session->s_cap_lock);
750 p = session->s_caps.next;
751 while (p != &session->s_caps) {
752 cap = list_entry(p, struct ceph_cap, session_caps);
753 inode = igrab(&cap->ci->vfs_inode);
754 if (!inode) {
755 p = p->next;
756 continue;
757 }
758 session->s_cap_iterator = cap;
759 spin_unlock(&session->s_cap_lock);
760
761 if (last_inode) {
762 iput(last_inode);
763 last_inode = NULL;
764 }
765 if (old_cap) {
766 ceph_put_cap(old_cap);
767 old_cap = NULL;
768 }
769
770 ret = cb(inode, cap, arg);
771 last_inode = inode;
772
773 spin_lock(&session->s_cap_lock);
774 p = p->next;
775 if (cap->ci == NULL) {
776 dout("iterate_session_caps finishing cap %p removal\n",
777 cap);
778 BUG_ON(cap->session != session);
779 list_del_init(&cap->session_caps);
780 session->s_nr_caps--;
781 cap->session = NULL;
782 old_cap = cap; /* put_cap it w/o locks held */
783 }
784 if (ret < 0)
785 goto out;
786 }
787 ret = 0;
788out:
789 session->s_cap_iterator = NULL;
790 spin_unlock(&session->s_cap_lock);
791
792 if (last_inode)
793 iput(last_inode);
794 if (old_cap)
795 ceph_put_cap(old_cap);
796
797 return ret;
798}
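/*
 * Editorial sketch of using the iterator above (not part of the
 * original patch).  Callbacks run without s_cap_lock held, so they may
 * sleep; a negative return value stops the walk.
 */
#if 0
static int example_count_cb(struct inode *inode, struct ceph_cap *cap,
			    void *arg)
{
	(*(int *)arg)++;	/* just count; return 0 to keep iterating */
	return 0;
}

static int example_count_session_caps(struct ceph_mds_session *session)
{
	int n = 0;

	iterate_session_caps(session, example_count_cb, &n);
	return n;
}
#endif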
799
800static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
801 void *arg)
802{
803 struct ceph_inode_info *ci = ceph_inode(inode);
804 dout("removing cap %p, ci is %p, inode is %p\n",
805 cap, ci, &ci->vfs_inode);
806 ceph_remove_cap(cap);
807 return 0;
808}
809
810/*
811 * caller must hold session s_mutex
812 */
813static void remove_session_caps(struct ceph_mds_session *session)
814{
815 dout("remove_session_caps on %p\n", session);
816 iterate_session_caps(session, remove_session_caps_cb, NULL);
817 BUG_ON(session->s_nr_caps > 0);
818 cleanup_cap_releases(session);
819}
820
821/*
822 * wake up any threads waiting on this session's caps. if reconnect
823 * is set, also reset wanted/requested max_size so it is re-requested.
824 *
825 * caller must hold s_mutex.
826 */
827static int wake_up_session_cb(struct inode *inode, struct ceph_cap *cap,
828 void *arg)
829{
830 struct ceph_inode_info *ci = ceph_inode(inode);
831
832 wake_up(&ci->i_cap_wq);
833 if (arg) {
834 spin_lock(&inode->i_lock);
835 ci->i_wanted_max_size = 0;
836 ci->i_requested_max_size = 0;
837 spin_unlock(&inode->i_lock);
838 }
839 return 0;
840}
841
842static void wake_up_session_caps(struct ceph_mds_session *session,
843 int reconnect)
844{
845 dout("wake_up_session_caps %p mds%d\n", session, session->s_mds);
846 iterate_session_caps(session, wake_up_session_cb,
847 (void *)(unsigned long)reconnect);
848}
849
850/*
851 * Send periodic message to MDS renewing all currently held caps. The
852 * ack will reset the expiration for all caps from this session.
853 *
854 * caller holds s_mutex
855 */
856static int send_renew_caps(struct ceph_mds_client *mdsc,
857 struct ceph_mds_session *session)
858{
859 struct ceph_msg *msg;
860 int state;
861
862 if (time_after_eq(jiffies, session->s_cap_ttl) &&
863 time_after_eq(session->s_cap_ttl, session->s_renew_requested))
864 pr_info("mds%d caps stale\n", session->s_mds);
865
866 /* do not try to renew caps until a recovering mds has reconnected
867 * with its clients. */
868 state = ceph_mdsmap_get_state(mdsc->mdsmap, session->s_mds);
869 if (state < CEPH_MDS_STATE_RECONNECT) {
870 dout("send_renew_caps ignoring mds%d (%s)\n",
871 session->s_mds, ceph_mds_state_name(state));
872 return 0;
873 }
874
875 dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
876 ceph_mds_state_name(state));
877 session->s_renew_requested = jiffies;
878 msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
879 ++session->s_renew_seq);
880 if (IS_ERR(msg))
881 return PTR_ERR(msg);
882 ceph_con_send(&session->s_con, msg);
883 return 0;
884}
885
886/*
887 * Note new cap ttl, and any transition from stale -> not stale (fresh?).
888 *
889 * Called under session->s_mutex
890 */
891static void renewed_caps(struct ceph_mds_client *mdsc,
892 struct ceph_mds_session *session, int is_renew)
893{
894 int was_stale;
895 int wake = 0;
896
897 spin_lock(&session->s_cap_lock);
898 was_stale = is_renew && (session->s_cap_ttl == 0 ||
899 time_after_eq(jiffies, session->s_cap_ttl));
900
901 session->s_cap_ttl = session->s_renew_requested +
902 mdsc->mdsmap->m_session_timeout*HZ;
903
904 if (was_stale) {
905 if (time_before(jiffies, session->s_cap_ttl)) {
906 pr_info("mds%d caps renewed\n", session->s_mds);
907 wake = 1;
908 } else {
909 pr_info("mds%d caps still stale\n", session->s_mds);
910 }
911 }
912 dout("renewed_caps mds%d ttl now %lu, was %s, now %s\n",
913 session->s_mds, session->s_cap_ttl, was_stale ? "stale" : "fresh",
914 time_before(jiffies, session->s_cap_ttl) ? "fresh" : "stale");
915 spin_unlock(&session->s_cap_lock);
916
917 if (wake)
918 wake_up_session_caps(session, 0);
919}
920
921/*
922 * send a session close request
923 */
924static int request_close_session(struct ceph_mds_client *mdsc,
925 struct ceph_mds_session *session)
926{
927 struct ceph_msg *msg;
928 int err = 0;
929
930 dout("request_close_session mds%d state %s seq %lld\n",
931 session->s_mds, session_state_name(session->s_state),
932 session->s_seq);
933 msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
934 if (IS_ERR(msg))
935 err = PTR_ERR(msg);
936 else
937 ceph_con_send(&session->s_con, msg);
938 return err;
939}
940
941/*
942 * Called with s_mutex held.
943 */
944static int __close_session(struct ceph_mds_client *mdsc,
945 struct ceph_mds_session *session)
946{
947 if (session->s_state >= CEPH_MDS_SESSION_CLOSING)
948 return 0;
949 session->s_state = CEPH_MDS_SESSION_CLOSING;
950 return request_close_session(mdsc, session);
951}
952
953/*
954 * Trim old(er) caps.
955 *
956 * Because we can't cache an inode without one or more caps, we do
957 * this indirectly: if a cap is unused, we prune its aliases, at which
958 * point the inode will hopefully get dropped too.
959 *
960 * Yes, this is a bit sloppy. Our only real goal here is to respond to
961 * memory pressure from the MDS, though, so it needn't be perfect.
962 */
963static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg)
964{
965 struct ceph_mds_session *session = arg;
966 struct ceph_inode_info *ci = ceph_inode(inode);
967 int used, oissued, mine;
968
969 if (session->s_trim_caps <= 0)
970 return -1;
971
972 spin_lock(&inode->i_lock);
973 mine = cap->issued | cap->implemented;
974 used = __ceph_caps_used(ci);
975 oissued = __ceph_caps_issued_other(ci, cap);
976
977 dout("trim_caps_cb %p cap %p mine %s oissued %s used %s\n",
978 inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued),
979 ceph_cap_string(used));
980 if (ci->i_dirty_caps)
981 goto out; /* dirty caps */
982 if ((used & ~oissued) & mine)
983 goto out; /* we need these caps */
984
985 session->s_trim_caps--;
986 if (oissued) {
987 /* we aren't the only cap.. just remove us */
988 __ceph_remove_cap(cap);
989 } else {
990 /* try to drop referring dentries */
991 spin_unlock(&inode->i_lock);
992 d_prune_aliases(inode);
993 dout("trim_caps_cb %p cap %p pruned, count now %d\n",
994 inode, cap, atomic_read(&inode->i_count));
995 return 0;
996 }
997
998out:
999 spin_unlock(&inode->i_lock);
1000 return 0;
1001}
1002
1003/*
1004 * Trim session cap count down to some max number.
1005 */
1006static int trim_caps(struct ceph_mds_client *mdsc,
1007 struct ceph_mds_session *session,
1008 int max_caps)
1009{
1010 int trim_caps = session->s_nr_caps - max_caps;
1011
1012 dout("trim_caps mds%d start: %d / %d, trim %d\n",
1013 session->s_mds, session->s_nr_caps, max_caps, trim_caps);
1014 if (trim_caps > 0) {
1015 session->s_trim_caps = trim_caps;
1016 iterate_session_caps(session, trim_caps_cb, session);
1017 dout("trim_caps mds%d done: %d / %d, trimmed %d\n",
1018 session->s_mds, session->s_nr_caps, max_caps,
1019 trim_caps - session->s_trim_caps);
1020 session->s_trim_caps = 0;
1021 }
1022 return 0;
1023}
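/*
 * Editorial note: trim_caps() is driven by CEPH_SESSION_RECALL_STATE
 * (see handle_session() below) -- the MDS names a max_caps target and
 * we walk this session's caps, dropping unused ones until we get
 * under it.
 */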
1024
1025/*
1026 * Allocate cap_release messages. If there is a partially full message
1027 * in the queue, try to allocate enough to cover its remainder, so that
1028 * we can send it immediately.
1029 *
1030 * Called under s_mutex.
1031 */
1032static int add_cap_releases(struct ceph_mds_client *mdsc,
1033 struct ceph_mds_session *session,
1034 int extra)
1035{
1036 struct ceph_msg *msg;
1037 struct ceph_mds_cap_release *head;
1038 int err = -ENOMEM;
1039
1040 if (extra < 0)
1041 extra = mdsc->client->mount_args->cap_release_safety;
1042
1043 spin_lock(&session->s_cap_lock);
1044
1045 if (!list_empty(&session->s_cap_releases)) {
1046 msg = list_first_entry(&session->s_cap_releases,
1047 struct ceph_msg,
1048 list_head);
1049 head = msg->front.iov_base;
1050 extra += CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1051 }
1052
1053 while (session->s_num_cap_releases < session->s_nr_caps + extra) {
1054 spin_unlock(&session->s_cap_lock);
1055 msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
1056 0, 0, NULL);
1057 if (!msg)
1058 goto out_unlocked;
1059 dout("add_cap_releases %p msg %p now %d\n", session, msg,
1060 (int)msg->front.iov_len);
1061 head = msg->front.iov_base;
1062 head->num = cpu_to_le32(0);
1063 msg->front.iov_len = sizeof(*head);
1064 spin_lock(&session->s_cap_lock);
1065 list_add(&msg->list_head, &session->s_cap_releases);
1066 session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE;
1067 }
1068
1069 if (!list_empty(&session->s_cap_releases)) {
1070 msg = list_first_entry(&session->s_cap_releases,
1071 struct ceph_msg,
1072 list_head);
1073 head = msg->front.iov_base;
1074 if (head->num) {
1075 dout(" queueing non-full %p (%d)\n", msg,
1076 le32_to_cpu(head->num));
1077 list_move_tail(&msg->list_head,
1078 &session->s_cap_releases_done);
1079 session->s_num_cap_releases -=
1080 CEPH_CAPS_PER_RELEASE - le32_to_cpu(head->num);
1081 }
1082 }
1083 err = 0;
1084 spin_unlock(&session->s_cap_lock);
1085out_unlocked:
1086 return err;
1087}
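/*
 * Editorial worked example: with CEPH_CAPS_PER_RELEASE slots per
 * message and a partially filled message already holding num releases,
 * the loop above keeps allocating until
 *
 *	s_num_cap_releases >= s_nr_caps + extra
 *
 * where extra defaults to the cap_release_safety mount option.  The
 * partially full message is then moved to s_cap_releases_done so
 * send_cap_releases() can push it out immediately.
 */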
1088
1089/*
1090 * flush all dirty inode data to disk.
1091 *
1092 * returns true if we've flushed through want_flush_seq
1093 */
1094static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1095{
1096 int mds, ret = 1;
1097
1098 dout("check_cap_flush want %lld\n", want_flush_seq);
1099 mutex_lock(&mdsc->mutex);
1100 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) {
1101 struct ceph_mds_session *session = mdsc->sessions[mds];
1102
1103 if (!session)
1104 continue;
1105 get_session(session);
1106 mutex_unlock(&mdsc->mutex);
1107
1108 mutex_lock(&session->s_mutex);
1109 if (!list_empty(&session->s_cap_flushing)) {
1110 struct ceph_inode_info *ci =
1111 list_entry(session->s_cap_flushing.next,
1112 struct ceph_inode_info,
1113 i_flushing_item);
1114 struct inode *inode = &ci->vfs_inode;
1115
1116 spin_lock(&inode->i_lock);
1117 if (ci->i_cap_flush_seq <= want_flush_seq) {
1118 dout("check_cap_flush still flushing %p "
1119 "seq %lld <= %lld to mds%d\n", inode,
1120 ci->i_cap_flush_seq, want_flush_seq,
1121 session->s_mds);
1122 ret = 0;
1123 }
1124 spin_unlock(&inode->i_lock);
1125 }
1126 mutex_unlock(&session->s_mutex);
1127 ceph_put_mds_session(session);
1128
1129 if (!ret)
1130 return ret;
1131 mutex_lock(&mdsc->mutex);
1132 }
1133
1134 mutex_unlock(&mdsc->mutex);
1135 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1136 return ret;
1137}
1138
1139/*
1140 * called under s_mutex
1141 */
1142static void send_cap_releases(struct ceph_mds_client *mdsc,
1143 struct ceph_mds_session *session)
1144{
1145 struct ceph_msg *msg;
1146
1147 dout("send_cap_releases mds%d\n", session->s_mds);
1148 while (1) {
1149 spin_lock(&session->s_cap_lock);
1150 if (list_empty(&session->s_cap_releases_done))
1151 break;
1152 msg = list_first_entry(&session->s_cap_releases_done,
1153 struct ceph_msg, list_head);
1154 list_del_init(&msg->list_head);
1155 spin_unlock(&session->s_cap_lock);
1156 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1157 dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
1158 ceph_con_send(&session->s_con, msg);
1159 }
1160 spin_unlock(&session->s_cap_lock);
1161}
1162
1163/*
1164 * requests
1165 */
1166
1167/*
1168 * Create an mds request.
1169 */
1170struct ceph_mds_request *
1171ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
1172{
1173 struct ceph_mds_request *req = kzalloc(sizeof(*req), GFP_NOFS);
1174
1175 if (!req)
1176 return ERR_PTR(-ENOMEM);
1177
1178 req->r_started = jiffies;
1179 req->r_resend_mds = -1;
1180 INIT_LIST_HEAD(&req->r_unsafe_dir_item);
1181 req->r_fmode = -1;
1182 kref_init(&req->r_kref);
1183 INIT_LIST_HEAD(&req->r_wait);
1184 init_completion(&req->r_completion);
1185 init_completion(&req->r_safe_completion);
1186 INIT_LIST_HEAD(&req->r_unsafe_item);
1187
1188 req->r_op = op;
1189 req->r_direct_mode = mode;
1190 return req;
1191}
1192
1193/*
1194 * return the oldest (lowest tid) request in the tree, or NULL if none.
1195 *
1196 * called under mdsc->mutex.
1197 */
1198static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc)
1199{
1200 if (RB_EMPTY_ROOT(&mdsc->request_tree))
1201 return NULL;
1202 return rb_entry(rb_first(&mdsc->request_tree),
1203 struct ceph_mds_request, r_node);
1204}
1205
1206static u64 __get_oldest_tid(struct ceph_mds_client *mdsc)
1207{
1208 struct ceph_mds_request *req = __get_oldest_req(mdsc);
1209
1210 if (req)
1211 return req->r_tid;
1212 return 0;
1213}
1214
1215/*
1216 * Build a dentry's path. Allocate on heap; caller must kfree. Based
1217 * on build_path_from_dentry in fs/cifs/dir.c.
1218 *
1219 * If @stop_on_nosnap, generate path relative to the first non-snapped
1220 * inode.
1221 *
1222 * Encode hidden .snap dirs as a double /, i.e.
1223 * foo/.snap/bar -> foo//bar
1224 */
1225char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
1226 int stop_on_nosnap)
1227{
1228 struct dentry *temp;
1229 char *path;
1230 int len, pos;
1231
1232 if (dentry == NULL)
1233 return ERR_PTR(-EINVAL);
1234
1235retry:
1236 len = 0;
1237 for (temp = dentry; !IS_ROOT(temp);) {
1238 struct inode *inode = temp->d_inode;
1239 if (inode && ceph_snap(inode) == CEPH_SNAPDIR)
1240 len++; /* slash only */
1241 else if (stop_on_nosnap && inode &&
1242 ceph_snap(inode) == CEPH_NOSNAP)
1243 break;
1244 else
1245 len += 1 + temp->d_name.len;
1246 temp = temp->d_parent;
1247 if (temp == NULL) {
1248 pr_err("build_path_dentry corrupt dentry %p\n", dentry);
1249 return ERR_PTR(-EINVAL);
1250 }
1251 }
1252 if (len)
1253 len--; /* no leading '/' */
1254
1255 path = kmalloc(len+1, GFP_NOFS);
1256 if (path == NULL)
1257 return ERR_PTR(-ENOMEM);
1258 pos = len;
1259 path[pos] = 0; /* trailing null */
1260 for (temp = dentry; !IS_ROOT(temp) && pos != 0; ) {
1261 struct inode *inode = temp->d_inode;
1262
1263 if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
1264 dout("build_path_dentry path+%d: %p SNAPDIR\n",
1265 pos, temp);
1266 } else if (stop_on_nosnap && inode &&
1267 ceph_snap(inode) == CEPH_NOSNAP) {
1268 break;
1269 } else {
1270 pos -= temp->d_name.len;
1271 if (pos < 0)
1272 break;
1273 strncpy(path + pos, temp->d_name.name,
1274 temp->d_name.len);
1275 dout("build_path_dentry path+%d: %p '%.*s'\n",
1276 pos, temp, temp->d_name.len, path + pos);
1277 }
1278 if (pos)
1279 path[--pos] = '/';
1280 temp = temp->d_parent;
1281 if (temp == NULL) {
1282 pr_err("build_path_dentry corrupt dentry\n");
1283 kfree(path);
1284 return ERR_PTR(-EINVAL);
1285 }
1286 }
1287 if (pos != 0) {
1288 pr_err("build_path_dentry did not end path lookup where "
1289 "expected, namelen is %d, pos is %d\n", len, pos);
1290 /* presumably this is only possible if racing with a
1291 rename of one of the parent directories (we cannot
1292 lock the dentries above us to prevent this, but
1293 retrying should be harmless) */
1294 kfree(path);
1295 goto retry;
1296 }
1297
1298 *base = ceph_ino(temp->d_inode);
1299 *plen = len;
1300 dout("build_path_dentry on %p %d built %llx '%.*s'\n",
1301 dentry, atomic_read(&dentry->d_count), *base, len, path);
1302 return path;
1303}
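/*
 * Editorial worked example: for a dentry at foo/.snap/bar,
 * ceph_mdsc_build_path() with stop_on_nosnap == 0 yields "foo//bar"
 * (the .snap component contributes only a slash).  With
 * stop_on_nosnap == 1 the walk stops at the first unsnapped ancestor
 * and *base is set to that inode's number, so the path is relative.
 */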
1304
1305static int build_dentry_path(struct dentry *dentry,
1306 const char **ppath, int *ppathlen, u64 *pino,
1307 int *pfreepath)
1308{
1309 char *path;
1310
1311 if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP) {
1312 *pino = ceph_ino(dentry->d_parent->d_inode);
1313 *ppath = dentry->d_name.name;
1314 *ppathlen = dentry->d_name.len;
1315 return 0;
1316 }
1317 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1318 if (IS_ERR(path))
1319 return PTR_ERR(path);
1320 *ppath = path;
1321 *pfreepath = 1;
1322 return 0;
1323}
1324
1325static int build_inode_path(struct inode *inode,
1326 const char **ppath, int *ppathlen, u64 *pino,
1327 int *pfreepath)
1328{
1329 struct dentry *dentry;
1330 char *path;
1331
1332 if (ceph_snap(inode) == CEPH_NOSNAP) {
1333 *pino = ceph_ino(inode);
1334 *ppathlen = 0;
1335 return 0;
1336 }
1337 dentry = d_find_alias(inode);
1338 path = ceph_mdsc_build_path(dentry, ppathlen, pino, 1);
1339 dput(dentry);
1340 if (IS_ERR(path))
1341 return PTR_ERR(path);
1342 *ppath = path;
1343 *pfreepath = 1;
1344 return 0;
1345}
1346
1347/*
1348 * request arguments may be specified via an inode *, a dentry *, or
1349 * an explicit ino+path.
1350 */
1351static int set_request_path_attr(struct inode *rinode, struct dentry *rdentry,
1352 const char *rpath, u64 rino,
1353 const char **ppath, int *pathlen,
1354 u64 *ino, int *freepath)
1355{
1356 int r = 0;
1357
1358 if (rinode) {
1359 r = build_inode_path(rinode, ppath, pathlen, ino, freepath);
1360 dout(" inode %p %llx.%llx\n", rinode, ceph_ino(rinode),
1361 ceph_snap(rinode));
1362 } else if (rdentry) {
1363 r = build_dentry_path(rdentry, ppath, pathlen, ino, freepath);
1364 dout(" dentry %p %llx/%.*s\n", rdentry, *ino, *pathlen,
1365 *ppath);
1366 } else if (rpath) {
1367 *ino = rino;
1368 *ppath = rpath;
1369 *pathlen = strlen(rpath);
1370 dout(" path %.*s\n", *pathlen, rpath);
1371 }
1372
1373 return r;
1374}
1375
1376/*
1377 * called under mdsc->mutex
1378 */
1379static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1380 struct ceph_mds_request *req,
1381 int mds)
1382{
1383 struct ceph_msg *msg;
1384 struct ceph_mds_request_head *head;
1385 const char *path1 = NULL;
1386 const char *path2 = NULL;
1387 u64 ino1 = 0, ino2 = 0;
1388 int pathlen1 = 0, pathlen2 = 0;
1389 int freepath1 = 0, freepath2 = 0;
1390 int len;
1391 u16 releases;
1392 void *p, *end;
1393 int ret;
1394
1395 ret = set_request_path_attr(req->r_inode, req->r_dentry,
1396 req->r_path1, req->r_ino1.ino,
1397 &path1, &pathlen1, &ino1, &freepath1);
1398 if (ret < 0) {
1399 msg = ERR_PTR(ret);
1400 goto out;
1401 }
1402
1403 ret = set_request_path_attr(NULL, req->r_old_dentry,
1404 req->r_path2, req->r_ino2.ino,
1405 &path2, &pathlen2, &ino2, &freepath2);
1406 if (ret < 0) {
1407 msg = ERR_PTR(ret);
1408 goto out_free1;
1409 }
1410
1411 len = sizeof(*head) +
1412 pathlen1 + pathlen2 + 2*(1 + sizeof(u32) + sizeof(u64));
1413
1414 /* calculate (max) length for cap releases */
1415 len += sizeof(struct ceph_mds_request_release) *
1416 (!!req->r_inode_drop + !!req->r_dentry_drop +
1417 !!req->r_old_inode_drop + !!req->r_old_dentry_drop);
1418 if (req->r_dentry_drop)
1419 len += req->r_dentry->d_name.len;
1420 if (req->r_old_dentry_drop)
1421 len += req->r_old_dentry->d_name.len;
1422
1423 msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
1424 if (IS_ERR(msg))
1425 goto out_free2;
1426
1427 msg->hdr.tid = cpu_to_le64(req->r_tid);
1428
1429 head = msg->front.iov_base;
1430 p = msg->front.iov_base + sizeof(*head);
1431 end = msg->front.iov_base + msg->front.iov_len;
1432
1433 head->mdsmap_epoch = cpu_to_le32(mdsc->mdsmap->m_epoch);
1434 head->op = cpu_to_le32(req->r_op);
1435 head->caller_uid = cpu_to_le32(current_fsuid());
1436 head->caller_gid = cpu_to_le32(current_fsgid());
1437 head->args = req->r_args;
1438
1439 ceph_encode_filepath(&p, end, ino1, path1);
1440 ceph_encode_filepath(&p, end, ino2, path2);
1441
1442 /* cap releases */
1443 releases = 0;
1444 if (req->r_inode_drop)
1445 releases += ceph_encode_inode_release(&p,
1446 req->r_inode ? req->r_inode : req->r_dentry->d_inode,
1447 mds, req->r_inode_drop, req->r_inode_unless, 0);
1448 if (req->r_dentry_drop)
1449 releases += ceph_encode_dentry_release(&p, req->r_dentry,
1450 mds, req->r_dentry_drop, req->r_dentry_unless);
1451 if (req->r_old_dentry_drop)
1452 releases += ceph_encode_dentry_release(&p, req->r_old_dentry,
1453 mds, req->r_old_dentry_drop, req->r_old_dentry_unless);
1454 if (req->r_old_inode_drop)
1455 releases += ceph_encode_inode_release(&p,
1456 req->r_old_dentry->d_inode,
1457 mds, req->r_old_inode_drop, req->r_old_inode_unless, 0);
1458 head->num_releases = cpu_to_le16(releases);
1459
1460 BUG_ON(p > end);
1461 msg->front.iov_len = p - msg->front.iov_base;
1462 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
1463
1464 msg->pages = req->r_pages;
1465 msg->nr_pages = req->r_num_pages;
1466 msg->hdr.data_len = cpu_to_le32(req->r_data_len);
1467 msg->hdr.data_off = cpu_to_le16(0);
1468
1469out_free2:
1470 if (freepath2)
1471 kfree((char *)path2);
1472out_free1:
1473 if (freepath1)
1474 kfree((char *)path1);
1475out:
1476 return msg;
1477}
1478
1479/*
1480 * called under mdsc->mutex if error, under no mutex if
1481 * success.
1482 */
1483static void complete_request(struct ceph_mds_client *mdsc,
1484 struct ceph_mds_request *req)
1485{
1486 if (req->r_callback)
1487 req->r_callback(mdsc, req);
1488 else
1489 complete(&req->r_completion);
1490}
1491
1492/*
1493 * called under mdsc->mutex
1494 */
1495static int __prepare_send_request(struct ceph_mds_client *mdsc,
1496 struct ceph_mds_request *req,
1497 int mds)
1498{
1499 struct ceph_mds_request_head *rhead;
1500 struct ceph_msg *msg;
1501 int flags = 0;
1502
1503 req->r_mds = mds;
1504 req->r_attempts++;
1505 dout("prepare_send_request %p tid %lld %s (attempt %d)\n", req,
1506 req->r_tid, ceph_mds_op_name(req->r_op), req->r_attempts);
1507
1508 if (req->r_request) {
1509 ceph_msg_put(req->r_request);
1510 req->r_request = NULL;
1511 }
1512 msg = create_request_message(mdsc, req, mds);
1513 if (IS_ERR(msg)) {
1514 req->r_reply = ERR_PTR(PTR_ERR(msg));
1515 complete_request(mdsc, req);
1516 return -PTR_ERR(msg);
1517 }
1518 req->r_request = msg;
1519
1520 rhead = msg->front.iov_base;
1521 rhead->oldest_client_tid = cpu_to_le64(__get_oldest_tid(mdsc));
1522 if (req->r_got_unsafe)
1523 flags |= CEPH_MDS_FLAG_REPLAY;
1524 if (req->r_locked_dir)
1525 flags |= CEPH_MDS_FLAG_WANT_DENTRY;
1526 rhead->flags = cpu_to_le32(flags);
1527 rhead->num_fwd = req->r_num_fwd;
1528 rhead->num_retry = req->r_attempts - 1;
1529
1530 dout(" r_locked_dir = %p\n", req->r_locked_dir);
1531
1532 if (req->r_target_inode && req->r_got_unsafe)
1533 rhead->ino = cpu_to_le64(ceph_ino(req->r_target_inode));
1534 else
1535 rhead->ino = 0;
1536 return 0;
1537}
1538
1539/*
1540 * send request, or put it on the appropriate wait list.
1541 */
1542static int __do_request(struct ceph_mds_client *mdsc,
1543 struct ceph_mds_request *req)
1544{
1545 struct ceph_mds_session *session = NULL;
1546 int mds = -1;
1547 int err = -EAGAIN;
1548
1549 if (req->r_reply)
1550 goto out;
1551
1552 if (req->r_timeout &&
1553 time_after_eq(jiffies, req->r_started + req->r_timeout)) {
1554 dout("do_request timed out\n");
1555 err = -EIO;
1556 goto finish;
1557 }
1558
1559 mds = __choose_mds(mdsc, req);
1560 if (mds < 0 ||
1561 ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
1562 dout("do_request no mds or not active, waiting for map\n");
1563 list_add(&req->r_wait, &mdsc->waiting_for_map);
1564 goto out;
1565 }
1566
1567 /* get, open session */
1568 session = __ceph_lookup_mds_session(mdsc, mds);
1569 if (!session)
1570 session = register_session(mdsc, mds);
1571 dout("do_request mds%d session %p state %s\n", mds, session,
1572 session_state_name(session->s_state));
1573 if (session->s_state != CEPH_MDS_SESSION_OPEN &&
1574 session->s_state != CEPH_MDS_SESSION_HUNG) {
1575 if (session->s_state == CEPH_MDS_SESSION_NEW ||
1576 session->s_state == CEPH_MDS_SESSION_CLOSING)
1577 __open_session(mdsc, session);
1578 list_add(&req->r_wait, &session->s_waiting);
1579 goto out_session;
1580 }
1581
1582 /* send request */
1583 req->r_session = get_session(session);
1584 req->r_resend_mds = -1; /* forget any previous mds hint */
1585
1586 if (req->r_request_started == 0) /* note request start time */
1587 req->r_request_started = jiffies;
1588
1589 err = __prepare_send_request(mdsc, req, mds);
1590 if (!err) {
1591 ceph_msg_get(req->r_request);
1592 ceph_con_send(&session->s_con, req->r_request);
1593 }
1594
1595out_session:
1596 ceph_put_mds_session(session);
1597out:
1598 return err;
1599
1600finish:
1601 req->r_reply = ERR_PTR(err);
1602 complete_request(mdsc, req);
1603 goto out;
1604}
1605
1606/*
1607 * called under mdsc->mutex
1608 */
1609static void __wake_requests(struct ceph_mds_client *mdsc,
1610 struct list_head *head)
1611{
1612 struct ceph_mds_request *req, *nreq;
1613
1614 list_for_each_entry_safe(req, nreq, head, r_wait) {
1615 list_del_init(&req->r_wait);
1616 __do_request(mdsc, req);
1617 }
1618}
1619
1620/*
1621 * Wake up threads with requests pending for @mds, so that they can
1622 * resubmit their requests to a possibly different mds. If @all is set,
1623 * wake up if their requests have been forwarded to @mds, too.
1624 */
1625static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
1626{
1627 struct ceph_mds_request *req;
1628 struct rb_node *p;
1629
1630 dout("kick_requests mds%d\n", mds);
1631 for (p = rb_first(&mdsc->request_tree); p; p = rb_next(p)) {
1632 req = rb_entry(p, struct ceph_mds_request, r_node);
1633 if (req->r_got_unsafe)
1634 continue;
1635 if (req->r_session &&
1636 req->r_session->s_mds == mds) {
1637 dout(" kicking tid %llu\n", req->r_tid);
1638 put_request_session(req);
1639 __do_request(mdsc, req);
1640 }
1641 }
1642}
1643
1644void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
1645 struct ceph_mds_request *req)
1646{
1647 dout("submit_request on %p\n", req);
1648 mutex_lock(&mdsc->mutex);
1649 __register_request(mdsc, req, NULL);
1650 __do_request(mdsc, req);
1651 mutex_unlock(&mdsc->mutex);
1652}
1653
1654/*
1655 * Synchronously perform an mds request.  Take care of all of the
1656 * session setup, forwarding, and retry details.
1657 */
1658int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1659 struct inode *dir,
1660 struct ceph_mds_request *req)
1661{
1662 int err;
1663
1664 dout("do_request on %p\n", req);
1665
1666 /* take CAP_PIN refs for r_inode, r_locked_dir, r_old_dentry */
1667 if (req->r_inode)
1668 ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN);
1669 if (req->r_locked_dir)
1670 ceph_get_cap_refs(ceph_inode(req->r_locked_dir), CEPH_CAP_PIN);
1671 if (req->r_old_dentry)
1672 ceph_get_cap_refs(
1673 ceph_inode(req->r_old_dentry->d_parent->d_inode),
1674 CEPH_CAP_PIN);
1675
1676 /* issue */
1677 mutex_lock(&mdsc->mutex);
1678 __register_request(mdsc, req, dir);
1679 __do_request(mdsc, req);
1680
1681 /* wait */
1682 if (!req->r_reply) {
1683 mutex_unlock(&mdsc->mutex);
1684 if (req->r_timeout) {
1685 err = (long)wait_for_completion_interruptible_timeout(
1686 &req->r_completion, req->r_timeout);
1687 if (err == 0)
1688 req->r_reply = ERR_PTR(-EIO);
1689 else if (err < 0)
1690 req->r_reply = ERR_PTR(err);
1691 } else {
1692 err = wait_for_completion_interruptible(
1693 &req->r_completion);
1694 if (err)
1695 req->r_reply = ERR_PTR(err);
1696 }
1697 mutex_lock(&mdsc->mutex);
1698 }
1699
1700 if (IS_ERR(req->r_reply)) {
1701 err = PTR_ERR(req->r_reply);
1702 req->r_reply = NULL;
1703
1704 if (err == -ERESTARTSYS) {
1705 /* aborted */
1706 req->r_aborted = true;
1707
1708 if (req->r_locked_dir &&
1709 (req->r_op & CEPH_MDS_OP_WRITE)) {
1710 struct ceph_inode_info *ci =
1711 ceph_inode(req->r_locked_dir);
1712
1713 dout("aborted, clearing I_COMPLETE on %p\n",
1714 req->r_locked_dir);
1715 spin_lock(&req->r_locked_dir->i_lock);
1716 ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
1717 ci->i_release_count++;
1718 spin_unlock(&req->r_locked_dir->i_lock);
1719 }
1720 } else {
1721 /* clean up this request */
1722 __unregister_request(mdsc, req);
1723 if (!list_empty(&req->r_unsafe_item))
1724 list_del_init(&req->r_unsafe_item);
1725 complete(&req->r_safe_completion);
1726 }
1727 } else if (req->r_err) {
1728 err = req->r_err;
1729 } else {
1730 err = le32_to_cpu(req->r_reply_info.head->result);
1731 }
1732 mutex_unlock(&mdsc->mutex);
1733
1734 dout("do_request %p done, result %d\n", req, err);
1735 return err;
1736}
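/*
 * Editorial usage sketch (not part of the original patch), modeled on
 * callers elsewhere in fs/ceph: create a request, fill in arguments,
 * perform it synchronously, and drop our reference.  Assumes the
 * CEPH_MDS_OP_GETATTR op and the getattr args field from ceph_fs.h.
 */
#if 0
static int example_do_getattr(struct ceph_mds_client *mdsc,
			      struct inode *inode, int mask)
{
	struct ceph_mds_request *req;
	int err;

	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR,
				       USE_ANY_MDS);
	if (IS_ERR(req))
		return PTR_ERR(req);
	req->r_inode = igrab(inode);	/* do_request takes a CAP_PIN ref */
	req->r_num_caps = 1;
	req->r_args.getattr.mask = cpu_to_le32(mask);
	err = ceph_mdsc_do_request(mdsc, NULL, req);
	ceph_mdsc_put_request(req);	/* drops our kref */
	return err;
}
#endif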
1737
1738/*
1739 * Handle mds reply.
1740 *
1741 * We take the session mutex and parse and process the reply immediately.
1742 * This preserves the logical ordering of replies, capabilities, etc., sent
1743 * by the MDS as they are applied to our local cache.
1744 */
1745static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
1746{
1747 struct ceph_mds_client *mdsc = session->s_mdsc;
1748 struct ceph_mds_request *req;
1749 struct ceph_mds_reply_head *head = msg->front.iov_base;
1750 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
1751 u64 tid;
1752 int err, result;
1753 int mds = session->s_mds;
1754
1755 if (msg->front.iov_len < sizeof(*head)) {
1756 pr_err("mdsc_handle_reply got corrupt (short) reply\n");
1757 ceph_msg_dump(msg);
1758 return;
1759 }
1760
1761 /* get request, session */
1762 tid = le64_to_cpu(msg->hdr.tid);
1763 mutex_lock(&mdsc->mutex);
1764 req = __lookup_request(mdsc, tid);
1765 if (!req) {
1766 dout("handle_reply on unknown tid %llu\n", tid);
1767 mutex_unlock(&mdsc->mutex);
1768 return;
1769 }
1770 dout("handle_reply %p\n", req);
1771
1772 /* correct session? */
1773 if (req->r_session != session) {
1774 pr_err("mdsc_handle_reply got %llu on session mds%d"
1775 " not mds%d\n", tid, session->s_mds,
1776 req->r_session ? req->r_session->s_mds : -1);
1777 mutex_unlock(&mdsc->mutex);
1778 goto out;
1779 }
1780
1781 /* dup? */
1782 if ((req->r_got_unsafe && !head->safe) ||
1783 (req->r_got_safe && head->safe)) {
1784 pr_warning("got a dup %s reply on %llu from mds%d\n",
1785 head->safe ? "safe" : "unsafe", tid, mds);
1786 mutex_unlock(&mdsc->mutex);
1787 goto out;
1788 }
1789
1790 result = le32_to_cpu(head->result);
1791
1792 /*
1793 * Tolerate 2 consecutive ESTALEs from the same mds.
1794 * FIXME: we should be looking at the cap migrate_seq.
1795 */
1796 if (result == -ESTALE) {
1797 req->r_direct_mode = USE_AUTH_MDS;
1798 req->r_num_stale++;
1799 if (req->r_num_stale <= 2) {
1800 __do_request(mdsc, req);
1801 mutex_unlock(&mdsc->mutex);
1802 goto out;
1803 }
1804 } else {
1805 req->r_num_stale = 0;
1806 }
1807
1808 if (head->safe) {
1809 req->r_got_safe = true;
1810 __unregister_request(mdsc, req);
1811 complete(&req->r_safe_completion);
1812
1813 if (req->r_got_unsafe) {
1814 /*
1815 * We already handled the unsafe response, now do the
1816 * cleanup. No need to examine the response; the MDS
1817 * doesn't include any result info in the safe
1818 * response. And even if it did, there is nothing
1819 * useful we could do with a revised return value.
1820 */
1821 dout("got safe reply %llu, mds%d\n", tid, mds);
1822 list_del_init(&req->r_unsafe_item);
1823
1824 /* last unsafe request during umount? */
1825 if (mdsc->stopping && !__get_oldest_req(mdsc))
1826 complete(&mdsc->safe_umount_waiters);
1827 mutex_unlock(&mdsc->mutex);
1828 goto out;
1829 }
1830 }
1831
1832 BUG_ON(req->r_reply);
1833
1834 if (!head->safe) {
1835 req->r_got_unsafe = true;
1836 list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
1837 }
1838
1839 dout("handle_reply tid %lld result %d\n", tid, result);
1840 rinfo = &req->r_reply_info;
1841 err = parse_reply_info(msg, rinfo);
1842 mutex_unlock(&mdsc->mutex);
1843
1844 mutex_lock(&session->s_mutex);
1845 if (err < 0) {
1846 pr_err("mdsc_handle_reply got corrupt reply mds%d\n", mds);
1847 ceph_msg_dump(msg);
1848 goto out_err;
1849 }
1850
1851 /* snap trace */
1852 if (rinfo->snapblob_len) {
1853 down_write(&mdsc->snap_rwsem);
1854 ceph_update_snap_trace(mdsc, rinfo->snapblob,
1855 rinfo->snapblob + rinfo->snapblob_len,
1856 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP);
1857 downgrade_write(&mdsc->snap_rwsem);
1858 } else {
1859 down_read(&mdsc->snap_rwsem);
1860 }
1861
1862 /* insert trace into our cache */
1863 err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
1864 if (err == 0) {
1865 if (result == 0 && rinfo->dir_nr)
1866 ceph_readdir_prepopulate(req, req->r_session);
1867 ceph_unreserve_caps(&req->r_caps_reservation);
1868 }
1869
1870 up_read(&mdsc->snap_rwsem);
1871out_err:
1872 if (err) {
1873 req->r_err = err;
1874 } else {
1875 req->r_reply = msg;
1876 ceph_msg_get(msg);
1877 }
1878
1879 add_cap_releases(mdsc, req->r_session, -1);
1880 mutex_unlock(&session->s_mutex);
1881
1882 /* kick calling process */
1883 complete_request(mdsc, req);
1884out:
1885 ceph_mdsc_put_request(req);
1886 return;
1887}
1888
1889
1890
1891/*
1892 * handle mds notification that our request has been forwarded.
1893 */
1894static void handle_forward(struct ceph_mds_client *mdsc,
1895 struct ceph_mds_session *session,
1896 struct ceph_msg *msg)
1897{
1898 struct ceph_mds_request *req;
1899 u64 tid = le64_to_cpu(msg->hdr.tid);
1900 u32 next_mds;
1901 u32 fwd_seq;
1902 int err = -EINVAL;
1903 void *p = msg->front.iov_base;
1904 void *end = p + msg->front.iov_len;
1905
1906 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1907 next_mds = ceph_decode_32(&p);
1908 fwd_seq = ceph_decode_32(&p);
1909
1910 mutex_lock(&mdsc->mutex);
1911 req = __lookup_request(mdsc, tid);
1912 if (!req) {
1913 dout("forward %llu to mds%d - req dne\n", tid, next_mds);
1914 goto out; /* dup reply? */
1915 }
1916
1917 if (fwd_seq <= req->r_num_fwd) {
1918 dout("forward %llu to mds%d - old seq %d <= %d\n",
1919 tid, next_mds, req->r_num_fwd, fwd_seq);
1920 } else {
1921 /* resend. forward race not possible; mds would drop */
1922 dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
1923 req->r_num_fwd = fwd_seq;
1924 req->r_resend_mds = next_mds;
1925 put_request_session(req);
1926 __do_request(mdsc, req);
1927 }
1928 ceph_mdsc_put_request(req);
1929out:
1930 mutex_unlock(&mdsc->mutex);
1931 return;
1932
1933bad:
1934 pr_err("mdsc_handle_forward decode error err=%d\n", err);
1935}
1936
1937/*
1938 * handle a mds session control message
1939 */
1940static void handle_session(struct ceph_mds_session *session,
1941 struct ceph_msg *msg)
1942{
1943 struct ceph_mds_client *mdsc = session->s_mdsc;
1944 u32 op;
1945 u64 seq;
1946 int mds = session->s_mds;
1947 struct ceph_mds_session_head *h = msg->front.iov_base;
1948 int wake = 0;
1949
1950 /* decode */
1951 if (msg->front.iov_len != sizeof(*h))
1952 goto bad;
1953 op = le32_to_cpu(h->op);
1954 seq = le64_to_cpu(h->seq);
1955
1956 mutex_lock(&mdsc->mutex);
1957 if (op == CEPH_SESSION_CLOSE)
1958 __unregister_session(mdsc, session);
1959 /* FIXME: this ttl calculation is generous */
1960 session->s_ttl = jiffies + HZ*mdsc->mdsmap->m_session_autoclose;
1961 mutex_unlock(&mdsc->mutex);
1962
1963 mutex_lock(&session->s_mutex);
1964
1965 dout("handle_session mds%d %s %p state %s seq %llu\n",
1966 mds, ceph_session_op_name(op), session,
1967 session_state_name(session->s_state), seq);
1968
1969 if (session->s_state == CEPH_MDS_SESSION_HUNG) {
1970 session->s_state = CEPH_MDS_SESSION_OPEN;
1971 pr_info("mds%d came back\n", session->s_mds);
1972 }
1973
1974 switch (op) {
1975 case CEPH_SESSION_OPEN:
1976 session->s_state = CEPH_MDS_SESSION_OPEN;
1977 renewed_caps(mdsc, session, 0);
1978 wake = 1;
1979 if (mdsc->stopping)
1980 __close_session(mdsc, session);
1981 break;
1982
1983 case CEPH_SESSION_RENEWCAPS:
1984 if (session->s_renew_seq == seq)
1985 renewed_caps(mdsc, session, 1);
1986 break;
1987
1988 case CEPH_SESSION_CLOSE:
1989 remove_session_caps(session);
1990 wake = 1; /* for good measure */
1991 complete(&mdsc->session_close_waiters);
1992 kick_requests(mdsc, mds, 0); /* cur only */
1993 break;
1994
1995 case CEPH_SESSION_STALE:
1996 pr_info("mds%d caps went stale, renewing\n",
1997 session->s_mds);
1998 spin_lock(&session->s_cap_lock);
1999 session->s_cap_gen++;
2000 session->s_cap_ttl = 0;
2001 spin_unlock(&session->s_cap_lock);
2002 send_renew_caps(mdsc, session);
2003 break;
2004
2005 case CEPH_SESSION_RECALL_STATE:
2006 trim_caps(mdsc, session, le32_to_cpu(h->max_caps));
2007 break;
2008
2009 default:
2010 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2011 WARN_ON(1);
2012 }
2013
2014 mutex_unlock(&session->s_mutex);
2015 if (wake) {
2016 mutex_lock(&mdsc->mutex);
2017 __wake_requests(mdsc, &session->s_waiting);
2018 mutex_unlock(&mdsc->mutex);
2019 }
2020 return;
2021
2022bad:
2023 pr_err("mdsc_handle_session corrupt message mds%d len %d\n", mds,
2024 (int)msg->front.iov_len);
2025 ceph_msg_dump(msg);
2026 return;
2027}
2028
2029
2030/*
2031 * called under session->mutex.
2032 */
2033static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2034 struct ceph_mds_session *session)
2035{
2036 struct ceph_mds_request *req, *nreq;
2037 int err;
2038
2039 dout("replay_unsafe_requests mds%d\n", session->s_mds);
2040
2041 mutex_lock(&mdsc->mutex);
2042 list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
2043 err = __prepare_send_request(mdsc, req, session->s_mds);
2044 if (!err) {
2045 ceph_msg_get(req->r_request);
2046 ceph_con_send(&session->s_con, req->r_request);
2047 }
2048 }
2049 mutex_unlock(&mdsc->mutex);
2050}
2051
2052/*
2053 * Encode information about a cap for a reconnect with the MDS.
2054 */
2055static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2056 void *arg)
2057{
2058 struct ceph_mds_cap_reconnect rec;
2059 struct ceph_inode_info *ci;
2060 struct ceph_pagelist *pagelist = arg;
2061 char *path;
2062 int pathlen, err;
2063 u64 pathbase;
2064 struct dentry *dentry;
2065
2066 ci = cap->ci;
2067
2068 dout(" adding %p ino %llx.%llx cap %p %lld %s\n",
2069 inode, ceph_vinop(inode), cap, cap->cap_id,
2070 ceph_cap_string(cap->issued));
2071 err = ceph_pagelist_encode_64(pagelist, ceph_ino(inode));
2072 if (err)
2073 return err;
2074
2075 dentry = d_find_alias(inode);
2076 if (dentry) {
2077 path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, 0);
2078 if (IS_ERR(path)) {
2079 err = PTR_ERR(path);
2080 BUG_ON(err);
2081 }
2082 } else {
2083 path = NULL;
2084 pathlen = 0;
2085 }
2086 err = ceph_pagelist_encode_string(pagelist, path, pathlen);
2087 if (err)
2088 goto out;
2089
2090 spin_lock(&inode->i_lock);
2091 cap->seq = 0; /* reset cap seq */
2092 cap->issue_seq = 0; /* and issue_seq */
2093 rec.cap_id = cpu_to_le64(cap->cap_id);
2094 rec.pathbase = cpu_to_le64(pathbase);
2095 rec.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
2096 rec.issued = cpu_to_le32(cap->issued);
2097 rec.size = cpu_to_le64(inode->i_size);
2098 ceph_encode_timespec(&rec.mtime, &inode->i_mtime);
2099 ceph_encode_timespec(&rec.atime, &inode->i_atime);
2100 rec.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
2101 spin_unlock(&inode->i_lock);
2102
2103 err = ceph_pagelist_append(pagelist, &rec, sizeof(rec));
2104
2105out:
2106 kfree(path);
2107 dput(dentry);
2108 return err;
2109}
2110
2111
2112/*
2113 * If an MDS fails and recovers, clients need to reconnect in order to
2114 * reestablish shared state. This includes all caps issued through
2115 * this session _and_ the snap_realm hierarchy. Because it's not
2116 * clear which snap realms the mds cares about, we send everything we
2117 * know about.. that ensures we'll then get any new info the
2118 * recovering MDS might have.
2119 *
2120 * This is a relatively heavyweight operation, but it's rare.
2121 *
2122 * called with mdsc->mutex held.
2123 */
2124static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
2125{
2126 struct ceph_mds_session *session = NULL;
2127 struct ceph_msg *reply;
2128 struct rb_node *p;
2129 int err;
2130 struct ceph_pagelist *pagelist;
2131
2132 pr_info("reconnect to recovering mds%d\n", mds);
2133
2134 pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
2135 if (!pagelist)
2136 goto fail_nopagelist;
2137 ceph_pagelist_init(pagelist);
2138
2139 reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
2140 if (IS_ERR(reply)) {
2141 err = PTR_ERR(reply);
2142 goto fail_nomsg;
2143 }
2144
2145 /* find session */
2146 session = __ceph_lookup_mds_session(mdsc, mds);
2147 mutex_unlock(&mdsc->mutex); /* drop lock for duration */
2148
2149 if (session) {
2150 mutex_lock(&session->s_mutex);
2151
2152 session->s_state = CEPH_MDS_SESSION_RECONNECTING;
2153 session->s_seq = 0;
2154
2155 ceph_con_open(&session->s_con,
2156 ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
2157
2158 /* replay unsafe requests */
2159 replay_unsafe_requests(mdsc, session);
2160 } else {
2161 dout("no session for mds%d, will send short reconnect\n",
2162 mds);
2163 }
2164
2165 down_read(&mdsc->snap_rwsem);
2166
2167 if (!session)
2168 goto send;
2169 dout("session %p state %s\n", session,
2170 session_state_name(session->s_state));
2171
2172 /* traverse this session's caps */
2173 err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
2174 if (err)
2175 goto fail;
2176 err = iterate_session_caps(session, encode_caps_cb, pagelist);
2177 if (err < 0)
2178 goto out;
2179
2180 /*
2181 * snaprealms. we provide mds with the ino, seq (version), and
2182 * parent for all of our realms. If the mds has any newer info,
2183 * it will tell us.
2184 */
2185 for (p = rb_first(&mdsc->snap_realms); p; p = rb_next(p)) {
2186 struct ceph_snap_realm *realm =
2187 rb_entry(p, struct ceph_snap_realm, node);
2188 struct ceph_mds_snaprealm_reconnect sr_rec;
2189
2190 dout(" adding snap realm %llx seq %lld parent %llx\n",
2191 realm->ino, realm->seq, realm->parent_ino);
2192 sr_rec.ino = cpu_to_le64(realm->ino);
2193 sr_rec.seq = cpu_to_le64(realm->seq);
2194 sr_rec.parent = cpu_to_le64(realm->parent_ino);
2195 err = ceph_pagelist_append(pagelist, &sr_rec, sizeof(sr_rec));
2196 if (err)
2197 goto fail;
2198 }
2199
2200send:
2201 reply->pagelist = pagelist;
2202 reply->hdr.data_len = cpu_to_le32(pagelist->length);
2203 reply->nr_pages = calc_pages_for(0, pagelist->length);
2204 if (session) {
2205 ceph_con_send(&session->s_con, reply);
2206 session->s_state = CEPH_MDS_SESSION_OPEN;
2207 __wake_requests(mdsc, &session->s_waiting);
2208 } else
2209 ceph_send_msg_mds(mdsc, reply, mds); /* short reconnect, no session */
2210
2211out:
2212 up_read(&mdsc->snap_rwsem);
2213 if (session) {
2214 mutex_unlock(&session->s_mutex);
2215 ceph_put_mds_session(session);
2216 }
2217 mutex_lock(&mdsc->mutex);
2218 return;
2219
2220fail:
2221 ceph_msg_put(reply);
2222fail_nomsg:
2223 ceph_pagelist_release(pagelist);
2224 kfree(pagelist);
2225fail_nopagelist:
2226 pr_err("ENOMEM preparing reconnect for mds%d\n", mds);
2227 goto out;
2228}
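
For reference, the reconnect payload assembled above is a flat little-endian
byte stream on the pagelist: a u32 cap count, then one (ino, path string,
cap record) tuple per cap from encode_caps_cb(), then one fixed record per
snap realm. A minimal sketch of that framing in plain C, assuming a
little-endian host and simplified stand-in structs (the real
ceph_mds_cap_reconnect lives in ceph_fs.h and carries more fields):

#include <stdint.h>
#include <string.h>

/* simplified stand-ins; these field sets are illustrative assumptions */
struct cap_rec   { uint64_t cap_id, pathbase, size; uint32_t wanted, issued; };
struct realm_rec { uint64_t ino, seq, parent; };

/* one cap entry, framed as encode_caps_cb() frames it:
 * u64 ino, u32 path length, path bytes (no NUL), fixed record */
static size_t emit_cap(uint8_t *buf, uint64_t ino, const char *path,
		       const struct cap_rec *rec)
{
	uint32_t pathlen = path ? (uint32_t)strlen(path) : 0;
	size_t off = 0;

	memcpy(buf + off, &ino, sizeof(ino));         off += sizeof(ino);
	memcpy(buf + off, &pathlen, sizeof(pathlen)); off += sizeof(pathlen);
	if (pathlen)
		memcpy(buf + off, path, pathlen);
	off += pathlen;
	memcpy(buf + off, rec, sizeof(*rec));         off += sizeof(*rec);
	return off;
}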
2229
2230
2231/*
2232 * compare old and new mdsmaps, kicking requests
2233 * and closing out old connections as necessary
2234 *
2235 * called under mdsc->mutex.
2236 */
2237static void check_new_map(struct ceph_mds_client *mdsc,
2238 struct ceph_mdsmap *newmap,
2239 struct ceph_mdsmap *oldmap)
2240{
2241 int i;
2242 int oldstate, newstate;
2243 struct ceph_mds_session *s;
2244
2245 dout("check_new_map new %u old %u\n",
2246 newmap->m_epoch, oldmap->m_epoch);
2247
2248 for (i = 0; i < oldmap->m_max_mds && i < mdsc->max_sessions; i++) {
2249 if (mdsc->sessions[i] == NULL)
2250 continue;
2251 s = mdsc->sessions[i];
2252 oldstate = ceph_mdsmap_get_state(oldmap, i);
2253 newstate = ceph_mdsmap_get_state(newmap, i);
2254
2255 dout("check_new_map mds%d state %s -> %s (session %s)\n",
2256 i, ceph_mds_state_name(oldstate),
2257 ceph_mds_state_name(newstate),
2258 session_state_name(s->s_state));
2259
2260 if (memcmp(ceph_mdsmap_get_addr(oldmap, i),
2261 ceph_mdsmap_get_addr(newmap, i),
2262 sizeof(struct ceph_entity_addr))) {
2263 if (s->s_state == CEPH_MDS_SESSION_OPENING) {
2264 /* the session never opened, just close it
2265 * out now */
2266 __wake_requests(mdsc, &s->s_waiting);
2267 __unregister_session(mdsc, s);
2268 } else {
2269 /* just close it */
2270 mutex_unlock(&mdsc->mutex);
2271 mutex_lock(&s->s_mutex);
2272 mutex_lock(&mdsc->mutex);
2273 ceph_con_close(&s->s_con);
2274 mutex_unlock(&s->s_mutex);
2275 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2276 }
2277
2278 /* kick any requests waiting on the recovering mds */
2279 kick_requests(mdsc, i, 1);
2280 } else if (oldstate == newstate) {
2281 continue; /* nothing new with this mds */
2282 }
2283
2284 /*
2285 * send reconnect?
2286 */
2287 if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
2288 newstate >= CEPH_MDS_STATE_RECONNECT)
2289 send_mds_reconnect(mdsc, i);
2290
2291 /*
2292 * kick requests on any mds that has gone active.
2293 *
2294 * kick requests on cur or forwarder: we may have sent
2295 * the request to mds1, mds1 told us it forwarded it
2296 * to mds2, but then we learn mds1 failed and can't be
2297 * sure it successfully forwarded our request before
2298 * it died.
2299 */
2300 if (oldstate < CEPH_MDS_STATE_ACTIVE &&
2301 newstate >= CEPH_MDS_STATE_ACTIVE) {
2302 pr_info("mds%d reconnect completed\n", s->s_mds);
2303 kick_requests(mdsc, i, 1);
2304 ceph_kick_flushing_caps(mdsc, s);
2305 wake_up_session_caps(s, 1);
2306 }
2307 }
2308}
2309
2310
2311
2312/*
2313 * leases
2314 */
2315
2316/*
2317 * caller must hold session s_mutex, dentry->d_lock
2318 */
2319void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry)
2320{
2321 struct ceph_dentry_info *di = ceph_dentry(dentry);
2322
2323 ceph_put_mds_session(di->lease_session);
2324 di->lease_session = NULL;
2325}
2326
2327static void handle_lease(struct ceph_mds_client *mdsc,
2328 struct ceph_mds_session *session,
2329 struct ceph_msg *msg)
2330{
2331 struct super_block *sb = mdsc->client->sb;
2332 struct inode *inode;
2333 struct ceph_inode_info *ci;
2334 struct dentry *parent, *dentry;
2335 struct ceph_dentry_info *di;
2336 int mds = session->s_mds;
2337 struct ceph_mds_lease *h = msg->front.iov_base;
2338 struct ceph_vino vino;
2339 int mask;
2340 struct qstr dname;
2341 int release = 0;
2342
2343 dout("handle_lease from mds%d\n", mds);
2344
2345 /* decode */
2346 if (msg->front.iov_len < sizeof(*h) + sizeof(u32))
2347 goto bad;
2348 vino.ino = le64_to_cpu(h->ino);
2349 vino.snap = CEPH_NOSNAP;
2350 mask = le16_to_cpu(h->mask);
2351 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2352 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2353 if (dname.len != get_unaligned_le32(h+1))
2354 goto bad;
2355
2356 mutex_lock(&session->s_mutex);
2357 session->s_seq++;
2358
2359 /* lookup inode */
2360 inode = ceph_find_inode(sb, vino);
2361 dout("handle_lease '%s', mask %d, ino %llx %p\n",
2362 ceph_lease_op_name(h->action), mask, vino.ino, inode);
2363 if (inode == NULL) {
2364 dout("handle_lease no inode %llx\n", vino.ino);
2365 goto release;
2366 }
2367 ci = ceph_inode(inode);
2368
2369 /* dentry */
2370 parent = d_find_alias(inode);
2371 if (!parent) {
2372 dout("no parent dentry on inode %p\n", inode);
2373 WARN_ON(1);
2374 goto release; /* hrm... */
2375 }
2376 dname.hash = full_name_hash(dname.name, dname.len);
2377 dentry = d_lookup(parent, &dname);
2378 dput(parent);
2379 if (!dentry)
2380 goto release;
2381
2382 spin_lock(&dentry->d_lock);
2383 di = ceph_dentry(dentry);
2384 switch (h->action) {
2385 case CEPH_MDS_LEASE_REVOKE:
2386 if (di && di->lease_session == session) {
2387 h->seq = cpu_to_le32(di->lease_seq);
2388 __ceph_mdsc_drop_dentry_lease(dentry);
2389 }
2390 release = 1;
2391 break;
2392
2393 case CEPH_MDS_LEASE_RENEW:
2394 if (di && di->lease_session == session &&
2395 di->lease_gen == session->s_cap_gen &&
2396 di->lease_renew_from &&
2397 di->lease_renew_after == 0) {
2398 unsigned long duration =
2399 le32_to_cpu(h->duration_ms) * HZ / 1000;
2400
2401 di->lease_seq = le32_to_cpu(h->seq);
2402 dentry->d_time = di->lease_renew_from + duration;
2403 di->lease_renew_after = di->lease_renew_from +
2404 (duration >> 1);
2405 di->lease_renew_from = 0;
2406 }
2407 break;
2408 }
2409 spin_unlock(&dentry->d_lock);
2410 dput(dentry);
2411
2412 if (!release)
2413 goto out;
2414
2415release:
2416 /* let's just reuse the same message */
2417 h->action = CEPH_MDS_LEASE_REVOKE_ACK;
2418 ceph_msg_get(msg);
2419 ceph_con_send(&session->s_con, msg);
2420
2421out:
2422 iput(inode);
2423 mutex_unlock(&session->s_mutex);
2424 return;
2425
2426bad:
2427 pr_err("corrupt lease message\n");
2428 ceph_msg_dump(msg);
2429}
2430
2431void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2432 struct inode *inode,
2433 struct dentry *dentry, char action,
2434 u32 seq)
2435{
2436 struct ceph_msg *msg;
2437 struct ceph_mds_lease *lease;
2438 int len = sizeof(*lease) + sizeof(u32);
2439 int dnamelen = 0;
2440
2441 dout("lease_send_msg inode %p dentry %p %s to mds%d\n",
2442 inode, dentry, ceph_lease_op_name(action), session->s_mds);
2443 dnamelen = dentry->d_name.len;
2444 len += dnamelen;
2445
2446 msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
2447 if (IS_ERR(msg))
2448 return;
2449 lease = msg->front.iov_base;
2450 lease->action = action;
2451 lease->mask = cpu_to_le16(CEPH_LOCK_DN);
2452 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2453 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2454 lease->seq = cpu_to_le32(seq);
2455 put_unaligned_le32(dnamelen, lease + 1);
2456 memcpy((void *)(lease + 1) + 4, dentry->d_name.name, dnamelen);
2457
2458 /*
2459 * if this is a preemptive lease RELEASE, no need to
2460 * flush request stream, since the actual request will
2461 * soon follow.
2462 */
2463 msg->more_to_follow = (action == CEPH_MDS_LEASE_RELEASE);
2464
2465 ceph_con_send(&session->s_con, msg);
2466}
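
The message built here is the fixed lease header followed by a 32-bit name
length and the raw name bytes; handle_lease() above recovers the name with
the mirror-image pointer arithmetic. A standalone sketch of that framing,
using a hypothetical stand-in for struct ceph_mds_lease (the real layout
is in ceph_fs.h):

#include <stdint.h>
#include <string.h>

struct lease_wire {			/* stand-in; fields are assumed */
	uint8_t  action;
	uint16_t mask;
	uint64_t ino, first, last;
	uint32_t seq, duration_ms;
} __attribute__((packed));

/* header, then u32 name length, then the name bytes (no NUL) */
static size_t build_lease(uint8_t *buf, const struct lease_wire *h,
			  const char *dname)
{
	uint32_t len = (uint32_t)strlen(dname);

	memcpy(buf, h, sizeof(*h));
	memcpy(buf + sizeof(*h), &len, sizeof(len)); /* put_unaligned_le32 */
	memcpy(buf + sizeof(*h) + sizeof(len), dname, len);
	return sizeof(*h) + sizeof(len) + len;
}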
2467
2468/*
2469 * Preemptively release a lease we expect to invalidate anyway.
2470 * Both @inode and @dentry are required (see the BUG_ONs below).
2471 */
2472void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2473 struct dentry *dentry, int mask)
2474{
2475 struct ceph_dentry_info *di;
2476 struct ceph_mds_session *session;
2477 u32 seq;
2478
2479 BUG_ON(inode == NULL);
2480 BUG_ON(dentry == NULL);
2481 BUG_ON(mask != CEPH_LOCK_DN);
2482
2483 /* is dentry lease valid? */
2484 spin_lock(&dentry->d_lock);
2485 di = ceph_dentry(dentry);
2486 if (!di || !di->lease_session ||
2487 di->lease_session->s_mds < 0 ||
2488 di->lease_gen != di->lease_session->s_cap_gen ||
2489 !time_before(jiffies, dentry->d_time)) {
2490 dout("lease_release inode %p dentry %p -- "
2491 "no lease on %d\n",
2492 inode, dentry, mask);
2493 spin_unlock(&dentry->d_lock);
2494 return;
2495 }
2496
2497 /* we do have a lease on this dentry; note mds and seq */
2498 session = ceph_get_mds_session(di->lease_session);
2499 seq = di->lease_seq;
2500 __ceph_mdsc_drop_dentry_lease(dentry);
2501 spin_unlock(&dentry->d_lock);
2502
2503 dout("lease_release inode %p dentry %p mask %d to mds%d\n",
2504 inode, dentry, mask, session->s_mds);
2505 ceph_mdsc_lease_send_msg(session, inode, dentry,
2506 CEPH_MDS_LEASE_RELEASE, seq);
2507 ceph_put_mds_session(session);
2508}
2509
2510/*
2511 * drop all leases (and dentry refs) in preparation for umount
2512 */
2513static void drop_leases(struct ceph_mds_client *mdsc)
2514{
2515 int i;
2516
2517 dout("drop_leases\n");
2518 mutex_lock(&mdsc->mutex);
2519 for (i = 0; i < mdsc->max_sessions; i++) {
2520 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2521 if (!s)
2522 continue;
2523 mutex_unlock(&mdsc->mutex);
2524 mutex_lock(&s->s_mutex);
2525 mutex_unlock(&s->s_mutex);
2526 ceph_put_mds_session(s);
2527 mutex_lock(&mdsc->mutex);
2528 }
2529 mutex_unlock(&mdsc->mutex);
2530}
2531
2532
2533
2534/*
2535 * delayed work -- periodically trim expired leases, renew caps with mds
2536 */
2537static void schedule_delayed(struct ceph_mds_client *mdsc)
2538{
2539 int delay = 5;
2540 unsigned long delay_jiffies = round_jiffies_relative(HZ * delay);
2541 schedule_delayed_work(&mdsc->delayed_work, delay_jiffies);
2542}
2543
2544static void delayed_work(struct work_struct *work)
2545{
2546 int i;
2547 struct ceph_mds_client *mdsc =
2548 container_of(work, struct ceph_mds_client, delayed_work.work);
2549 int renew_interval;
2550 int renew_caps;
2551
2552 dout("mdsc delayed_work\n");
2553 ceph_check_delayed_caps(mdsc);
2554
2555 mutex_lock(&mdsc->mutex);
2556 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
2557 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
2558 mdsc->last_renew_caps);
2559 if (renew_caps)
2560 mdsc->last_renew_caps = jiffies;
2561
2562 for (i = 0; i < mdsc->max_sessions; i++) {
2563 struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
2564 if (s == NULL)
2565 continue;
2566 if (s->s_state == CEPH_MDS_SESSION_CLOSING) {
2567 dout("resending session close request for mds%d\n",
2568 s->s_mds);
2569 request_close_session(mdsc, s);
2570 ceph_put_mds_session(s);
2571 continue;
2572 }
2573 if (s->s_ttl && time_after(jiffies, s->s_ttl)) {
2574 if (s->s_state == CEPH_MDS_SESSION_OPEN) {
2575 s->s_state = CEPH_MDS_SESSION_HUNG;
2576 pr_info("mds%d hung\n", s->s_mds);
2577 }
2578 }
2579 if (s->s_state < CEPH_MDS_SESSION_OPEN) {
2580 /* this mds is failed or recovering, just wait */
2581 ceph_put_mds_session(s);
2582 continue;
2583 }
2584 mutex_unlock(&mdsc->mutex);
2585
2586 mutex_lock(&s->s_mutex);
2587 if (renew_caps)
2588 send_renew_caps(mdsc, s);
2589 else
2590 ceph_con_keepalive(&s->s_con);
2591 add_cap_releases(mdsc, s, -1);
2592 send_cap_releases(mdsc, s);
2593 mutex_unlock(&s->s_mutex);
2594 ceph_put_mds_session(s);
2595
2596 mutex_lock(&mdsc->mutex);
2597 }
2598 mutex_unlock(&mdsc->mutex);
2599
2600 schedule_delayed(mdsc);
2601}
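
The cap-renewal cadence above falls out of two numbers: the delayed work
refires roughly every 5 seconds, and a renew is sent once a quarter of the
MDS session timeout has elapsed. A tiny worked example with an assumed
60-second session timeout:

#include <stdio.h>

int main(void)
{
	unsigned int session_timeout = 60;	/* seconds; assumed value */
	unsigned int tick = 5;			/* schedule_delayed() period */
	unsigned int renew = session_timeout >> 2; /* as in delayed_work() */

	/* a renewal can therefore be late by at most ~one tick */
	printf("renew caps every %us, checked every %us\n", renew, tick);
	return 0;
}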
2602
2603
2604int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
2605{
2606 mdsc->client = client;
2607 mutex_init(&mdsc->mutex);
2608 mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
2609 init_completion(&mdsc->safe_umount_waiters);
2610 init_completion(&mdsc->session_close_waiters);
2611 INIT_LIST_HEAD(&mdsc->waiting_for_map);
2612 mdsc->sessions = NULL;
2613 mdsc->max_sessions = 0;
2614 mdsc->stopping = 0;
2615 init_rwsem(&mdsc->snap_rwsem);
2616 mdsc->snap_realms = RB_ROOT;
2617 INIT_LIST_HEAD(&mdsc->snap_empty);
2618 spin_lock_init(&mdsc->snap_empty_lock);
2619 mdsc->last_tid = 0;
2620 mdsc->request_tree = RB_ROOT;
2621 INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work);
2622 mdsc->last_renew_caps = jiffies;
2623 INIT_LIST_HEAD(&mdsc->cap_delay_list);
2624 spin_lock_init(&mdsc->cap_delay_lock);
2625 INIT_LIST_HEAD(&mdsc->snap_flush_list);
2626 spin_lock_init(&mdsc->snap_flush_lock);
2627 mdsc->cap_flush_seq = 0;
2628 INIT_LIST_HEAD(&mdsc->cap_dirty);
2629 mdsc->num_cap_flushing = 0;
2630 spin_lock_init(&mdsc->cap_dirty_lock);
2631 init_waitqueue_head(&mdsc->cap_flushing_wq);
2632 spin_lock_init(&mdsc->dentry_lru_lock);
2633 INIT_LIST_HEAD(&mdsc->dentry_lru);
2634 return 0;
2635}
2636
2637/*
2638 * Wait for safe replies on open mds requests. If we time out, drop
2639 * all requests from the tree to avoid dangling dentry refs.
2640 */
2641static void wait_requests(struct ceph_mds_client *mdsc)
2642{
2643 struct ceph_mds_request *req;
2644 struct ceph_client *client = mdsc->client;
2645
2646 mutex_lock(&mdsc->mutex);
2647 if (__get_oldest_req(mdsc)) {
2648 mutex_unlock(&mdsc->mutex);
2649
2650 dout("wait_requests waiting for requests\n");
2651 wait_for_completion_timeout(&mdsc->safe_umount_waiters,
2652 client->mount_args->mount_timeout * HZ);
2653
2654 /* tear down remaining requests */
2655 mutex_lock(&mdsc->mutex);
2656 while ((req = __get_oldest_req(mdsc))) {
2657 dout("wait_requests timed out on tid %llu\n",
2658 req->r_tid);
2659 __unregister_request(mdsc, req);
2660 }
2661 }
2662 mutex_unlock(&mdsc->mutex);
2663 dout("wait_requests done\n");
2664}
2665
2666/*
2667 * called before mount is ro, and before dentries are torn down.
2668 * (hmm, does this still race with new lookups?)
2669 */
2670void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc)
2671{
2672 dout("pre_umount\n");
2673 mdsc->stopping = 1;
2674
2675 drop_leases(mdsc);
2676 ceph_flush_dirty_caps(mdsc);
2677 wait_requests(mdsc);
2678}
2679
2680/*
2681 * wait for all write mds requests to flush.
2682 */
2683static void wait_unsafe_requests(struct ceph_mds_client *mdsc, u64 want_tid)
2684{
2685 struct ceph_mds_request *req = NULL;
2686 struct rb_node *n;
2687
2688 mutex_lock(&mdsc->mutex);
2689 dout("wait_unsafe_requests want %lld\n", want_tid);
2690 req = __get_oldest_req(mdsc);
2691 while (req && req->r_tid <= want_tid) {
2692 if ((req->r_op & CEPH_MDS_OP_WRITE)) {
2693 /* write op */
2694 ceph_mdsc_get_request(req);
2695 mutex_unlock(&mdsc->mutex);
2696 dout("wait_unsafe_requests wait on %llu (want %llu)\n",
2697 req->r_tid, want_tid);
2698 wait_for_completion(&req->r_safe_completion);
2699 mutex_lock(&mdsc->mutex);
2700 n = rb_next(&req->r_node);
2701 ceph_mdsc_put_request(req);
2702 } else {
2703 n = rb_next(&req->r_node);
2704 }
2705 if (!n)
2706 break;
2707 req = rb_entry(n, struct ceph_mds_request, r_node);
2708 }
2709 mutex_unlock(&mdsc->mutex);
2710 dout("wait_unsafe_requests done\n");
2711}
2712
2713void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
2714{
2715 u64 want_tid, want_flush;
2716
2717 dout("sync\n");
2718 mutex_lock(&mdsc->mutex);
2719 want_tid = mdsc->last_tid;
2720 want_flush = mdsc->cap_flush_seq;
2721 mutex_unlock(&mdsc->mutex);
2722 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
2723
2724 ceph_flush_dirty_caps(mdsc);
2725
2726 wait_unsafe_requests(mdsc, want_tid);
2727 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush));
2728}
2729
2730
2731/*
2732 * called after sb is ro.
2733 */
2734void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
2735{
2736 struct ceph_mds_session *session;
2737 int i;
2738 int n;
2739 struct ceph_client *client = mdsc->client;
2740 unsigned long started, timeout = client->mount_args->mount_timeout * HZ;
2741
2742 dout("close_sessions\n");
2743
2744 mutex_lock(&mdsc->mutex);
2745
2746 /* close sessions */
2747 started = jiffies;
2748 while (time_before(jiffies, started + timeout)) {
2749 dout("closing sessions\n");
2750 n = 0;
2751 for (i = 0; i < mdsc->max_sessions; i++) {
2752 session = __ceph_lookup_mds_session(mdsc, i);
2753 if (!session)
2754 continue;
2755 mutex_unlock(&mdsc->mutex);
2756 mutex_lock(&session->s_mutex);
2757 __close_session(mdsc, session);
2758 mutex_unlock(&session->s_mutex);
2759 ceph_put_mds_session(session);
2760 mutex_lock(&mdsc->mutex);
2761 n++;
2762 }
2763 if (n == 0)
2764 break;
2765
2766 if (client->mount_state == CEPH_MOUNT_SHUTDOWN)
2767 break;
2768
2769 dout("waiting for sessions to close\n");
2770 mutex_unlock(&mdsc->mutex);
2771 wait_for_completion_timeout(&mdsc->session_close_waiters,
2772 timeout);
2773 mutex_lock(&mdsc->mutex);
2774 }
2775
2776 /* tear down remaining sessions */
2777 for (i = 0; i < mdsc->max_sessions; i++) {
2778 if (mdsc->sessions[i]) {
2779 session = get_session(mdsc->sessions[i]);
2780 __unregister_session(mdsc, session);
2781 mutex_unlock(&mdsc->mutex);
2782 mutex_lock(&session->s_mutex);
2783 remove_session_caps(session);
2784 mutex_unlock(&session->s_mutex);
2785 ceph_put_mds_session(session);
2786 mutex_lock(&mdsc->mutex);
2787 }
2788 }
2789
2790 WARN_ON(!list_empty(&mdsc->cap_delay_list));
2791
2792 mutex_unlock(&mdsc->mutex);
2793
2794 ceph_cleanup_empty_realms(mdsc);
2795
2796 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2797
2798 dout("stopped\n");
2799}
2800
2801void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
2802{
2803 dout("stop\n");
2804 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
2805 if (mdsc->mdsmap)
2806 ceph_mdsmap_destroy(mdsc->mdsmap);
2807 kfree(mdsc->sessions);
2808}
2809
2810
2811/*
2812 * handle mds map update.
2813 */
2814void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
2815{
2816 u32 epoch;
2817 u32 maplen;
2818 void *p = msg->front.iov_base;
2819 void *end = p + msg->front.iov_len;
2820 struct ceph_mdsmap *newmap, *oldmap;
2821 struct ceph_fsid fsid;
2822 int err = -EINVAL;
2823
2824 ceph_decode_need(&p, end, sizeof(fsid)+2*sizeof(u32), bad);
2825 ceph_decode_copy(&p, &fsid, sizeof(fsid));
2826 if (ceph_check_fsid(mdsc->client, &fsid) < 0)
2827 return;
2828 epoch = ceph_decode_32(&p);
2829 maplen = ceph_decode_32(&p);
2830 dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
2831
2832 /* do we need it? */
2833 ceph_monc_got_mdsmap(&mdsc->client->monc, epoch);
2834 mutex_lock(&mdsc->mutex);
2835 if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
2836 dout("handle_map epoch %u <= our %u\n",
2837 epoch, mdsc->mdsmap->m_epoch);
2838 mutex_unlock(&mdsc->mutex);
2839 return;
2840 }
2841
2842 newmap = ceph_mdsmap_decode(&p, end);
2843 if (IS_ERR(newmap)) {
2844 err = PTR_ERR(newmap);
2845 goto bad_unlock;
2846 }
2847
2848 /* swap into place */
2849 if (mdsc->mdsmap) {
2850 oldmap = mdsc->mdsmap;
2851 mdsc->mdsmap = newmap;
2852 check_new_map(mdsc, newmap, oldmap);
2853 ceph_mdsmap_destroy(oldmap);
2854 } else {
2855 mdsc->mdsmap = newmap; /* first mds map */
2856 }
2857 mdsc->client->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
2858
2859 __wake_requests(mdsc, &mdsc->waiting_for_map);
2860
2861 mutex_unlock(&mdsc->mutex);
2862 schedule_delayed(mdsc);
2863 return;
2864
2865bad_unlock:
2866 mutex_unlock(&mdsc->mutex);
2867bad:
2868 pr_err("error decoding mdsmap %d\n", err);
2869 return;
2870}
2871
2872static struct ceph_connection *con_get(struct ceph_connection *con)
2873{
2874 struct ceph_mds_session *s = con->private;
2875
2876 if (get_session(s)) {
2877 dout("mdsc con_get %p ok (%d)\n", s, atomic_read(&s->s_ref));
2878 return con;
2879 }
2880 dout("mdsc con_get %p FAIL\n", s);
2881 return NULL;
2882}
2883
2884static void con_put(struct ceph_connection *con)
2885{
2886 struct ceph_mds_session *s = con->private;
2887
2888 ceph_put_mds_session(s);
2889 dout("mdsc con_put %p (%d)\n", s, atomic_read(&s->s_ref));
2890}
2891
2892/*
2893 * if the client is unresponsive for long enough, the mds will kill
2894 * the session entirely.
2895 */
2896static void peer_reset(struct ceph_connection *con)
2897{
2898 struct ceph_mds_session *s = con->private;
2899
2900 pr_err("mds%d gave us the boot. IMPLEMENT RECONNECT.\n",
2901 s->s_mds);
2902}
2903
2904static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
2905{
2906 struct ceph_mds_session *s = con->private;
2907 struct ceph_mds_client *mdsc = s->s_mdsc;
2908 int type = le16_to_cpu(msg->hdr.type);
2909
2910 mutex_lock(&mdsc->mutex);
2911 if (__verify_registered_session(mdsc, s) < 0) {
2912 mutex_unlock(&mdsc->mutex);
2913 goto out;
2914 }
2915 mutex_unlock(&mdsc->mutex);
2916
2917 switch (type) {
2918 case CEPH_MSG_MDS_MAP:
2919 ceph_mdsc_handle_map(mdsc, msg);
2920 break;
2921 case CEPH_MSG_CLIENT_SESSION:
2922 handle_session(s, msg);
2923 break;
2924 case CEPH_MSG_CLIENT_REPLY:
2925 handle_reply(s, msg);
2926 break;
2927 case CEPH_MSG_CLIENT_REQUEST_FORWARD:
2928 handle_forward(mdsc, s, msg);
2929 break;
2930 case CEPH_MSG_CLIENT_CAPS:
2931 ceph_handle_caps(s, msg);
2932 break;
2933 case CEPH_MSG_CLIENT_SNAP:
2934 ceph_handle_snap(mdsc, s, msg);
2935 break;
2936 case CEPH_MSG_CLIENT_LEASE:
2937 handle_lease(mdsc, s, msg);
2938 break;
2939
2940 default:
2941 pr_err("received unknown message type %d %s\n", type,
2942 ceph_msg_type_name(type));
2943 }
2944out:
2945 ceph_msg_put(msg);
2946}
2947
2948/*
2949 * authentication
2950 */
2951static int get_authorizer(struct ceph_connection *con,
2952 void **buf, int *len, int *proto,
2953 void **reply_buf, int *reply_len, int force_new)
2954{
2955 struct ceph_mds_session *s = con->private;
2956 struct ceph_mds_client *mdsc = s->s_mdsc;
2957 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2958 int ret = 0;
2959
2960 if (force_new && s->s_authorizer) {
2961 ac->ops->destroy_authorizer(ac, s->s_authorizer);
2962 s->s_authorizer = NULL;
2963 }
2964 if (s->s_authorizer == NULL) {
2965 if (ac->ops->create_authorizer) {
2966 ret = ac->ops->create_authorizer(
2967 ac, CEPH_ENTITY_TYPE_MDS,
2968 &s->s_authorizer,
2969 &s->s_authorizer_buf,
2970 &s->s_authorizer_buf_len,
2971 &s->s_authorizer_reply_buf,
2972 &s->s_authorizer_reply_buf_len);
2973 if (ret)
2974 return ret;
2975 }
2976 }
2977
2978 *proto = ac->protocol;
2979 *buf = s->s_authorizer_buf;
2980 *len = s->s_authorizer_buf_len;
2981 *reply_buf = s->s_authorizer_reply_buf;
2982 *reply_len = s->s_authorizer_reply_buf_len;
2983 return 0;
2984}
2985
2986
2987static int verify_authorizer_reply(struct ceph_connection *con, int len)
2988{
2989 struct ceph_mds_session *s = con->private;
2990 struct ceph_mds_client *mdsc = s->s_mdsc;
2991 struct ceph_auth_client *ac = mdsc->client->monc.auth;
2992
2993 return ac->ops->verify_authorizer_reply(ac, s->s_authorizer, len);
2994}
2995
2996static int invalidate_authorizer(struct ceph_connection *con)
2997{
2998 struct ceph_mds_session *s = con->private;
2999 struct ceph_mds_client *mdsc = s->s_mdsc;
3000 struct ceph_auth_client *ac = mdsc->client->monc.auth;
3001
3002 if (ac->ops->invalidate_authorizer)
3003 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_MDS);
3004
3005 return ceph_monc_validate_auth(&mdsc->client->monc);
3006}
3007
3008static const struct ceph_connection_operations mds_con_ops = {
3009 .get = con_get,
3010 .put = con_put,
3011 .dispatch = dispatch,
3012 .get_authorizer = get_authorizer,
3013 .verify_authorizer_reply = verify_authorizer_reply,
3014 .invalidate_authorizer = invalidate_authorizer,
3015 .peer_reset = peer_reset,
3016};
3017
3018
3019
3020
3021/* eof */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
new file mode 100644
index 000000000000..961cc6f65878
--- /dev/null
+++ b/fs/ceph/mds_client.h
@@ -0,0 +1,335 @@
1#ifndef _FS_CEPH_MDS_CLIENT_H
2#define _FS_CEPH_MDS_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/list.h>
7#include <linux/mutex.h>
8#include <linux/rbtree.h>
9#include <linux/spinlock.h>
10
11#include "types.h"
12#include "messenger.h"
13#include "mdsmap.h"
14
15/*
16 * Some lock dependencies:
17 *
18 * session->s_mutex
19 * mdsc->mutex
20 *
21 * mdsc->snap_rwsem
22 *
23 * inode->i_lock
24 * mdsc->snap_flush_lock
25 * mdsc->cap_delay_lock
26 *
27 */
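
In code, that ordering means a session's s_mutex is taken before
mdsc->mutex, so a caller already holding mdsc->mutex must drop it first;
this is the dance check_new_map() and delayed_work() do in mds_client.c.
A sketch (assuming the session pointer stays valid across the unlock,
e.g. via a held reference):

static void relock_for_session(struct ceph_mds_client *mdsc,
			       struct ceph_mds_session *session)
{
	mutex_unlock(&mdsc->mutex);	/* s_mutex ranks above mdsc->mutex */
	mutex_lock(&session->s_mutex);
	mutex_lock(&mdsc->mutex);
}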
28
29struct ceph_client;
30struct ceph_cap;
31
32/*
33 * parsed info about a single inode. pointers are into the encoded
34 * on-wire structures within the mds reply message payload.
35 */
36struct ceph_mds_reply_info_in {
37 struct ceph_mds_reply_inode *in;
38 u32 symlink_len;
39 char *symlink;
40 u32 xattr_len;
41 char *xattr_data;
42};
43
44/*
45 * parsed info about an mds reply, including information about the
46 * target inode and/or its parent directory and dentry, and directory
47 * contents (for readdir results).
48 */
49struct ceph_mds_reply_info_parsed {
50 struct ceph_mds_reply_head *head;
51
52 struct ceph_mds_reply_info_in diri, targeti;
53 struct ceph_mds_reply_dirfrag *dirfrag;
54 char *dname;
55 u32 dname_len;
56 struct ceph_mds_reply_lease *dlease;
57
58 struct ceph_mds_reply_dirfrag *dir_dir;
59 int dir_nr;
60 char **dir_dname;
61 u32 *dir_dname_len;
62 struct ceph_mds_reply_lease **dir_dlease;
63 struct ceph_mds_reply_info_in *dir_in;
64 u8 dir_complete, dir_end;
65
66 /* encoded blob describing snapshot contexts for certain
67 operations (e.g., open) */
68 void *snapblob;
69 int snapblob_len;
70};
71
72
73/*
74 * cap releases are batched and sent to the MDS en masse.
75 */
76#define CEPH_CAPS_PER_RELEASE ((PAGE_CACHE_SIZE - \
77 sizeof(struct ceph_mds_cap_release)) / \
78 sizeof(struct ceph_mds_cap_item))
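
A worked instance of that arithmetic, with assumed sizes (the real structs
are defined in ceph_fs.h, not in this header):

/*
 * hypothetically, with PAGE_CACHE_SIZE = 4096,
 * sizeof(struct ceph_mds_cap_release) = 16 and
 * sizeof(struct ceph_mds_cap_item) = 16:
 *
 *	CEPH_CAPS_PER_RELEASE = (4096 - 16) / 16 = 255
 *
 * i.e. one page-sized message batches up to 255 cap releases.
 */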
79
80
81/*
82 * state associated with each MDS<->client session
83 */
84enum {
85 CEPH_MDS_SESSION_NEW = 1,
86 CEPH_MDS_SESSION_OPENING = 2,
87 CEPH_MDS_SESSION_OPEN = 3,
88 CEPH_MDS_SESSION_HUNG = 4,
89 CEPH_MDS_SESSION_CLOSING = 5,
90 CEPH_MDS_SESSION_RESTARTING = 6,
91 CEPH_MDS_SESSION_RECONNECTING = 7,
92};
93
94struct ceph_mds_session {
95 struct ceph_mds_client *s_mdsc;
96 int s_mds;
97 int s_state;
98 unsigned long s_ttl; /* time until mds kills us */
99 u64 s_seq; /* incoming msg seq # */
100 struct mutex s_mutex; /* serialize session messages */
101
102 struct ceph_connection s_con;
103
104 struct ceph_authorizer *s_authorizer;
105 void *s_authorizer_buf, *s_authorizer_reply_buf;
106 size_t s_authorizer_buf_len, s_authorizer_reply_buf_len;
107
108 /* protected by s_cap_lock */
109 spinlock_t s_cap_lock;
110 u32 s_cap_gen; /* inc each time we get mds stale msg */
111 unsigned long s_cap_ttl; /* when session caps expire */
112 struct list_head s_caps; /* all caps issued by this session */
113 int s_nr_caps, s_trim_caps;
114 int s_num_cap_releases;
115 struct list_head s_cap_releases; /* waiting cap_release messages */
116 struct list_head s_cap_releases_done; /* ready to send */
117 struct ceph_cap *s_cap_iterator;
118
119 /* protected by mutex */
120 struct list_head s_cap_flushing; /* inodes w/ flushing caps */
121 struct list_head s_cap_snaps_flushing;
122 unsigned long s_renew_requested; /* last time we sent a renew req */
123 u64 s_renew_seq;
124
125 atomic_t s_ref;
126 struct list_head s_waiting; /* waiting requests */
127 struct list_head s_unsafe; /* unsafe requests */
128};
129
130/*
131 * modes of choosing which MDS to send a request to
132 */
133enum {
134 USE_ANY_MDS,
135 USE_RANDOM_MDS,
136 USE_AUTH_MDS, /* prefer authoritative mds for this metadata item */
137};
138
139struct ceph_mds_request;
140struct ceph_mds_client;
141
142/*
143 * request completion callback
144 */
145typedef void (*ceph_mds_request_callback_t) (struct ceph_mds_client *mdsc,
146 struct ceph_mds_request *req);
147
148/*
149 * an in-flight mds request
150 */
151struct ceph_mds_request {
152 u64 r_tid; /* transaction id */
153 struct rb_node r_node;
154
155 int r_op; /* mds op code */
156 int r_mds;
157
158 /* operation on what? */
159 struct inode *r_inode; /* arg1 */
160 struct dentry *r_dentry; /* arg1 */
161 struct dentry *r_old_dentry; /* arg2: rename from or link from */
162 char *r_path1, *r_path2;
163 struct ceph_vino r_ino1, r_ino2;
164
165 struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
166 struct inode *r_target_inode; /* resulting inode */
167
168 union ceph_mds_request_args r_args;
169 int r_fmode; /* file mode, if expecting cap */
170
171 /* for choosing which mds to send this request to */
172 int r_direct_mode;
173 u32 r_direct_hash; /* choose dir frag based on this dentry hash */
174 bool r_direct_is_hash; /* true if r_direct_hash is valid */
175
176 /* data payload is used for xattr ops */
177 struct page **r_pages;
178 int r_num_pages;
179 int r_data_len;
180
181 /* what caps shall we drop? */
182 int r_inode_drop, r_inode_unless;
183 int r_dentry_drop, r_dentry_unless;
184 int r_old_dentry_drop, r_old_dentry_unless;
185 struct inode *r_old_inode;
186 int r_old_inode_drop, r_old_inode_unless;
187
188 struct ceph_msg *r_request; /* original request */
189 struct ceph_msg *r_reply;
190 struct ceph_mds_reply_info_parsed r_reply_info;
191 int r_err;
192 bool r_aborted;
193
194 unsigned long r_timeout; /* optional. jiffies */
195 unsigned long r_started; /* start time to measure timeout against */
196 unsigned long r_request_started; /* start time for mds request only,
197 used to measure lease durations */
198
199 /* link unsafe requests to parent directory, for fsync */
200 struct inode *r_unsafe_dir;
201 struct list_head r_unsafe_dir_item;
202
203 struct ceph_mds_session *r_session;
204
205 int r_attempts; /* resend attempts */
206 int r_num_fwd; /* number of forward attempts */
207 int r_num_stale;
208 int r_resend_mds; /* mds to resend to next, if any*/
209
210 struct kref r_kref;
211 struct list_head r_wait;
212 struct completion r_completion;
213 struct completion r_safe_completion;
214 ceph_mds_request_callback_t r_callback;
215 struct list_head r_unsafe_item; /* per-session unsafe list item */
216 bool r_got_unsafe, r_got_safe;
217
218 bool r_did_prepopulate;
219 u32 r_readdir_offset;
220
221 struct ceph_cap_reservation r_caps_reservation;
222 int r_num_caps;
223};
224
225/*
226 * mds client state
227 */
228struct ceph_mds_client {
229 struct ceph_client *client;
230 struct mutex mutex; /* all nested structures */
231
232 struct ceph_mdsmap *mdsmap;
233 struct completion safe_umount_waiters, session_close_waiters;
234 struct list_head waiting_for_map;
235
236 struct ceph_mds_session **sessions; /* NULL for mds if no session */
237 int max_sessions; /* len of sessions array */
238 int stopping; /* true if shutting down */
239
240 /*
241 * snap_rwsem will cover cap linkage into snaprealms, and
242 * realm snap contexts. (later, we can do per-realm snap
243 * contexts locks..) the empty list contains realms with no
244 * references (implying they contain no inodes with caps) that
245 * should be destroyed.
246 */
247 struct rw_semaphore snap_rwsem;
248 struct rb_root snap_realms;
249 struct list_head snap_empty;
250 spinlock_t snap_empty_lock; /* protect snap_empty */
251
252 u64 last_tid; /* most recent mds request */
253 struct rb_root request_tree; /* pending mds requests */
254 struct delayed_work delayed_work; /* delayed work */
255 unsigned long last_renew_caps; /* last time we renewed our caps */
256 struct list_head cap_delay_list; /* caps with delayed release */
257 spinlock_t cap_delay_lock; /* protects cap_delay_list */
258 struct list_head snap_flush_list; /* cap_snaps ready to flush */
259 spinlock_t snap_flush_lock;
260
261 u64 cap_flush_seq;
262 struct list_head cap_dirty; /* inodes with dirty caps */
263 int num_cap_flushing; /* # caps we are flushing */
264 spinlock_t cap_dirty_lock; /* protects above items */
265 wait_queue_head_t cap_flushing_wq;
266
267#ifdef CONFIG_DEBUG_FS
268 struct dentry *debugfs_file;
269#endif
270
271 spinlock_t dentry_lru_lock;
272 struct list_head dentry_lru;
273 int num_dentry;
274};
275
276extern const char *ceph_mds_op_name(int op);
277
278extern struct ceph_mds_session *
279__ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
280
281static inline struct ceph_mds_session *
282ceph_get_mds_session(struct ceph_mds_session *s)
283{
284 atomic_inc(&s->s_ref);
285 return s;
286}
287
288extern void ceph_put_mds_session(struct ceph_mds_session *s);
289
290extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
291 struct ceph_msg *msg, int mds);
292
293extern int ceph_mdsc_init(struct ceph_mds_client *mdsc,
294 struct ceph_client *client);
295extern void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc);
296extern void ceph_mdsc_stop(struct ceph_mds_client *mdsc);
297
298extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
299
300extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
301 struct inode *inode,
302 struct dentry *dn, int mask);
303
304extern struct ceph_mds_request *
305ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
306extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
307 struct ceph_mds_request *req);
308extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
309 struct inode *dir,
310 struct ceph_mds_request *req);
311static inline void ceph_mdsc_get_request(struct ceph_mds_request *req)
312{
313 kref_get(&req->r_kref);
314}
315extern void ceph_mdsc_release_request(struct kref *kref);
316static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
317{
318 kref_put(&req->r_kref, ceph_mdsc_release_request);
319}
320
321extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
322
323extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
324 int stop_on_nosnap);
325
326extern void __ceph_mdsc_drop_dentry_lease(struct dentry *dentry);
327extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
328 struct inode *inode,
329 struct dentry *dentry, char action,
330 u32 seq);
331
332extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
333 struct ceph_msg *msg);
334
335#endif
diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c
new file mode 100644
index 000000000000..c4c498e6dfef
--- /dev/null
+++ b/fs/ceph/mdsmap.c
@@ -0,0 +1,174 @@
1#include "ceph_debug.h"
2
3#include <linux/bug.h>
4#include <linux/err.h>
5#include <linux/random.h>
6#include <linux/slab.h>
7#include <linux/types.h>
8
9#include "mdsmap.h"
10#include "messenger.h"
11#include "decode.h"
12
13#include "super.h"
14
15
16/*
17 * choose a random mds that is "up" (i.e. has a state > 0), or -1.
18 */
19int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
20{
21 int n = 0;
22 int i;
23 u8 r; /* unsigned, so r % n stays non-negative */
24
25 /* count */
26 for (i = 0; i < m->m_max_mds; i++)
27 if (m->m_info[i].state > 0)
28 n++;
29 if (n == 0)
30 return -1;
31
32 /* pick the nth up mds, skipping down slots */
33 get_random_bytes(&r, 1);
34 n = r % n;
35 for (i = 0; m->m_info[i].state <= 0; i++)
36 ; /* find the first up mds */
37 for (; n > 0; n--)
38 do { i++; } while (m->m_info[i].state <= 0);
39
40 return i;
41}
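
The walk above returns the nth up mds while stepping over down slots. A
quick standalone check of the same walk with a made-up state vector
(state > 0 meaning "up"):

#include <stdio.h>

int main(void)
{
	int state[] = { 0, 2, 0, 0, 3, 1 };	/* up mds: 1, 4, 5 */
	int n;

	for (n = 0; n < 3; n++) {	/* n plays the role of r % n */
		int k = n, i;

		for (i = 0; state[i] <= 0; i++)
			;		/* first up mds */
		for (; k > 0; k--)
			do { i++; } while (state[i] <= 0);
		printf("pick %d -> mds%d\n", n, i);	/* 1, 4, 5 */
	}
	return 0;
}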
42
43/*
44 * Decode an MDS map
45 *
46 * Ignore any fields we don't care about (there are quite a few of
47 * them).
48 */
49struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
50{
51 struct ceph_mdsmap *m;
52 const void *start = *p;
53 int i, j, n;
54 int err = -EINVAL;
55 u16 version;
56
57 m = kzalloc(sizeof(*m), GFP_NOFS);
58 if (m == NULL)
59 return ERR_PTR(-ENOMEM);
60
61 ceph_decode_16_safe(p, end, version, bad);
62
63 ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
64 m->m_epoch = ceph_decode_32(p);
65 m->m_client_epoch = ceph_decode_32(p);
66 m->m_last_failure = ceph_decode_32(p);
67 m->m_root = ceph_decode_32(p);
68 m->m_session_timeout = ceph_decode_32(p);
69 m->m_session_autoclose = ceph_decode_32(p);
70 m->m_max_file_size = ceph_decode_64(p);
71 m->m_max_mds = ceph_decode_32(p);
72
73 m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS);
74 if (m->m_info == NULL)
75 goto badmem;
76
77 /* pick out active nodes from mds_info (state > 0) */
78 n = ceph_decode_32(p);
79 for (i = 0; i < n; i++) {
80 u64 global_id;
81 u32 namelen;
82 s32 mds, inc, state;
83 u64 state_seq;
84 u8 infoversion;
85 struct ceph_entity_addr addr;
86 u32 num_export_targets;
87 void *pexport_targets = NULL;
88
89 ceph_decode_need(p, end, sizeof(u64)*2 + 1 + sizeof(u32), bad);
90 global_id = ceph_decode_64(p);
91 infoversion = ceph_decode_8(p);
92 *p += sizeof(u64);
93 namelen = ceph_decode_32(p); /* skip mds name */
94 *p += namelen;
95
96 ceph_decode_need(p, end,
97 4*sizeof(u32) + sizeof(u64) +
98 sizeof(addr) + sizeof(struct ceph_timespec),
99 bad);
100 mds = ceph_decode_32(p);
101 inc = ceph_decode_32(p);
102 state = ceph_decode_32(p);
103 state_seq = ceph_decode_64(p);
104 ceph_decode_copy(p, &addr, sizeof(addr));
105 ceph_decode_addr(&addr);
106 *p += sizeof(struct ceph_timespec);
107 *p += sizeof(u32);
108 ceph_decode_32_safe(p, end, namelen, bad);
109 *p += namelen;
110 if (infoversion >= 2) {
111 ceph_decode_32_safe(p, end, num_export_targets, bad);
112 pexport_targets = *p;
113 *p += num_export_targets * sizeof(u32);
114 } else {
115 num_export_targets = 0;
116 }
117
118 dout("mdsmap_decode %d/%d %lld mds%d.%d %s %s\n",
119 i+1, n, global_id, mds, inc, pr_addr(&addr.in_addr),
120 ceph_mds_state_name(state));
121 if (mds >= 0 && mds < m->m_max_mds && state > 0) {
122 m->m_info[mds].global_id = global_id;
123 m->m_info[mds].state = state;
124 m->m_info[mds].addr = addr;
125 m->m_info[mds].num_export_targets = num_export_targets;
126 if (num_export_targets) {
127 m->m_info[mds].export_targets =
128 kcalloc(num_export_targets, sizeof(u32), GFP_NOFS);
129 if (m->m_info[mds].export_targets == NULL)
130 goto badmem;
131 for (j = 0; j < num_export_targets; j++)
132 m->m_info[mds].export_targets[j] = ceph_decode_32(&pexport_targets);
133 } else {
134 m->m_info[mds].export_targets = NULL;
135 }
136 }
137 }
138
139 /* pg_pools */
140 ceph_decode_32_safe(p, end, n, bad);
141 m->m_num_data_pg_pools = n;
142 m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS);
143 if (!m->m_data_pg_pools)
144 goto badmem;
145 ceph_decode_need(p, end, sizeof(u32)*(n+1), bad);
146 for (i = 0; i < n; i++)
147 m->m_data_pg_pools[i] = ceph_decode_32(p);
148 m->m_cas_pg_pool = ceph_decode_32(p);
149
150 /* ok, we don't care about the rest. */
151 dout("mdsmap_decode success epoch %u\n", m->m_epoch);
152 return m;
153
154badmem:
155 err = -ENOMEM;
156bad:
157 pr_err("corrupt mdsmap\n");
158 print_hex_dump(KERN_DEBUG, "mdsmap: ",
159 DUMP_PREFIX_OFFSET, 16, 1,
160 start, end - start, true);
161 ceph_mdsmap_destroy(m);
162 return ERR_PTR(err);
163}
164
165void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
166{
167 int i;
168
169 for (i = 0; i < m->m_max_mds; i++)
170 kfree(m->m_info[i].export_targets);
171 kfree(m->m_info);
172 kfree(m->m_data_pg_pools);
173 kfree(m);
174}
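
All of the ceph_decode_* helpers used in ceph_mdsmap_decode() share one
shape: verify that [*p, end) still holds the requested bytes, copy, then
advance the cursor. A plain-C sketch of that shape (the real helpers in
decode.h also byte-swap on big-endian hosts):

#include <stdint.h>
#include <string.h>

static int decode_32(void **p, void *end, uint32_t *v)
{
	if ((char *)end - (char *)*p < (long)sizeof(*v))
		return -1;		/* caller jumps to its "bad" label */
	memcpy(v, *p, sizeof(*v));	/* plus le32_to_cpu() in the kernel */
	*p = (char *)*p + sizeof(*v);
	return 0;
}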
diff --git a/fs/ceph/mdsmap.h b/fs/ceph/mdsmap.h
new file mode 100644
index 000000000000..eacc131aa5cb
--- /dev/null
+++ b/fs/ceph/mdsmap.h
@@ -0,0 +1,54 @@
1#ifndef _FS_CEPH_MDSMAP_H
2#define _FS_CEPH_MDSMAP_H
3
4#include "types.h"
5
6/*
7 * mds map - describe servers in the mds cluster.
8 *
9 * we limit fields to those the client actually cares about
10 */
11struct ceph_mds_info {
12 u64 global_id;
13 struct ceph_entity_addr addr;
14 s32 state;
15 int num_export_targets;
16 u32 *export_targets;
17};
18
19struct ceph_mdsmap {
20 u32 m_epoch, m_client_epoch, m_last_failure;
21 u32 m_root;
22 u32 m_session_timeout; /* seconds */
23 u32 m_session_autoclose; /* seconds */
24 u64 m_max_file_size;
25 u32 m_max_mds; /* size of m_addr, m_state arrays */
26 struct ceph_mds_info *m_info;
27
28 /* which object pools file data can be stored in */
29 int m_num_data_pg_pools;
30 u32 *m_data_pg_pools;
31 u32 m_cas_pg_pool;
32};
33
34static inline struct ceph_entity_addr *
35ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
36{
37 if (w >= m->m_max_mds)
38 return NULL;
39 return &m->m_info[w].addr;
40}
41
42static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
43{
44 BUG_ON(w < 0);
45 if (w >= m->m_max_mds)
46 return CEPH_MDS_STATE_DNE;
47 return m->m_info[w].state;
48}
49
50extern int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m);
51extern struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end);
52extern void ceph_mdsmap_destroy(struct ceph_mdsmap *m);
53
54#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 000000000000..781656a49bf8
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
1#include "ceph_debug.h"
2
3#include <linux/crc32c.h>
4#include <linux/ctype.h>
5#include <linux/highmem.h>
6#include <linux/inet.h>
7#include <linux/kthread.h>
8#include <linux/net.h>
9#include <linux/socket.h>
10#include <linux/string.h>
11#include <net/tcp.h>
12
13#include "super.h"
14#include "messenger.h"
15#include "decode.h"
16#include "pagelist.h"
17
18/*
19 * Ceph uses the messenger to exchange ceph_msg messages with other
20 * hosts in the system. The messenger provides ordered and reliable
21 * delivery. We tolerate TCP disconnects by reconnecting (with
22 * exponential backoff) in the case of a fault (disconnection, bad
23 * crc, protocol error). Acks allow sent messages to be discarded by
24 * the sender.
25 */
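
The exponential backoff mentioned here is implemented by ceph_fault(),
further down in this file; schematically the per-connection delay evolves
as below. The base and ceiling values are assumptions for illustration,
not constants taken from this file:

/* delay in jiffies; 0 means no backoff accumulated yet */
static unsigned long next_fault_delay(unsigned long delay, unsigned long hz)
{
	unsigned long base = hz / 2;		/* assumed: ~0.5s first retry */
	unsigned long cap = 5 * 60 * hz;	/* assumed: ~5 min ceiling */

	if (delay == 0)
		return base;
	return delay * 2 > cap ? cap : delay * 2;	/* double, clamped */
}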
26
27/* static tag bytes (protocol control messages) */
28static char tag_msg = CEPH_MSGR_TAG_MSG;
29static char tag_ack = CEPH_MSGR_TAG_ACK;
30static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
31
32
33static void queue_con(struct ceph_connection *con);
34static void con_work(struct work_struct *);
35static void ceph_fault(struct ceph_connection *con);
36
37const char *ceph_name_type_str(int t)
38{
39 switch (t) {
40 case CEPH_ENTITY_TYPE_MON: return "mon";
41 case CEPH_ENTITY_TYPE_MDS: return "mds";
42 case CEPH_ENTITY_TYPE_OSD: return "osd";
43 case CEPH_ENTITY_TYPE_CLIENT: return "client";
44 case CEPH_ENTITY_TYPE_ADMIN: return "admin";
45 default: return "???";
46 }
47}
48
49/*
50 * nicely render a sockaddr as a string.
51 */
52#define MAX_ADDR_STR 20
53static char addr_str[MAX_ADDR_STR][64]; /* fits a full IPv6 addr + port */
54static DEFINE_SPINLOCK(addr_str_lock);
55static int last_addr_str;
56
57const char *pr_addr(const struct sockaddr_storage *ss)
58{
59 int i;
60 char *s;
61 struct sockaddr_in *in4 = (void *)ss;
62 unsigned char *quad = (void *)&in4->sin_addr.s_addr;
63 struct sockaddr_in6 *in6 = (void *)ss;
64
65 spin_lock(&addr_str_lock);
66 i = last_addr_str++;
67 if (last_addr_str == MAX_ADDR_STR)
68 last_addr_str = 0;
69 spin_unlock(&addr_str_lock);
70 s = addr_str[i];
71
72 switch (ss->ss_family) {
73 case AF_INET:
74 sprintf(s, "%u.%u.%u.%u:%u",
75 (unsigned int)quad[0],
76 (unsigned int)quad[1],
77 (unsigned int)quad[2],
78 (unsigned int)quad[3],
79 (unsigned int)ntohs(in4->sin_port));
80 break;
81
82 case AF_INET6:
83 sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
84 in6->sin6_addr.s6_addr16[0],
85 in6->sin6_addr.s6_addr16[1],
86 in6->sin6_addr.s6_addr16[2],
87 in6->sin6_addr.s6_addr16[3],
88 in6->sin6_addr.s6_addr16[4],
89 in6->sin6_addr.s6_addr16[5],
90 in6->sin6_addr.s6_addr16[6],
91 in6->sin6_addr.s6_addr16[7],
92 (unsigned int)ntohs(in6->sin6_port));
93 break;
94
95 default:
96 sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
97 }
98
99 return s;
100}
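
Handing back a pointer into a rotating set of static buffers is what lets
callers stack several pr_addr() results inside a single printk. The
rotation in isolation (the real code takes addr_str_lock around the index
bump; this sketch is single-threaded):

#define NSLOTS 20

static char slot_buf[NSLOTS][64];
static int slot_next;

static char *grab_slot(void)
{
	int i = slot_next++;

	if (slot_next == NSLOTS)
		slot_next = 0;
	return slot_buf[i];	/* valid until NSLOTS more calls */
}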
101
102static void encode_my_addr(struct ceph_messenger *msgr)
103{
104 memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
105 ceph_encode_addr(&msgr->my_enc_addr);
106}
107
108/*
109 * work queue for all reading and writing to/from the socket.
110 */
111struct workqueue_struct *ceph_msgr_wq;
112
113int __init ceph_msgr_init(void)
114{
115 ceph_msgr_wq = create_workqueue("ceph-msgr");
116 if (!ceph_msgr_wq) {
117 /* create_workqueue() returns NULL on failure, not ERR_PTR */
118 pr_err("msgr_init failed to create workqueue\n");
119 return -ENOMEM;
120 }
121
122 return 0;
123}
124
125void ceph_msgr_exit(void)
126{
127 destroy_workqueue(ceph_msgr_wq);
128}
129
130/*
131 * socket callback functions
132 */
133
134/* data available on socket, or listen socket received a connect */
135static void ceph_data_ready(struct sock *sk, int count_unused)
136{
137 struct ceph_connection *con =
138 (struct ceph_connection *)sk->sk_user_data;
139 if (sk->sk_state != TCP_CLOSE_WAIT) {
140 dout("ceph_data_ready on %p state = %lu, queueing work\n",
141 con, con->state);
142 queue_con(con);
143 }
144}
145
146/* socket has buffer space for writing */
147static void ceph_write_space(struct sock *sk)
148{
149 struct ceph_connection *con =
150 (struct ceph_connection *)sk->sk_user_data;
151
152 /* only queue to workqueue if there is data we want to write. */
153 if (test_bit(WRITE_PENDING, &con->state)) {
154 dout("ceph_write_space %p queueing write work\n", con);
155 queue_con(con);
156 } else {
157 dout("ceph_write_space %p nothing to write\n", con);
158 }
159
160 /* since we have our own write_space, clear the SOCK_NOSPACE flag */
161 clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
162}
163
164/* socket's state has changed */
165static void ceph_state_change(struct sock *sk)
166{
167 struct ceph_connection *con =
168 (struct ceph_connection *)sk->sk_user_data;
169
170 dout("ceph_state_change %p state = %lu sk_state = %u\n",
171 con, con->state, sk->sk_state);
172
173 if (test_bit(CLOSED, &con->state))
174 return;
175
176 switch (sk->sk_state) {
177 case TCP_CLOSE:
178 dout("ceph_state_change TCP_CLOSE\n");
179 case TCP_CLOSE_WAIT:
180 dout("ceph_state_change TCP_CLOSE_WAIT\n");
181 if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
182 if (test_bit(CONNECTING, &con->state))
183 con->error_msg = "connection failed";
184 else
185 con->error_msg = "socket closed";
186 queue_con(con);
187 }
188 break;
189 case TCP_ESTABLISHED:
190 dout("ceph_state_change TCP_ESTABLISHED\n");
191 queue_con(con);
192 break;
193 }
194}
195
196/*
197 * set up socket callbacks
198 */
199static void set_sock_callbacks(struct socket *sock,
200 struct ceph_connection *con)
201{
202 struct sock *sk = sock->sk;
203 sk->sk_user_data = (void *)con;
204 sk->sk_data_ready = ceph_data_ready;
205 sk->sk_write_space = ceph_write_space;
206 sk->sk_state_change = ceph_state_change;
207}
208
209
210/*
211 * socket helpers
212 */
213
214/*
215 * initiate connection to a remote socket.
216 */
217static struct socket *ceph_tcp_connect(struct ceph_connection *con)
218{
219 struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
220 struct socket *sock;
221 int ret;
222
223 BUG_ON(con->sock);
224 ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
225 if (ret)
226 return ERR_PTR(ret);
227 con->sock = sock;
228 sock->sk->sk_allocation = GFP_NOFS;
229
230 set_sock_callbacks(sock, con);
231
232 dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
233
234 ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
235 if (ret == -EINPROGRESS) {
236 dout("connect %s EINPROGRESS sk_state = %u\n",
237 pr_addr(&con->peer_addr.in_addr),
238 sock->sk->sk_state);
239 ret = 0;
240 }
241 if (ret < 0) {
242 pr_err("connect %s error %d\n",
243 pr_addr(&con->peer_addr.in_addr), ret);
244 sock_release(sock);
245 con->sock = NULL;
246 con->error_msg = "connect error";
247 }
248
249 if (ret < 0)
250 return ERR_PTR(ret);
251 return sock;
252}
253
254static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
255{
256 struct kvec iov = {buf, len};
257 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
258
259 return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
260}
261
262/*
263 * write something. @more is true if caller will be sending more data
264 * shortly.
265 */
266static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
267 size_t kvlen, size_t len, int more)
268{
269 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
270
271 if (more)
272 msg.msg_flags |= MSG_MORE;
273 else
274 msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
275
276 return kernel_sendmsg(sock, &msg, iov, kvlen, len);
277}
278
279
280/*
281 * Shutdown/close the socket for the given connection.
282 */
283static int con_close_socket(struct ceph_connection *con)
284{
285 int rc;
286
287 dout("con_close_socket on %p sock %p\n", con, con->sock);
288 if (!con->sock)
289 return 0;
290 set_bit(SOCK_CLOSED, &con->state);
291 rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
292 sock_release(con->sock);
293 con->sock = NULL;
294 clear_bit(SOCK_CLOSED, &con->state);
295 return rc;
296}
297
298/*
299 * Reset a connection. Discard all incoming and outgoing messages
300 * and clear *_seq state.
301 */
302static void ceph_msg_remove(struct ceph_msg *msg)
303{
304 list_del_init(&msg->list_head);
305 ceph_msg_put(msg);
306}
307static void ceph_msg_remove_list(struct list_head *head)
308{
309 while (!list_empty(head)) {
310 struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
311 list_head);
312 ceph_msg_remove(msg);
313 }
314}
315
316static void reset_connection(struct ceph_connection *con)
317{
318 /* reset connection, out_queue, msg_ and connect_seq */
319 /* discard existing out_queue and msg_seq */
320 ceph_msg_remove_list(&con->out_queue);
321 ceph_msg_remove_list(&con->out_sent);
322
323 if (con->in_msg) {
324 ceph_msg_put(con->in_msg);
325 con->in_msg = NULL;
326 }
327
328 con->connect_seq = 0;
329 con->out_seq = 0;
330 if (con->out_msg) {
331 ceph_msg_put(con->out_msg);
332 con->out_msg = NULL;
333 }
334 con->in_seq = 0;
335}
336
337/*
338 * mark a peer down. drop any open connections.
339 */
340void ceph_con_close(struct ceph_connection *con)
341{
342 dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
343 set_bit(CLOSED, &con->state); /* in case there's queued work */
344 clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
345 clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
346 clear_bit(KEEPALIVE_PENDING, &con->state);
347 clear_bit(WRITE_PENDING, &con->state);
348 mutex_lock(&con->mutex);
349 reset_connection(con);
350 cancel_delayed_work(&con->work);
351 mutex_unlock(&con->mutex);
352 queue_con(con);
353}
354
355/*
356 * Reopen a closed connection, with a new peer address.
357 */
358void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
359{
360 dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
361 set_bit(OPENING, &con->state);
362 clear_bit(CLOSED, &con->state);
363 memcpy(&con->peer_addr, addr, sizeof(*addr));
364 con->delay = 0; /* reset backoff memory */
365 queue_con(con);
366}
367
368/*
369 * generic get/put
370 */
371struct ceph_connection *ceph_con_get(struct ceph_connection *con)
372{
373 dout("con_get %p nref = %d -> %d\n", con,
374 atomic_read(&con->nref), atomic_read(&con->nref) + 1);
375 if (atomic_inc_not_zero(&con->nref))
376 return con;
377 return NULL;
378}
379
380void ceph_con_put(struct ceph_connection *con)
381{
382 dout("con_put %p nref = %d -> %d\n", con,
383 atomic_read(&con->nref), atomic_read(&con->nref) - 1);
384 BUG_ON(atomic_read(&con->nref) == 0);
385 if (atomic_dec_and_test(&con->nref)) {
386 BUG_ON(con->sock);
387 kfree(con);
388 }
389}
390
391/*
392 * initialize a new connection.
393 */
394void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
395{
396 dout("con_init %p\n", con);
397 memset(con, 0, sizeof(*con));
398 atomic_set(&con->nref, 1);
399 con->msgr = msgr;
400 mutex_init(&con->mutex);
401 INIT_LIST_HEAD(&con->out_queue);
402 INIT_LIST_HEAD(&con->out_sent);
403 INIT_DELAYED_WORK(&con->work, con_work);
404}
405
406
407/*
408 * We maintain a global counter to order connection attempts. Get
409 * a unique seq greater than @gt.
410 */
411static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
412{
413 u32 ret;
414
415 spin_lock(&msgr->global_seq_lock);
416 if (msgr->global_seq < gt)
417 msgr->global_seq = gt;
418 ret = ++msgr->global_seq;
419 spin_unlock(&msgr->global_seq_lock);
420 return ret;
421}
422
423
424/*
425 * Prepare footer for currently outgoing message, and finish things
426 * off. Assumes out_kvec* are already valid.. we just add on to the end.
427 */
428static void prepare_write_message_footer(struct ceph_connection *con, int v)
429{
430 struct ceph_msg *m = con->out_msg;
431
432 dout("prepare_write_message_footer %p\n", con);
433 con->out_kvec_is_msg = true;
434 con->out_kvec[v].iov_base = &m->footer;
435 con->out_kvec[v].iov_len = sizeof(m->footer);
436 con->out_kvec_bytes += sizeof(m->footer);
437 con->out_kvec_left++;
438 con->out_more = m->more_to_follow;
439 con->out_msg_done = true;
440}
441
442/*
443 * Prepare headers for the next outgoing message.
444 */
445static void prepare_write_message(struct ceph_connection *con)
446{
447 struct ceph_msg *m;
448 int v = 0;
449
450 con->out_kvec_bytes = 0;
451 con->out_kvec_is_msg = true;
452 con->out_msg_done = false;
453
454 /* Sneak an ack in there first? If we can get it into the same
455 * TCP packet that's a good thing. */
456 if (con->in_seq > con->in_seq_acked) {
457 con->in_seq_acked = con->in_seq;
458 con->out_kvec[v].iov_base = &tag_ack;
459 con->out_kvec[v++].iov_len = 1;
460 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
461 con->out_kvec[v].iov_base = &con->out_temp_ack;
462 con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
463 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
464 }
465
466 m = list_first_entry(&con->out_queue,
467 struct ceph_msg, list_head);
468 con->out_msg = m;
469 if (test_bit(LOSSYTX, &con->state)) {
470 list_del_init(&m->list_head);
471 } else {
472 /* put message on sent list */
473 ceph_msg_get(m);
474 list_move_tail(&m->list_head, &con->out_sent);
475 }
476
477 m->hdr.seq = cpu_to_le64(++con->out_seq);
478
479 dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
480 m, con->out_seq, le16_to_cpu(m->hdr.type),
481 le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
482 le32_to_cpu(m->hdr.data_len),
483 m->nr_pages);
484 BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
485
486 /* tag + hdr + front + middle */
487 con->out_kvec[v].iov_base = &tag_msg;
488 con->out_kvec[v++].iov_len = 1;
489 con->out_kvec[v].iov_base = &m->hdr;
490 con->out_kvec[v++].iov_len = sizeof(m->hdr);
491 con->out_kvec[v++] = m->front;
492 if (m->middle)
493 con->out_kvec[v++] = m->middle->vec;
494 con->out_kvec_left = v;
495 con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
496 (m->middle ? m->middle->vec.iov_len : 0);
497 con->out_kvec_cur = con->out_kvec;
498
499 /* fill in crc (except data pages), footer */
500 con->out_msg->hdr.crc =
501 cpu_to_le32(crc32c(0, (void *)&m->hdr,
502 sizeof(m->hdr) - sizeof(m->hdr.crc)));
503 con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
504 con->out_msg->footer.front_crc =
505 cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
506 if (m->middle)
507 con->out_msg->footer.middle_crc =
508 cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
509 m->middle->vec.iov_len));
510 else
511 con->out_msg->footer.middle_crc = 0;
512 con->out_msg->footer.data_crc = 0;
513	dout("prepare_write_message front_crc %u middle_crc %u\n",
514 le32_to_cpu(con->out_msg->footer.front_crc),
515 le32_to_cpu(con->out_msg->footer.middle_crc));
516
517 /* is there a data payload? */
518 if (le32_to_cpu(m->hdr.data_len) > 0) {
519 /* initialize page iterator */
520 con->out_msg_pos.page = 0;
521 con->out_msg_pos.page_pos =
522 le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
523 con->out_msg_pos.data_pos = 0;
524 con->out_msg_pos.did_page_crc = 0;
525 con->out_more = 1; /* data + footer will follow */
526 } else {
527 /* no, queue up footer too and be done */
528 prepare_write_message_footer(con, v);
529 }
530
531 set_bit(WRITE_PENDING, &con->state);
532}
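
The kvec array assembled above ends up with the following layout for a message that piggybacks an ack and carries a middle section (a sketch; the footer slot is appended later by prepare_write_message_footer(), after any page data has gone out):

	/*
	 * out_kvec[0]  tag_ack        (1 byte)    \ only when an ack is
	 * out_kvec[1]  out_temp_ack   (8 bytes)   /  being piggybacked
	 * out_kvec[2]  tag_msg        (1 byte)
	 * out_kvec[3]  m->hdr         (message header, crc'd above)
	 * out_kvec[4]  m->front       (front payload)
	 * out_kvec[5]  m->middle->vec (optional middle section)
	 * out_kvec[6]  m->footer      (queued by prepare_write_message_footer())
	 */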
533
534/*
535 * Prepare an ack.
536 */
537static void prepare_write_ack(struct ceph_connection *con)
538{
539 dout("prepare_write_ack %p %llu -> %llu\n", con,
540 con->in_seq_acked, con->in_seq);
541 con->in_seq_acked = con->in_seq;
542
543 con->out_kvec[0].iov_base = &tag_ack;
544 con->out_kvec[0].iov_len = 1;
545 con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
546 con->out_kvec[1].iov_base = &con->out_temp_ack;
547 con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
548 con->out_kvec_left = 2;
549 con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
550 con->out_kvec_cur = con->out_kvec;
551 con->out_more = 1; /* more will follow.. eventually.. */
552 set_bit(WRITE_PENDING, &con->state);
553}
554
555/*
556 * Prepare to write keepalive byte.
557 */
558static void prepare_write_keepalive(struct ceph_connection *con)
559{
560 dout("prepare_write_keepalive %p\n", con);
561 con->out_kvec[0].iov_base = &tag_keepalive;
562 con->out_kvec[0].iov_len = 1;
563 con->out_kvec_left = 1;
564 con->out_kvec_bytes = 1;
565 con->out_kvec_cur = con->out_kvec;
566 set_bit(WRITE_PENDING, &con->state);
567}
568
569/*
570 * Connection negotiation.
571 */
572
573static void prepare_connect_authorizer(struct ceph_connection *con)
574{
575	void *auth_buf = NULL;
576 int auth_len = 0;
577 int auth_protocol = 0;
578
579 mutex_unlock(&con->mutex);
580 if (con->ops->get_authorizer)
581 con->ops->get_authorizer(con, &auth_buf, &auth_len,
582 &auth_protocol, &con->auth_reply_buf,
583 &con->auth_reply_buf_len,
584 con->auth_retry);
585 mutex_lock(&con->mutex);
586
587 con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
588 con->out_connect.authorizer_len = cpu_to_le32(auth_len);
589
590 con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
591 con->out_kvec[con->out_kvec_left].iov_len = auth_len;
592 con->out_kvec_left++;
593 con->out_kvec_bytes += auth_len;
594}
595
596/*
597 * We connected to a peer and are saying hello.
598 */
599static void prepare_write_banner(struct ceph_messenger *msgr,
600 struct ceph_connection *con)
601{
602 int len = strlen(CEPH_BANNER);
603
604 con->out_kvec[0].iov_base = CEPH_BANNER;
605 con->out_kvec[0].iov_len = len;
606 con->out_kvec[1].iov_base = &msgr->my_enc_addr;
607 con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
608 con->out_kvec_left = 2;
609 con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
610 con->out_kvec_cur = con->out_kvec;
611 con->out_more = 0;
612 set_bit(WRITE_PENDING, &con->state);
613}
614
615static void prepare_write_connect(struct ceph_messenger *msgr,
616 struct ceph_connection *con,
617 int after_banner)
618{
619 unsigned global_seq = get_global_seq(con->msgr, 0);
620 int proto;
621
622 switch (con->peer_name.type) {
623 case CEPH_ENTITY_TYPE_MON:
624 proto = CEPH_MONC_PROTOCOL;
625 break;
626 case CEPH_ENTITY_TYPE_OSD:
627 proto = CEPH_OSDC_PROTOCOL;
628 break;
629 case CEPH_ENTITY_TYPE_MDS:
630 proto = CEPH_MDSC_PROTOCOL;
631 break;
632 default:
633 BUG();
634 }
635
636 dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
637 con->connect_seq, global_seq, proto);
638
639	con->out_connect.features = cpu_to_le64(CEPH_FEATURE_SUPPORTED);
640 con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
641 con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
642 con->out_connect.global_seq = cpu_to_le32(global_seq);
643 con->out_connect.protocol_version = cpu_to_le32(proto);
644 con->out_connect.flags = 0;
645
646 if (!after_banner) {
647 con->out_kvec_left = 0;
648 con->out_kvec_bytes = 0;
649 }
650 con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
651 con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
652 con->out_kvec_left++;
653 con->out_kvec_bytes += sizeof(con->out_connect);
654 con->out_kvec_cur = con->out_kvec;
655 con->out_more = 0;
656 set_bit(WRITE_PENDING, &con->state);
657
658 prepare_connect_authorizer(con);
659}
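
Putting prepare_write_banner() and prepare_write_connect() together with the read side below (read_partial_banner()/process_banner()/process_connect()), the client half of the negotiation looks roughly like this (a sketch; the server's side is inferred, not part of this file):

	/*
	 * client -> server: CEPH_BANNER + client's encoded ceph_entity_addr
	 * server -> client: CEPH_BANNER + server's addr + the client's addr
	 *                   as the server sees it (peer_addr_for_me)
	 * client -> server: struct ceph_msg_connect + authorizer blob
	 * server -> client: struct ceph_msg_connect_reply; its tag is READY
	 *                   on success, or one of the RETRY_*, RESETSESSION,
	 *                   BADAUTHORIZER, WAIT, ... tags handled in
	 *                   process_connect()
	 */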
660
661
662/*
663 * write as much of the pending kvec data to the socket as we can.
664 * 1 -> done
665 * 0 -> socket full, but more to do
666 * <0 -> error
667 */
668static int write_partial_kvec(struct ceph_connection *con)
669{
670 int ret;
671
672 dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
673 while (con->out_kvec_bytes > 0) {
674 ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
675 con->out_kvec_left, con->out_kvec_bytes,
676 con->out_more);
677 if (ret <= 0)
678 goto out;
679 con->out_kvec_bytes -= ret;
680 if (con->out_kvec_bytes == 0)
681 break; /* done */
682 while (ret > 0) {
683 if (ret >= con->out_kvec_cur->iov_len) {
684 ret -= con->out_kvec_cur->iov_len;
685 con->out_kvec_cur++;
686 con->out_kvec_left--;
687 } else {
688 con->out_kvec_cur->iov_len -= ret;
689 con->out_kvec_cur->iov_base += ret;
690 ret = 0;
691 break;
692 }
693 }
694 }
695 con->out_kvec_left = 0;
696 con->out_kvec_is_msg = false;
697 ret = 1;
698out:
699 dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
700 con->out_kvec_bytes, con->out_kvec_left, ret);
701 return ret; /* done! */
702}
703
704/*
705 * Write as much message data payload as we can. If we finish, queue
706 * up the footer.
707 * 1 -> done, footer is now queued in out_kvec[].
708 * 0 -> socket full, but more to do
709 * <0 -> error
710 */
711static int write_partial_msg_pages(struct ceph_connection *con)
712{
713 struct ceph_msg *msg = con->out_msg;
714 unsigned data_len = le32_to_cpu(msg->hdr.data_len);
715 size_t len;
716	int crc = !con->msgr->nocrc;
717 int ret;
718
719 dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
720 con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
721 con->out_msg_pos.page_pos);
722
723 while (con->out_msg_pos.page < con->out_msg->nr_pages) {
724 struct page *page = NULL;
725 void *kaddr = NULL;
726
727 /*
728 * if we are calculating the data crc (the default), we need
729 * to map the page. if our pages[] has been revoked, use the
730 * zero page.
731 */
732 if (msg->pages) {
733 page = msg->pages[con->out_msg_pos.page];
734 if (crc)
735 kaddr = kmap(page);
736 } else if (msg->pagelist) {
737 page = list_first_entry(&msg->pagelist->head,
738 struct page, lru);
739 if (crc)
740 kaddr = kmap(page);
741 } else {
742 page = con->msgr->zero_page;
743 if (crc)
744 kaddr = page_address(con->msgr->zero_page);
745 }
746 len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
747 (int)(data_len - con->out_msg_pos.data_pos));
748 if (crc && !con->out_msg_pos.did_page_crc) {
749 void *base = kaddr + con->out_msg_pos.page_pos;
750 u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
751
752 BUG_ON(kaddr == NULL);
753 con->out_msg->footer.data_crc =
754 cpu_to_le32(crc32c(tmpcrc, base, len));
755 con->out_msg_pos.did_page_crc = 1;
756 }
757
758 ret = kernel_sendpage(con->sock, page,
759 con->out_msg_pos.page_pos, len,
760 MSG_DONTWAIT | MSG_NOSIGNAL |
761 MSG_MORE);
762
763 if (crc && (msg->pages || msg->pagelist))
764 kunmap(page);
765
766 if (ret <= 0)
767 goto out;
768
769 con->out_msg_pos.data_pos += ret;
770 con->out_msg_pos.page_pos += ret;
771 if (ret == len) {
772 con->out_msg_pos.page_pos = 0;
773 con->out_msg_pos.page++;
774 con->out_msg_pos.did_page_crc = 0;
775 if (msg->pagelist)
776 list_move_tail(&page->lru,
777 &msg->pagelist->head);
778 }
779 }
780
781 dout("write_partial_msg_pages %p msg %p done\n", con, msg);
782
783 /* prepare and queue up footer, too */
784 if (!crc)
785 con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
786 con->out_kvec_bytes = 0;
787 con->out_kvec_left = 0;
788 con->out_kvec_cur = con->out_kvec;
789 prepare_write_message_footer(con, 0);
790 ret = 1;
791out:
792 return ret;
793}
794
795/*
796 * write some zeros
797 */
798static int write_partial_skip(struct ceph_connection *con)
799{
800 int ret;
801
802 while (con->out_skip > 0) {
803 struct kvec iov = {
804 .iov_base = page_address(con->msgr->zero_page),
805 .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
806 };
807
808 ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
809 if (ret <= 0)
810 goto out;
811 con->out_skip -= ret;
812 }
813 ret = 1;
814out:
815 return ret;
816}
817
818/*
819 * Prepare to read connection handshake, or an ack.
820 */
821static void prepare_read_banner(struct ceph_connection *con)
822{
823 dout("prepare_read_banner %p\n", con);
824 con->in_base_pos = 0;
825}
826
827static void prepare_read_connect(struct ceph_connection *con)
828{
829 dout("prepare_read_connect %p\n", con);
830 con->in_base_pos = 0;
831}
832
833static void prepare_read_connect_retry(struct ceph_connection *con)
834{
835 dout("prepare_read_connect_retry %p\n", con);
836 con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
837 + sizeof(con->peer_addr_for_me);
838}
839
840static void prepare_read_ack(struct ceph_connection *con)
841{
842 dout("prepare_read_ack %p\n", con);
843 con->in_base_pos = 0;
844}
845
846static void prepare_read_tag(struct ceph_connection *con)
847{
848 dout("prepare_read_tag %p\n", con);
849 con->in_base_pos = 0;
850 con->in_tag = CEPH_MSGR_TAG_READY;
851}
852
853/*
854 * Prepare to read a message.
855 */
856static int prepare_read_message(struct ceph_connection *con)
857{
858 dout("prepare_read_message %p\n", con);
859 BUG_ON(con->in_msg != NULL);
860 con->in_base_pos = 0;
861 con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;
862 return 0;
863}
864
865
866static int read_partial(struct ceph_connection *con,
867 int *to, int size, void *object)
868{
869 *to += size;
870 while (con->in_base_pos < *to) {
871 int left = *to - con->in_base_pos;
872 int have = size - left;
873 int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
874 if (ret <= 0)
875 return ret;
876 con->in_base_pos += ret;
877 }
878 return 1;
879}
880
881
882/*
883 * Read all or part of the connect-side handshake on a new connection
884 */
885static int read_partial_banner(struct ceph_connection *con)
886{
887 int ret, to = 0;
888
889 dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
890
891 /* peer's banner */
892 ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
893 if (ret <= 0)
894 goto out;
895 ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
896 &con->actual_peer_addr);
897 if (ret <= 0)
898 goto out;
899 ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
900 &con->peer_addr_for_me);
901 if (ret <= 0)
902 goto out;
903out:
904 return ret;
905}
906
907static int read_partial_connect(struct ceph_connection *con)
908{
909 int ret, to = 0;
910
911 dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
912
913 ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
914 if (ret <= 0)
915 goto out;
916 ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
917 con->auth_reply_buf);
918 if (ret <= 0)
919 goto out;
920
921 dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
922 con, (int)con->in_reply.tag,
923 le32_to_cpu(con->in_reply.connect_seq),
924 le32_to_cpu(con->in_reply.global_seq));
925out:
926 return ret;
927
928}
929
930/*
931 * Verify the hello banner looks okay.
932 */
933static int verify_hello(struct ceph_connection *con)
934{
935 if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
936 pr_err("connect to %s got bad banner\n",
937 pr_addr(&con->peer_addr.in_addr));
938 con->error_msg = "protocol error, bad banner";
939 return -1;
940 }
941 return 0;
942}
943
944static bool addr_is_blank(struct sockaddr_storage *ss)
945{
946 switch (ss->ss_family) {
947 case AF_INET:
948 return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
949 case AF_INET6:
950 return
951 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
952 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
953 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
954 ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
955 }
956 return false;
957}
958
959static int addr_port(struct sockaddr_storage *ss)
960{
961 switch (ss->ss_family) {
962 case AF_INET:
963 return ntohs(((struct sockaddr_in *)ss)->sin_port);
964 case AF_INET6:
965 return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);
966 }
967 return 0;
968}
969
970static void addr_set_port(struct sockaddr_storage *ss, int p)
971{
972 switch (ss->ss_family) {
973 case AF_INET:
974		((struct sockaddr_in *)ss)->sin_port = htons(p);
		break;
975	case AF_INET6:
976		((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
		break;
977 }
978}
979
980/*
981 * Parse an ip[:port] list into an addr array. Use the default
982 * monitor port if a port isn't specified.
983 */
984int ceph_parse_ips(const char *c, const char *end,
985 struct ceph_entity_addr *addr,
986 int max_count, int *count)
987{
988 int i;
989 const char *p = c;
990
991 dout("parse_ips on '%.*s'\n", (int)(end-c), c);
992 for (i = 0; i < max_count; i++) {
993 const char *ipend;
994 struct sockaddr_storage *ss = &addr[i].in_addr;
995 struct sockaddr_in *in4 = (void *)ss;
996 struct sockaddr_in6 *in6 = (void *)ss;
997 int port;
998
999 memset(ss, 0, sizeof(*ss));
1000 if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
1001 ',', &ipend)) {
1002 ss->ss_family = AF_INET;
1003 } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
1004 ',', &ipend)) {
1005 ss->ss_family = AF_INET6;
1006 } else {
1007 goto bad;
1008 }
1009 p = ipend;
1010
1011 /* port? */
1012 if (p < end && *p == ':') {
1013 port = 0;
1014 p++;
1015 while (p < end && *p >= '0' && *p <= '9') {
1016 port = (port * 10) + (*p - '0');
1017 p++;
1018 }
1019 if (port > 65535 || port == 0)
1020 goto bad;
1021 } else {
1022 port = CEPH_MON_PORT;
1023 }
1024
1025 addr_set_port(ss, port);
1026
1027 dout("parse_ips got %s\n", pr_addr(ss));
1028
1029 if (p == end)
1030 break;
1031 if (*p != ',')
1032 goto bad;
1033 p++;
1034 }
1035
1036 if (p != end)
1037 goto bad;
1038
1039 if (count)
1040 *count = i + 1;
1041 return 0;
1042
1043bad:
1044 pr_err("parse_ips bad ip '%s'\n", c);
1045 return -EINVAL;
1046}
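
A usage sketch for ceph_parse_ips() (the address string and calling context are illustrative only); note that an entry without an explicit port falls back to CEPH_MON_PORT:

	struct ceph_entity_addr mon_addr[CEPH_MAX_MON];
	int num_mon;
	const char *s = "192.168.0.1:6789,192.168.0.2";

	if (ceph_parse_ips(s, s + strlen(s), mon_addr,
			   CEPH_MAX_MON, &num_mon) == 0) {
		/* num_mon == 2; mon_addr[1] was given the default
		 * CEPH_MON_PORT since no port was specified */
	}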
1047
1048static int process_banner(struct ceph_connection *con)
1049{
1050 dout("process_banner on %p\n", con);
1051
1052 if (verify_hello(con) < 0)
1053 return -1;
1054
1055 ceph_decode_addr(&con->actual_peer_addr);
1056 ceph_decode_addr(&con->peer_addr_for_me);
1057
1058 /*
1059 * Make sure the other end is who we wanted. note that the other
1060 * end may not yet know their ip address, so if it's 0.0.0.0, give
1061 * them the benefit of the doubt.
1062 */
1063 if (memcmp(&con->peer_addr, &con->actual_peer_addr,
1064 sizeof(con->peer_addr)) != 0 &&
1065 !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
1066 con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
1067 pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
1068 pr_addr(&con->peer_addr.in_addr),
1069 le64_to_cpu(con->peer_addr.nonce),
1070 pr_addr(&con->actual_peer_addr.in_addr),
1071 le64_to_cpu(con->actual_peer_addr.nonce));
1072 con->error_msg = "wrong peer at address";
1073 return -1;
1074 }
1075
1076 /*
1077 * did we learn our address?
1078 */
1079 if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
1080 int port = addr_port(&con->msgr->inst.addr.in_addr);
1081
1082 memcpy(&con->msgr->inst.addr.in_addr,
1083 &con->peer_addr_for_me.in_addr,
1084 sizeof(con->peer_addr_for_me.in_addr));
1085 addr_set_port(&con->msgr->inst.addr.in_addr, port);
1086 encode_my_addr(con->msgr);
1087 dout("process_banner learned my addr is %s\n",
1088 pr_addr(&con->msgr->inst.addr.in_addr));
1089 }
1090
1091 set_bit(NEGOTIATING, &con->state);
1092 prepare_read_connect(con);
1093 return 0;
1094}
1095
1096static void fail_protocol(struct ceph_connection *con)
1097{
1098 reset_connection(con);
1099 set_bit(CLOSED, &con->state); /* in case there's queued work */
1100
1101 mutex_unlock(&con->mutex);
1102 if (con->ops->bad_proto)
1103 con->ops->bad_proto(con);
1104 mutex_lock(&con->mutex);
1105}
1106
1107static int process_connect(struct ceph_connection *con)
1108{
1109 u64 sup_feat = CEPH_FEATURE_SUPPORTED;
1110 u64 req_feat = CEPH_FEATURE_REQUIRED;
1111 u64 server_feat = le64_to_cpu(con->in_reply.features);
1112
1113 dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
1114
1115 switch (con->in_reply.tag) {
1116 case CEPH_MSGR_TAG_FEATURES:
1117 pr_err("%s%lld %s feature set mismatch,"
1118 " my %llx < server's %llx, missing %llx\n",
1119 ENTITY_NAME(con->peer_name),
1120 pr_addr(&con->peer_addr.in_addr),
1121 sup_feat, server_feat, server_feat & ~sup_feat);
1122 con->error_msg = "missing required protocol features";
1123 fail_protocol(con);
1124 return -1;
1125
1126 case CEPH_MSGR_TAG_BADPROTOVER:
1127 pr_err("%s%lld %s protocol version mismatch,"
1128 " my %d != server's %d\n",
1129 ENTITY_NAME(con->peer_name),
1130 pr_addr(&con->peer_addr.in_addr),
1131 le32_to_cpu(con->out_connect.protocol_version),
1132 le32_to_cpu(con->in_reply.protocol_version));
1133 con->error_msg = "protocol version mismatch";
1134 fail_protocol(con);
1135 return -1;
1136
1137 case CEPH_MSGR_TAG_BADAUTHORIZER:
1138 con->auth_retry++;
1139 dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
1140 con->auth_retry);
1141 if (con->auth_retry == 2) {
1142 con->error_msg = "connect authorization failure";
1143 reset_connection(con);
1144 set_bit(CLOSED, &con->state);
1145 return -1;
1146 }
1147 con->auth_retry = 1;
1148 prepare_write_connect(con->msgr, con, 0);
1149 prepare_read_connect_retry(con);
1150 break;
1151
1152 case CEPH_MSGR_TAG_RESETSESSION:
1153 /*
1154 * If we connected with a large connect_seq but the peer
1155 * has no record of a session with us (no connection, or
1156	 * connect_seq == 0), they will send RESETSESSION to indicate
1157 * that they must have reset their session, and may have
1158 * dropped messages.
1159 */
1160 dout("process_connect got RESET peer seq %u\n",
1161 le32_to_cpu(con->in_connect.connect_seq));
1162 pr_err("%s%lld %s connection reset\n",
1163 ENTITY_NAME(con->peer_name),
1164 pr_addr(&con->peer_addr.in_addr));
1165 reset_connection(con);
1166 prepare_write_connect(con->msgr, con, 0);
1167 prepare_read_connect(con);
1168
1169 /* Tell ceph about it. */
1170 mutex_unlock(&con->mutex);
1171 pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
1172 if (con->ops->peer_reset)
1173 con->ops->peer_reset(con);
1174 mutex_lock(&con->mutex);
1175 break;
1176
1177 case CEPH_MSGR_TAG_RETRY_SESSION:
1178 /*
1179 * If we sent a smaller connect_seq than the peer has, try
1180 * again with a larger value.
1181 */
1182 dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
1183 le32_to_cpu(con->out_connect.connect_seq),
1184 le32_to_cpu(con->in_connect.connect_seq));
1185 con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
1186 prepare_write_connect(con->msgr, con, 0);
1187 prepare_read_connect(con);
1188 break;
1189
1190 case CEPH_MSGR_TAG_RETRY_GLOBAL:
1191 /*
1192 * If we sent a smaller global_seq than the peer has, try
1193 * again with a larger value.
1194 */
1195 dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
1196 con->peer_global_seq,
1197 le32_to_cpu(con->in_connect.global_seq));
1198 get_global_seq(con->msgr,
1199 le32_to_cpu(con->in_connect.global_seq));
1200 prepare_write_connect(con->msgr, con, 0);
1201 prepare_read_connect(con);
1202 break;
1203
1204 case CEPH_MSGR_TAG_READY:
1205 if (req_feat & ~server_feat) {
1206 pr_err("%s%lld %s protocol feature mismatch,"
1207 " my required %llx > server's %llx, need %llx\n",
1208 ENTITY_NAME(con->peer_name),
1209 pr_addr(&con->peer_addr.in_addr),
1210 req_feat, server_feat, req_feat & ~server_feat);
1211 con->error_msg = "missing required protocol features";
1212 fail_protocol(con);
1213 return -1;
1214 }
1215 clear_bit(CONNECTING, &con->state);
1216 con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
1217 con->connect_seq++;
1218 dout("process_connect got READY gseq %d cseq %d (%d)\n",
1219 con->peer_global_seq,
1220 le32_to_cpu(con->in_reply.connect_seq),
1221 con->connect_seq);
1222 WARN_ON(con->connect_seq !=
1223 le32_to_cpu(con->in_reply.connect_seq));
1224
1225 if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
1226 set_bit(LOSSYTX, &con->state);
1227
1228 prepare_read_tag(con);
1229 break;
1230
1231 case CEPH_MSGR_TAG_WAIT:
1232 /*
1233 * If there is a connection race (we are opening
1234 * connections to each other), one of us may just have
1235 * to WAIT. This shouldn't happen if we are the
1236 * client.
1237 */
1238 pr_err("process_connect peer connecting WAIT\n");
1239		/* fall through */
1240 default:
1241 pr_err("connect protocol error, will retry\n");
1242 con->error_msg = "protocol error, garbage tag during connect";
1243 return -1;
1244 }
1245 return 0;
1246}
1247
1248
1249/*
1250 * read (part of) an ack
1251 */
1252static int read_partial_ack(struct ceph_connection *con)
1253{
1254 int to = 0;
1255
1256 return read_partial(con, &to, sizeof(con->in_temp_ack),
1257 &con->in_temp_ack);
1258}
1259
1260
1261/*
1262 * We can finally discard anything that's been acked.
1263 */
1264static void process_ack(struct ceph_connection *con)
1265{
1266 struct ceph_msg *m;
1267 u64 ack = le64_to_cpu(con->in_temp_ack);
1268 u64 seq;
1269
1270 while (!list_empty(&con->out_sent)) {
1271 m = list_first_entry(&con->out_sent, struct ceph_msg,
1272 list_head);
1273 seq = le64_to_cpu(m->hdr.seq);
1274 if (seq > ack)
1275 break;
1276 dout("got ack for seq %llu type %d at %p\n", seq,
1277 le16_to_cpu(m->hdr.type), m);
1278 ceph_msg_remove(m);
1279 }
1280 prepare_read_tag(con);
1281}
1282
1283
1284
1285
1286static int read_partial_message_section(struct ceph_connection *con,
1287 struct kvec *section, unsigned int sec_len,
1288 u32 *crc)
1289{
1290 int left;
1291 int ret;
1292
1293 BUG_ON(!section);
1294
1295 while (section->iov_len < sec_len) {
1296 BUG_ON(section->iov_base == NULL);
1297 left = sec_len - section->iov_len;
1298 ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
1299 section->iov_len, left);
1300 if (ret <= 0)
1301 return ret;
1302 section->iov_len += ret;
1303 if (section->iov_len == sec_len)
1304 *crc = crc32c(0, section->iov_base,
1305 section->iov_len);
1306 }
1307
1308 return 1;
1309}
1310
1311static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
1312 struct ceph_msg_header *hdr,
1313 int *skip);
1314/*
1315 * read (part of) a message.
1316 */
1317static int read_partial_message(struct ceph_connection *con)
1318{
1319 struct ceph_msg *m = con->in_msg;
1320 void *p;
1321 int ret;
1322 int to, left;
1323 unsigned front_len, middle_len, data_len, data_off;
1324	int datacrc = !con->msgr->nocrc;
1325	int skip = 0;
1326
1327 dout("read_partial_message con %p msg %p\n", con, m);
1328
1329 /* header */
1330 while (con->in_base_pos < sizeof(con->in_hdr)) {
1331 left = sizeof(con->in_hdr) - con->in_base_pos;
1332 ret = ceph_tcp_recvmsg(con->sock,
1333 (char *)&con->in_hdr + con->in_base_pos,
1334 left);
1335 if (ret <= 0)
1336 return ret;
1337 con->in_base_pos += ret;
1338 if (con->in_base_pos == sizeof(con->in_hdr)) {
1339 u32 crc = crc32c(0, (void *)&con->in_hdr,
1340 sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
1341 if (crc != le32_to_cpu(con->in_hdr.crc)) {
1342 pr_err("read_partial_message bad hdr "
1343 " crc %u != expected %u\n",
1344 crc, con->in_hdr.crc);
1345 return -EBADMSG;
1346 }
1347 }
1348 }
1349 front_len = le32_to_cpu(con->in_hdr.front_len);
1350 if (front_len > CEPH_MSG_MAX_FRONT_LEN)
1351 return -EIO;
1352 middle_len = le32_to_cpu(con->in_hdr.middle_len);
1353 if (middle_len > CEPH_MSG_MAX_DATA_LEN)
1354 return -EIO;
1355 data_len = le32_to_cpu(con->in_hdr.data_len);
1356 if (data_len > CEPH_MSG_MAX_DATA_LEN)
1357 return -EIO;
1358 data_off = le16_to_cpu(con->in_hdr.data_off);
1359
1360 /* allocate message? */
1361 if (!con->in_msg) {
1362 dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
1363 con->in_hdr.front_len, con->in_hdr.data_len);
1364 con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
1365 if (skip) {
1366 /* skip this message */
1367 dout("alloc_msg returned NULL, skipping message\n");
1368 con->in_base_pos = -front_len - middle_len - data_len -
1369 sizeof(m->footer);
1370 con->in_tag = CEPH_MSGR_TAG_READY;
1371 return 0;
1372 }
1373 if (IS_ERR(con->in_msg)) {
1374 ret = PTR_ERR(con->in_msg);
1375 con->in_msg = NULL;
1376 con->error_msg =
1377 "error allocating memory for incoming message";
1378 return ret;
1379 }
1380 m = con->in_msg;
1381 m->front.iov_len = 0; /* haven't read it yet */
1382 if (m->middle)
1383 m->middle->vec.iov_len = 0;
1384
1385 con->in_msg_pos.page = 0;
1386 con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
1387 con->in_msg_pos.data_pos = 0;
1388 }
1389
1390 /* front */
1391 ret = read_partial_message_section(con, &m->front, front_len,
1392 &con->in_front_crc);
1393 if (ret <= 0)
1394 return ret;
1395
1396 /* middle */
1397 if (m->middle) {
1398 ret = read_partial_message_section(con, &m->middle->vec, middle_len,
1399 &con->in_middle_crc);
1400 if (ret <= 0)
1401 return ret;
1402 }
1403
1404 /* (page) data */
1405 while (con->in_msg_pos.data_pos < data_len) {
1406 left = min((int)(data_len - con->in_msg_pos.data_pos),
1407 (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
1408 BUG_ON(m->pages == NULL);
1409 p = kmap(m->pages[con->in_msg_pos.page]);
1410 ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
1411 left);
1412 if (ret > 0 && datacrc)
1413 con->in_data_crc =
1414 crc32c(con->in_data_crc,
1415 p + con->in_msg_pos.page_pos, ret);
1416 kunmap(m->pages[con->in_msg_pos.page]);
1417 if (ret <= 0)
1418 return ret;
1419 con->in_msg_pos.data_pos += ret;
1420 con->in_msg_pos.page_pos += ret;
1421 if (con->in_msg_pos.page_pos == PAGE_SIZE) {
1422 con->in_msg_pos.page_pos = 0;
1423 con->in_msg_pos.page++;
1424 }
1425 }
1426
1427 /* footer */
1428 to = sizeof(m->hdr) + sizeof(m->footer);
1429 while (con->in_base_pos < to) {
1430 left = to - con->in_base_pos;
1431 ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
1432 (con->in_base_pos - sizeof(m->hdr)),
1433 left);
1434 if (ret <= 0)
1435 return ret;
1436 con->in_base_pos += ret;
1437 }
1438 dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
1439 m, front_len, m->footer.front_crc, middle_len,
1440 m->footer.middle_crc, data_len, m->footer.data_crc);
1441
1442 /* crc ok? */
1443 if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
1444 pr_err("read_partial_message %p front crc %u != exp. %u\n",
1445 m, con->in_front_crc, m->footer.front_crc);
1446 return -EBADMSG;
1447 }
1448 if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
1449 pr_err("read_partial_message %p middle crc %u != exp %u\n",
1450 m, con->in_middle_crc, m->footer.middle_crc);
1451 return -EBADMSG;
1452 }
1453 if (datacrc &&
1454 (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
1455 con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
1456 pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
1457 con->in_data_crc, le32_to_cpu(m->footer.data_crc));
1458 return -EBADMSG;
1459 }
1460
1461 return 1; /* done! */
1462}
1463
1464/*
1465 * Process message. This happens in the worker thread. The callback should
1466 * be careful not to do anything that waits on other incoming messages or it
1467 * may deadlock.
1468 */
1469static void process_message(struct ceph_connection *con)
1470{
1471 struct ceph_msg *msg;
1472
1473 msg = con->in_msg;
1474 con->in_msg = NULL;
1475
1476 /* if first message, set peer_name */
1477 if (con->peer_name.type == 0)
1478 con->peer_name = msg->hdr.src.name;
1479
1480 con->in_seq++;
1481 mutex_unlock(&con->mutex);
1482
1483 dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
1484 msg, le64_to_cpu(msg->hdr.seq),
1485 ENTITY_NAME(msg->hdr.src.name),
1486 le16_to_cpu(msg->hdr.type),
1487 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1488 le32_to_cpu(msg->hdr.front_len),
1489 le32_to_cpu(msg->hdr.data_len),
1490 con->in_front_crc, con->in_middle_crc, con->in_data_crc);
1491 con->ops->dispatch(con, msg);
1492
1493 mutex_lock(&con->mutex);
1494 prepare_read_tag(con);
1495}
1496
1497
1498/*
1499 * Write something to the socket. Called in a worker thread when the
1500 * socket appears to be writeable and we have something ready to send.
1501 */
1502static int try_write(struct ceph_connection *con)
1503{
1504 struct ceph_messenger *msgr = con->msgr;
1505 int ret = 1;
1506
1507 dout("try_write start %p state %lu nref %d\n", con, con->state,
1508 atomic_read(&con->nref));
1509
1510 mutex_lock(&con->mutex);
1511more:
1512 dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
1513
1514 /* open the socket first? */
1515 if (con->sock == NULL) {
1516 /*
1517 * if we were STANDBY and are reconnecting _this_
1518 * connection, bump connect_seq now. Always bump
1519 * global_seq.
1520 */
1521 if (test_and_clear_bit(STANDBY, &con->state))
1522 con->connect_seq++;
1523
1524 prepare_write_banner(msgr, con);
1525 prepare_write_connect(msgr, con, 1);
1526 prepare_read_banner(con);
1527 set_bit(CONNECTING, &con->state);
1528 clear_bit(NEGOTIATING, &con->state);
1529
1530 BUG_ON(con->in_msg);
1531 con->in_tag = CEPH_MSGR_TAG_READY;
1532 dout("try_write initiating connect on %p new state %lu\n",
1533 con, con->state);
1534 con->sock = ceph_tcp_connect(con);
1535 if (IS_ERR(con->sock)) {
1536 con->sock = NULL;
1537 con->error_msg = "connect error";
1538 ret = -1;
1539 goto out;
1540 }
1541 }
1542
1543more_kvec:
1544 /* kvec data queued? */
1545 if (con->out_skip) {
1546 ret = write_partial_skip(con);
1547 if (ret <= 0)
1548 goto done;
1553 }
1554 if (con->out_kvec_left) {
1555 ret = write_partial_kvec(con);
1556 if (ret <= 0)
1557 goto done;
1558 }
1559
1560 /* msg pages? */
1561 if (con->out_msg) {
1562 if (con->out_msg_done) {
1563 ceph_msg_put(con->out_msg);
1564 con->out_msg = NULL; /* we're done with this one */
1565 goto do_next;
1566 }
1567
1568 ret = write_partial_msg_pages(con);
1569 if (ret == 1)
1570 goto more_kvec; /* we need to send the footer, too! */
1571 if (ret == 0)
1572 goto done;
1573 if (ret < 0) {
1574 dout("try_write write_partial_msg_pages err %d\n",
1575 ret);
1576 goto done;
1577 }
1578 }
1579
1580do_next:
1581 if (!test_bit(CONNECTING, &con->state)) {
1582 /* is anything else pending? */
1583 if (!list_empty(&con->out_queue)) {
1584 prepare_write_message(con);
1585 goto more;
1586 }
1587 if (con->in_seq > con->in_seq_acked) {
1588 prepare_write_ack(con);
1589 goto more;
1590 }
1591 if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
1592 prepare_write_keepalive(con);
1593 goto more;
1594 }
1595 }
1596
1597 /* Nothing to do! */
1598 clear_bit(WRITE_PENDING, &con->state);
1599 dout("try_write nothing else to write.\n");
1600done:
1601 ret = 0;
1602out:
1603 mutex_unlock(&con->mutex);
1604 dout("try_write done on %p\n", con);
1605 return ret;
1606}
1607
1608
1609
1610/*
1611 * Read what we can from the socket.
1612 */
1613static int try_read(struct ceph_connection *con)
1614{
1615 struct ceph_messenger *msgr;
1616 int ret = -1;
1617
1618 if (!con->sock)
1619 return 0;
1620
1621 if (test_bit(STANDBY, &con->state))
1622 return 0;
1623
1624 dout("try_read start on %p\n", con);
1625 msgr = con->msgr;
1626
1627 mutex_lock(&con->mutex);
1628
1629more:
1630 dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
1631 con->in_base_pos);
1632 if (test_bit(CONNECTING, &con->state)) {
1633 if (!test_bit(NEGOTIATING, &con->state)) {
1634 dout("try_read connecting\n");
1635 ret = read_partial_banner(con);
1636 if (ret <= 0)
1637 goto done;
1638 if (process_banner(con) < 0) {
1639 ret = -1;
1640 goto out;
1641 }
1642 }
1643 ret = read_partial_connect(con);
1644 if (ret <= 0)
1645 goto done;
1646 if (process_connect(con) < 0) {
1647 ret = -1;
1648 goto out;
1649 }
1650 goto more;
1651 }
1652
1653 if (con->in_base_pos < 0) {
1654 /*
1655 * skipping + discarding content.
1656 *
1657 * FIXME: there must be a better way to do this!
1658 */
1659 static char buf[1024];
1660 int skip = min(1024, -con->in_base_pos);
1661 dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
1662 ret = ceph_tcp_recvmsg(con->sock, buf, skip);
1663 if (ret <= 0)
1664 goto done;
1665 con->in_base_pos += ret;
1666 if (con->in_base_pos)
1667 goto more;
1668 }
1669 if (con->in_tag == CEPH_MSGR_TAG_READY) {
1670 /*
1671 * what's next?
1672 */
1673 ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
1674 if (ret <= 0)
1675 goto done;
1676 dout("try_read got tag %d\n", (int)con->in_tag);
1677 switch (con->in_tag) {
1678 case CEPH_MSGR_TAG_MSG:
1679 prepare_read_message(con);
1680 break;
1681 case CEPH_MSGR_TAG_ACK:
1682 prepare_read_ack(con);
1683 break;
1684 case CEPH_MSGR_TAG_CLOSE:
1685 set_bit(CLOSED, &con->state); /* fixme */
1686 goto done;
1687 default:
1688 goto bad_tag;
1689 }
1690 }
1691 if (con->in_tag == CEPH_MSGR_TAG_MSG) {
1692 ret = read_partial_message(con);
1693 if (ret <= 0) {
1694 switch (ret) {
1695 case -EBADMSG:
1696 con->error_msg = "bad crc";
1697 ret = -EIO;
1698 goto out;
1699 case -EIO:
1700 con->error_msg = "io error";
1701 goto out;
1702 default:
1703 goto done;
1704 }
1705 }
1706 if (con->in_tag == CEPH_MSGR_TAG_READY)
1707 goto more;
1708 process_message(con);
1709 goto more;
1710 }
1711 if (con->in_tag == CEPH_MSGR_TAG_ACK) {
1712 ret = read_partial_ack(con);
1713 if (ret <= 0)
1714 goto done;
1715 process_ack(con);
1716 goto more;
1717 }
1718
1719done:
1720 ret = 0;
1721out:
1722 mutex_unlock(&con->mutex);
1723 dout("try_read done on %p\n", con);
1724 return ret;
1725
1726bad_tag:
1727 pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
1728 con->error_msg = "protocol error, garbage tag";
1729 ret = -1;
1730 goto out;
1731}
1732
1733
1734/*
1735 * Atomically queue work on a connection. Bump @con reference to
1736 * avoid races with connection teardown.
1737 *
1738 * There is some trickery going on with QUEUED and BUSY because we
1739 * only want a _single_ thread operating on each connection at any
1740 * point in time, but we want to use all available CPUs.
1741 *
1742 * The worker thread only proceeds if it can atomically set BUSY. It
1743 * clears QUEUED and does it's thing. When it thinks it's done, it
1744 * clears QUEUED and does its thing. When it thinks it's done, it
1745 * (tries again to set BUSY).
1746 *
1747 * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
1748 * try to queue work. If that fails (work is already queued, or BUSY)
1749 * we give up (the work is already being done or queued) but leave QUEUED
1750 * set so that the worker thread will loop if necessary.
1751 */
1752static void queue_con(struct ceph_connection *con)
1753{
1754 if (test_bit(DEAD, &con->state)) {
1755 dout("queue_con %p ignoring: DEAD\n",
1756 con);
1757 return;
1758 }
1759
1760 if (!con->ops->get(con)) {
1761 dout("queue_con %p ref count 0\n", con);
1762 return;
1763 }
1764
1765 set_bit(QUEUED, &con->state);
1766 if (test_bit(BUSY, &con->state)) {
1767 dout("queue_con %p - already BUSY\n", con);
1768 con->ops->put(con);
1769 } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
1770 dout("queue_con %p - already queued\n", con);
1771 con->ops->put(con);
1772 } else {
1773 dout("queue_con %p\n", con);
1774 }
1775}
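
To make the QUEUED/BUSY scheme concrete, one possible interleaving between the worker running con_work() (below) and another CPU calling queue_con() (a sketch, not code from this patch):

	/*
	 * worker: con_work()              other cpu: queue_con()
	 * -------------------------       ------------------------------
	 * test_and_set_bit(BUSY) -> 0
	 * clear_bit(QUEUED)
	 * try_read()/try_write() ...      set_bit(QUEUED)
	 *                                 test_bit(BUSY) -> 1, so give up,
	 *                                 leaving QUEUED set
	 * clear_bit(BUSY)
	 * test_bit(QUEUED) -> 1, so loop:
	 * set BUSY and run once more
	 */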
1776
1777/*
1778 * Do some work on a connection. Drop a connection ref when we're done.
1779 */
1780static void con_work(struct work_struct *work)
1781{
1782 struct ceph_connection *con = container_of(work, struct ceph_connection,
1783 work.work);
1784 int backoff = 0;
1785
1786more:
1787 if (test_and_set_bit(BUSY, &con->state) != 0) {
1788 dout("con_work %p BUSY already set\n", con);
1789 goto out;
1790 }
1791 dout("con_work %p start, clearing QUEUED\n", con);
1792 clear_bit(QUEUED, &con->state);
1793
1794 if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
1795 dout("con_work CLOSED\n");
1796 con_close_socket(con);
1797 goto done;
1798 }
1799 if (test_and_clear_bit(OPENING, &con->state)) {
1800 /* reopen w/ new peer */
1801 dout("con_work OPENING\n");
1802 con_close_socket(con);
1803 }
1804
1805 if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
1806 try_read(con) < 0 ||
1807 try_write(con) < 0) {
1808 backoff = 1;
1809 ceph_fault(con); /* error/fault path */
1810 }
1811
1812done:
1813 clear_bit(BUSY, &con->state);
1814 dout("con->state=%lu\n", con->state);
1815 if (test_bit(QUEUED, &con->state)) {
1816 if (!backoff || test_bit(OPENING, &con->state)) {
1817 dout("con_work %p QUEUED reset, looping\n", con);
1818 goto more;
1819 }
1820 dout("con_work %p QUEUED reset, but just faulted\n", con);
1821 clear_bit(QUEUED, &con->state);
1822 }
1823 dout("con_work %p done\n", con);
1824
1825out:
1826 con->ops->put(con);
1827}
1828
1829
1830/*
1831 * Generic error/fault handler. A retry mechanism is used with
1832 * exponential backoff
1833 */
1834static void ceph_fault(struct ceph_connection *con)
1835{
1836 pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
1837 pr_addr(&con->peer_addr.in_addr), con->error_msg);
1838 dout("fault %p state %lu to peer %s\n",
1839 con, con->state, pr_addr(&con->peer_addr.in_addr));
1840
1841 if (test_bit(LOSSYTX, &con->state)) {
1842 dout("fault on LOSSYTX channel\n");
1843 goto out;
1844 }
1845
1846 clear_bit(BUSY, &con->state); /* to avoid an improbable race */
1847
1848 mutex_lock(&con->mutex);
1849 if (test_bit(CLOSED, &con->state))
1850 goto out_unlock;
1851
1852 con_close_socket(con);
1853
1854 if (con->in_msg) {
1855 ceph_msg_put(con->in_msg);
1856 con->in_msg = NULL;
1857 }
1858
1859 /* Requeue anything that hasn't been acked */
1860 list_splice_init(&con->out_sent, &con->out_queue);
1861
1862 /* If there are no messages in the queue, place the connection
1863 * in a STANDBY state (i.e., don't try to reconnect just yet). */
1864 if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
1865 dout("fault setting STANDBY\n");
1866 set_bit(STANDBY, &con->state);
1867 } else {
1868 /* retry after a delay. */
1869 if (con->delay == 0)
1870 con->delay = BASE_DELAY_INTERVAL;
1871 else if (con->delay < MAX_DELAY_INTERVAL)
1872 con->delay *= 2;
1873 dout("fault queueing %p delay %lu\n", con, con->delay);
1874 con->ops->get(con);
1875 if (queue_delayed_work(ceph_msgr_wq, &con->work,
1876 round_jiffies_relative(con->delay)) == 0)
1877 con->ops->put(con);
1878 }
1879
1880out_unlock:
1881 mutex_unlock(&con->mutex);
1882out:
1883 /*
1884 * in case we faulted due to authentication, invalidate our
1885 * current tickets so that we can get new ones.
1886 */
1887 if (con->auth_retry && con->ops->invalidate_authorizer) {
1888 dout("calling invalidate_authorizer()\n");
1889 con->ops->invalidate_authorizer(con);
1890 }
1891
1892 if (con->ops->fault)
1893 con->ops->fault(con);
1894}
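
With the defaults from messenger.h (BASE_DELAY_INTERVAL = HZ/2, MAX_DELAY_INTERVAL = 5*60*HZ), the retry delay for a connection that keeps faulting roughly doubles each time:

	/*
	 * fault #:    1      2     3     4     5    ...
	 * delay  :  0.5s    1s    2s    4s    8s    ... capped at ~5 minutes
	 */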
1895
1896
1897
1898/*
1899 * create a new messenger instance
1900 */
1901struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
1902{
1903 struct ceph_messenger *msgr;
1904
1905 msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
1906 if (msgr == NULL)
1907 return ERR_PTR(-ENOMEM);
1908
1909 spin_lock_init(&msgr->global_seq_lock);
1910
1911 /* the zero page is needed if a request is "canceled" while the message
1912 * is being written over the socket */
1913 msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
1914 if (!msgr->zero_page) {
1915 kfree(msgr);
1916 return ERR_PTR(-ENOMEM);
1917 }
1918 kmap(msgr->zero_page);
1919
1920 if (myaddr)
1921 msgr->inst.addr = *myaddr;
1922
1923 /* select a random nonce */
1924 msgr->inst.addr.type = 0;
1925 get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
1926 encode_my_addr(msgr);
1927
1928 dout("messenger_create %p\n", msgr);
1929 return msgr;
1930}
1931
1932void ceph_messenger_destroy(struct ceph_messenger *msgr)
1933{
1934 dout("destroy %p\n", msgr);
1935 kunmap(msgr->zero_page);
1936 __free_page(msgr->zero_page);
1937 kfree(msgr);
1938 dout("destroyed messenger %p\n", msgr);
1939}
1940
1941/*
1942 * Queue up an outgoing message on the given connection.
1943 */
1944void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
1945{
1946 if (test_bit(CLOSED, &con->state)) {
1947 dout("con_send %p closed, dropping %p\n", con, msg);
1948 ceph_msg_put(msg);
1949 return;
1950 }
1951
1952 /* set src+dst */
1953 msg->hdr.src.name = con->msgr->inst.name;
1954 msg->hdr.src.addr = con->msgr->my_enc_addr;
1955 msg->hdr.orig_src = msg->hdr.src;
1956
1957 BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
1958
1959 /* queue */
1960 mutex_lock(&con->mutex);
1961 BUG_ON(!list_empty(&msg->list_head));
1962 list_add_tail(&msg->list_head, &con->out_queue);
1963 dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
1964 ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
1965 ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
1966 le32_to_cpu(msg->hdr.front_len),
1967 le32_to_cpu(msg->hdr.middle_len),
1968 le32_to_cpu(msg->hdr.data_len));
1969 mutex_unlock(&con->mutex);
1970
1971 /* if there wasn't anything waiting to send before, queue
1972 * new work */
1973 if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
1974 queue_con(con);
1975}
1976
1977/*
1978 * Revoke a message that was previously queued for send
1979 */
1980void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
1981{
1982 mutex_lock(&con->mutex);
1983 if (!list_empty(&msg->list_head)) {
1984 dout("con_revoke %p msg %p\n", con, msg);
1985 list_del_init(&msg->list_head);
1986 ceph_msg_put(msg);
1987 msg->hdr.seq = 0;
1988 if (con->out_msg == msg) {
1989 ceph_msg_put(con->out_msg);
1990 con->out_msg = NULL;
1991 }
1992 if (con->out_kvec_is_msg) {
1993 con->out_skip = con->out_kvec_bytes;
1994 con->out_kvec_is_msg = false;
1995 }
1996 } else {
1997 dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
1998 }
1999 mutex_unlock(&con->mutex);
2000}
2001
2002/*
2003 * Revoke a message that we may be reading data into
2004 */
2005void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
2006{
2007 mutex_lock(&con->mutex);
2008 if (con->in_msg && con->in_msg == msg) {
2009 unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
2010 unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
2011 unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
2012
2013 /* skip rest of message */
2014 dout("con_revoke_pages %p msg %p revoked\n", con, msg);
2015 con->in_base_pos = con->in_base_pos -
2016 sizeof(struct ceph_msg_header) -
2017 front_len -
2018 middle_len -
2019 data_len -
2020 sizeof(struct ceph_msg_footer);
2021 ceph_msg_put(con->in_msg);
2022 con->in_msg = NULL;
2023 con->in_tag = CEPH_MSGR_TAG_READY;
2024 } else {
2025 dout("con_revoke_pages %p msg %p pages %p no-op\n",
2026 con, con->in_msg, msg);
2027 }
2028 mutex_unlock(&con->mutex);
2029}
2030
2031/*
2032 * Queue a keepalive byte to ensure the tcp connection is alive.
2033 */
2034void ceph_con_keepalive(struct ceph_connection *con)
2035{
2036 if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
2037 test_and_set_bit(WRITE_PENDING, &con->state) == 0)
2038 queue_con(con);
2039}
2040
2041
2042/*
2043 * construct a new message with given type, size
2044 * the new msg has a ref count of 1.
2045 */
2046struct ceph_msg *ceph_msg_new(int type, int front_len,
2047 int page_len, int page_off, struct page **pages)
2048{
2049 struct ceph_msg *m;
2050
2051 m = kmalloc(sizeof(*m), GFP_NOFS);
2052 if (m == NULL)
2053 goto out;
2054 kref_init(&m->kref);
2055 INIT_LIST_HEAD(&m->list_head);
2056
2057 m->hdr.type = cpu_to_le16(type);
2058 m->hdr.front_len = cpu_to_le32(front_len);
2059 m->hdr.middle_len = 0;
2060 m->hdr.data_len = cpu_to_le32(page_len);
2061 m->hdr.data_off = cpu_to_le16(page_off);
2062 m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
2063 m->footer.front_crc = 0;
2064 m->footer.middle_crc = 0;
2065 m->footer.data_crc = 0;
2066 m->front_max = front_len;
2067 m->front_is_vmalloc = false;
2068 m->more_to_follow = false;
2069 m->pool = NULL;
2070
2071 /* front */
2072 if (front_len) {
2073 if (front_len > PAGE_CACHE_SIZE) {
2074 m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
2075 PAGE_KERNEL);
2076 m->front_is_vmalloc = true;
2077 } else {
2078 m->front.iov_base = kmalloc(front_len, GFP_NOFS);
2079 }
2080 if (m->front.iov_base == NULL) {
2081 pr_err("msg_new can't allocate %d bytes\n",
2082 front_len);
2083 goto out2;
2084 }
2085 } else {
2086 m->front.iov_base = NULL;
2087 }
2088 m->front.iov_len = front_len;
2089
2090 /* middle */
2091 m->middle = NULL;
2092
2093 /* data */
2094 m->nr_pages = calc_pages_for(page_off, page_len);
2095 m->pages = pages;
2096 m->pagelist = NULL;
2097
2098 dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
2099 m->nr_pages);
2100 return m;
2101
2102out2:
2103 ceph_msg_put(m);
2104out:
2105 pr_err("msg_new can't create type %d len %d\n", type, front_len);
2106 return ERR_PTR(-ENOMEM);
2107}
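
A minimal sending sketch, assuming the CEPH_MSG_STATFS message type and struct ceph_mon_statfs from ceph_fs.h (the calling context is hypothetical). Note that ceph_con_send() takes over the caller's reference: the send path puts it once the message has been written (lossy channel) or acked by the peer:

	struct ceph_msg *msg;

	msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(struct ceph_mon_statfs),
			   0, 0, NULL);		/* no page data */
	if (IS_ERR(msg))
		return PTR_ERR(msg);
	/* ... fill in msg->front.iov_base ... */
	ceph_con_send(con, msg);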
2108
2109/*
2110 * Allocate "middle" portion of a message, if it is needed and wasn't
2111 * allocated by alloc_msg. This allows us to read a small fixed-size
2112 * per-type header in the front and then gracefully fail (i.e.,
2113 * propagate the error to the caller based on info in the front) when
2114 * the middle is too large.
2115 */
2116static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
2117{
2118 int type = le16_to_cpu(msg->hdr.type);
2119 int middle_len = le32_to_cpu(msg->hdr.middle_len);
2120
2121 dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
2122 ceph_msg_type_name(type), middle_len);
2123 BUG_ON(!middle_len);
2124 BUG_ON(msg->middle);
2125
2126 msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
2127 if (!msg->middle)
2128 return -ENOMEM;
2129 return 0;
2130}
2131
2132/*
2133 * Generic message allocator, for incoming messages.
2134 */
2135static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
2136 struct ceph_msg_header *hdr,
2137 int *skip)
2138{
2139 int type = le16_to_cpu(hdr->type);
2140 int front_len = le32_to_cpu(hdr->front_len);
2141 int middle_len = le32_to_cpu(hdr->middle_len);
2142 struct ceph_msg *msg = NULL;
2143 int ret;
2144
2145 if (con->ops->alloc_msg) {
2146 mutex_unlock(&con->mutex);
2147 msg = con->ops->alloc_msg(con, hdr, skip);
2148 mutex_lock(&con->mutex);
2149 if (IS_ERR(msg))
2150 return msg;
2151
2152 if (*skip)
2153 return NULL;
2154 }
2155 if (!msg) {
2156 *skip = 0;
2157 msg = ceph_msg_new(type, front_len, 0, 0, NULL);
2158		if (IS_ERR(msg)) {
2159			pr_err("unable to allocate msg type %d len %d\n",
2160				type, front_len);
2161			return msg;
2162 }
2163 }
2164 memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
2165
2166 if (middle_len) {
2167 ret = ceph_alloc_middle(con, msg);
2168
2169 if (ret < 0) {
2170 ceph_msg_put(msg);
2171			return ERR_PTR(ret);
2172 }
2173 }
2174
2175 return msg;
2176}
2177
2178
2179/*
2180 * Free a generically kmalloc'd message.
2181 */
2182void ceph_msg_kfree(struct ceph_msg *m)
2183{
2184 dout("msg_kfree %p\n", m);
2185 if (m->front_is_vmalloc)
2186 vfree(m->front.iov_base);
2187 else
2188 kfree(m->front.iov_base);
2189 kfree(m);
2190}
2191
2192/*
2193 * Drop a msg ref. Destroy as needed.
2194 */
2195void ceph_msg_last_put(struct kref *kref)
2196{
2197 struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
2198
2199 dout("ceph_msg_put last one on %p\n", m);
2200 WARN_ON(!list_empty(&m->list_head));
2201
2202 /* drop middle, data, if any */
2203 if (m->middle) {
2204 ceph_buffer_put(m->middle);
2205 m->middle = NULL;
2206 }
2207 m->nr_pages = 0;
2208 m->pages = NULL;
2209
2210 if (m->pagelist) {
2211 ceph_pagelist_release(m->pagelist);
2212 kfree(m->pagelist);
2213 m->pagelist = NULL;
2214 }
2215
2216 if (m->pool)
2217 ceph_msgpool_put(m->pool, m);
2218 else
2219 ceph_msg_kfree(m);
2220}
2221
2222void ceph_msg_dump(struct ceph_msg *msg)
2223{
2224 pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
2225 msg->front_max, msg->nr_pages);
2226 print_hex_dump(KERN_DEBUG, "header: ",
2227 DUMP_PREFIX_OFFSET, 16, 1,
2228 &msg->hdr, sizeof(msg->hdr), true);
2229 print_hex_dump(KERN_DEBUG, " front: ",
2230 DUMP_PREFIX_OFFSET, 16, 1,
2231 msg->front.iov_base, msg->front.iov_len, true);
2232 if (msg->middle)
2233 print_hex_dump(KERN_DEBUG, "middle: ",
2234 DUMP_PREFIX_OFFSET, 16, 1,
2235 msg->middle->vec.iov_base,
2236 msg->middle->vec.iov_len, true);
2237 print_hex_dump(KERN_DEBUG, "footer: ",
2238 DUMP_PREFIX_OFFSET, 16, 1,
2239 &msg->footer, sizeof(msg->footer), true);
2240}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 000000000000..4caaa5911110
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,254 @@
1#ifndef __FS_CEPH_MESSENGER_H
2#define __FS_CEPH_MESSENGER_H
3
4#include <linux/kref.h>
5#include <linux/mutex.h>
6#include <linux/net.h>
7#include <linux/radix-tree.h>
8#include <linux/uio.h>
9#include <linux/version.h>
10#include <linux/workqueue.h>
11
12#include "types.h"
13#include "buffer.h"
14
15struct ceph_msg;
16struct ceph_connection;
17
18extern struct workqueue_struct *ceph_msgr_wq; /* messenger work queue */
19
20/*
21 * Ceph defines these callbacks for handling connection events.
22 */
23struct ceph_connection_operations {
24 struct ceph_connection *(*get)(struct ceph_connection *);
25 void (*put)(struct ceph_connection *);
26
27 /* handle an incoming message. */
28 void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
29
30 /* authorize an outgoing connection */
31 int (*get_authorizer) (struct ceph_connection *con,
32 void **buf, int *len, int *proto,
33 void **reply_buf, int *reply_len, int force_new);
34 int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
35 int (*invalidate_authorizer)(struct ceph_connection *con);
36
37 /* protocol version mismatch */
38 void (*bad_proto) (struct ceph_connection *con);
39
40 /* there was some error on the socket (disconnect, whatever) */
41 void (*fault) (struct ceph_connection *con);
42
43 /* a remote host has terminated a message exchange session, and messages
44 * we sent (or they tried to send us) may be lost. */
45 void (*peer_reset) (struct ceph_connection *con);
46
47 struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,
48 struct ceph_msg_header *hdr,
49 int *skip);
50};
51
52extern const char *ceph_name_type_str(int t);
53
54/* use format string %s%lld */
55#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
56
57struct ceph_messenger {
58 struct ceph_entity_inst inst; /* my name+address */
59 struct ceph_entity_addr my_enc_addr;
60 struct page *zero_page; /* used in certain error cases */
61
62 bool nocrc;
63
64 /*
65 * the global_seq counts connections we (attempt to) initiate
66 * in order to disambiguate certain connect race conditions.
67 */
68 u32 global_seq;
69 spinlock_t global_seq_lock;
70};
71
72/*
73 * a single message. it contains a header (src, dest, message type, etc.),
74 * footer (crc values, mainly), a "front" message body, and possibly a
75 * data payload (stored in some number of pages).
76 */
77struct ceph_msg {
78 struct ceph_msg_header hdr; /* header */
79 struct ceph_msg_footer footer; /* footer */
80 struct kvec front; /* unaligned blobs of message */
81 struct ceph_buffer *middle;
82 struct page **pages; /* data payload. NOT OWNER. */
83 unsigned nr_pages; /* size of page array */
84 struct ceph_pagelist *pagelist; /* instead of pages */
85 struct list_head list_head;
86 struct kref kref;
87 bool front_is_vmalloc;
88 bool more_to_follow;
89 int front_max;
90
91 struct ceph_msgpool *pool;
92};
93
94struct ceph_msg_pos {
95 int page, page_pos; /* which page; offset in page */
96 int data_pos; /* offset in data payload */
97 int did_page_crc; /* true if we've calculated crc for current page */
98};
99
100/* ceph connection fault delay defaults, for exponential backoff */
101#define BASE_DELAY_INTERVAL (HZ/2)
102#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
103
104/*
105 * ceph_connection state bit flags
106 *
107 * QUEUED and BUSY are used together to ensure that only a single
108 * thread is currently opening, reading or writing data to the socket.
109 */
110#define LOSSYTX 0 /* we can close channel or drop messages on errors */
111#define CONNECTING 1
112#define NEGOTIATING 2
113#define KEEPALIVE_PENDING 3
114#define WRITE_PENDING 4 /* we have data ready to send */
115#define QUEUED 5 /* there is work queued on this connection */
116#define BUSY 6 /* work is being done */
117#define STANDBY 8 /* no outgoing messages, socket closed. we keep
118 * the ceph_connection around to maintain shared
119 * state with the peer. */
120#define CLOSED 10 /* we've closed the connection */
121#define SOCK_CLOSED 11 /* socket state changed to closed */
122#define OPENING 13 /* open connection w/ (possibly new) peer */
123#define DEAD 14 /* dead, about to kfree */
124
125/*
126 * A single connection with another host.
127 *
128 * We maintain a queue of outgoing messages, and some session state to
129 * ensure that we can preserve the lossless, ordered delivery of
130 * messages in the case of a TCP disconnect.
131 */
132struct ceph_connection {
133 void *private;
134 atomic_t nref;
135
136 const struct ceph_connection_operations *ops;
137
138 struct ceph_messenger *msgr;
139 struct socket *sock;
140 unsigned long state; /* connection state (see flags above) */
141 const char *error_msg; /* error message, if any */
142
143 struct ceph_entity_addr peer_addr; /* peer address */
144 struct ceph_entity_name peer_name; /* peer name */
145 struct ceph_entity_addr peer_addr_for_me;
146 u32 connect_seq; /* identify the most recent connection
147 attempt for this connection, client */
148 u32 peer_global_seq; /* peer's global seq for this connection */
149
150 int auth_retry; /* true if we need a newer authorizer */
151 void *auth_reply_buf; /* where to put the authorizer reply */
152 int auth_reply_buf_len;
153
154 struct mutex mutex;
155
156 /* out queue */
157 struct list_head out_queue;
158 struct list_head out_sent; /* sending or sent but unacked */
159 u64 out_seq; /* last message queued for send */
160 u64 out_seq_sent; /* last message sent */
161 bool out_keepalive_pending;
162
163 u64 in_seq, in_seq_acked; /* last message received, acked */
164
165 /* connection negotiation temps */
166 char in_banner[CEPH_BANNER_MAX_LEN];
167 union {
168 struct { /* outgoing connection */
169 struct ceph_msg_connect out_connect;
170 struct ceph_msg_connect_reply in_reply;
171 };
172 struct { /* incoming */
173 struct ceph_msg_connect in_connect;
174 struct ceph_msg_connect_reply out_reply;
175 };
176 };
177 struct ceph_entity_addr actual_peer_addr;
178
179 /* message out temps */
180 struct ceph_msg *out_msg; /* sending message (== tail of
181 out_sent) */
182 bool out_msg_done;
183 struct ceph_msg_pos out_msg_pos;
184
185 struct kvec out_kvec[8], /* sending header/footer data */
186 *out_kvec_cur;
187 int out_kvec_left; /* kvec's left in out_kvec */
188 int out_skip; /* skip this many bytes */
189 int out_kvec_bytes; /* total bytes left */
190 bool out_kvec_is_msg; /* kvec refers to out_msg */
191 int out_more; /* there is more data after the kvecs */
192 __le64 out_temp_ack; /* for writing an ack */
193
194 /* message in temps */
195 struct ceph_msg_header in_hdr;
196 struct ceph_msg *in_msg;
197 struct ceph_msg_pos in_msg_pos;
198 u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
199
200 char in_tag; /* protocol control byte */
201 int in_base_pos; /* bytes read */
202 __le64 in_temp_ack; /* for reading an ack */
203
204 struct delayed_work work; /* send|recv work */
205 unsigned long delay; /* current delay interval */
206};
207
208
209extern const char *pr_addr(const struct sockaddr_storage *ss);
210extern int ceph_parse_ips(const char *c, const char *end,
211 struct ceph_entity_addr *addr,
212 int max_count, int *count);
213
214
215extern int ceph_msgr_init(void);
216extern void ceph_msgr_exit(void);
217
218extern struct ceph_messenger *ceph_messenger_create(
219 struct ceph_entity_addr *myaddr);
220extern void ceph_messenger_destroy(struct ceph_messenger *);
221
222extern void ceph_con_init(struct ceph_messenger *msgr,
223 struct ceph_connection *con);
224extern void ceph_con_open(struct ceph_connection *con,
225 struct ceph_entity_addr *addr);
226extern void ceph_con_close(struct ceph_connection *con);
227extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
228extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
229extern void ceph_con_revoke_message(struct ceph_connection *con,
230 struct ceph_msg *msg);
231extern void ceph_con_keepalive(struct ceph_connection *con);
232extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
233extern void ceph_con_put(struct ceph_connection *con);
234
235extern struct ceph_msg *ceph_msg_new(int type, int front_len,
236 int page_len, int page_off,
237 struct page **pages);
238extern void ceph_msg_kfree(struct ceph_msg *m);
239
240
241static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
242{
243 kref_get(&msg->kref);
244 return msg;
245}
246extern void ceph_msg_last_put(struct kref *kref);
247static inline void ceph_msg_put(struct ceph_msg *msg)
248{
249 kref_put(&msg->kref, ceph_msg_last_put);
250}
251
252extern void ceph_msg_dump(struct ceph_msg *msg);
253
254#endif
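The ceph_msg_get()/ceph_msg_put() pair above makes message lifetimes explicit: queueing a message with ceph_con_send() consumes one reference, so a caller that wants to keep the message around (for a possible resend, say) must take its own reference first. A minimal sketch of that pattern, mirroring how mon_client.c below keeps its auth message alive across sends (the helper name is hypothetical):

	/* Queue msg on con but retain it for a possible resend.
	 * ceph_con_send() consumes one reference, so take our own first. */
	static void send_and_keep(struct ceph_connection *con,
				  struct ceph_msg *msg)
	{
		ceph_msg_get(msg);	/* this ref survives the send */
		ceph_con_send(con, msg);
	}
	/* ...and later, when the retained message is finally unneeded: */
	/* ceph_msg_put(msg); */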
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 000000000000..890597c09d43
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
1#include "ceph_debug.h"
2
3#include <linux/types.h>
4#include <linux/random.h>
5#include <linux/sched.h>
6
7#include "mon_client.h"
8#include "super.h"
9#include "auth.h"
10#include "decode.h"
11
12/*
13 * Interact with Ceph monitor cluster. Handle requests for new map
14 * versions, and periodically resend as needed. Also implement
15 * statfs() and umount().
16 *
 17 * A small cluster of Ceph "monitors" is responsible for managing critical
18 * cluster configuration and state information. An odd number (e.g., 3, 5)
19 * of cmon daemons use a modified version of the Paxos part-time parliament
20 * algorithm to manage the MDS map (mds cluster membership), OSD map, and
21 * list of clients who have mounted the file system.
22 *
23 * We maintain an open, active session with a monitor at all times in order to
24 * receive timely MDSMap updates. We periodically send a keepalive byte on the
25 * TCP socket to ensure we detect a failure. If the connection does break, we
26 * randomly hunt for a new monitor. Once the connection is reestablished, we
27 * resend any outstanding requests.
28 */
29
 30static const struct ceph_connection_operations mon_con_ops;
31
32static int __validate_auth(struct ceph_mon_client *monc);
33
34/*
35 * Decode a monmap blob (e.g., during mount).
36 */
37struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
38{
39 struct ceph_monmap *m = NULL;
40 int i, err = -EINVAL;
41 struct ceph_fsid fsid;
42 u32 epoch, num_mon;
43 u16 version;
44 u32 len;
45
46 ceph_decode_32_safe(&p, end, len, bad);
47 ceph_decode_need(&p, end, len, bad);
48
49 dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
50
51 ceph_decode_16_safe(&p, end, version, bad);
52
53 ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
54 ceph_decode_copy(&p, &fsid, sizeof(fsid));
55 epoch = ceph_decode_32(&p);
56
57 num_mon = ceph_decode_32(&p);
 58	if (num_mon >= CEPH_MAX_MON)
 59		goto bad;
 60	ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
 61
62 m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
63 if (m == NULL)
64 return ERR_PTR(-ENOMEM);
65 m->fsid = fsid;
66 m->epoch = epoch;
67 m->num_mon = num_mon;
68 ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
69 for (i = 0; i < num_mon; i++)
70 ceph_decode_addr(&m->mon_inst[i].addr);
71
72 dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
73 m->num_mon);
74 for (i = 0; i < m->num_mon; i++)
75 dout("monmap_decode mon%d is %s\n", i,
76 pr_addr(&m->mon_inst[i].addr.in_addr));
77 return m;
78
79bad:
80 dout("monmap_decode failed with %d\n", err);
81 kfree(m);
82 return ERR_PTR(err);
83}
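The helpers used above follow one convention throughout decode.h: the *_safe variants verify that enough bytes remain and jump to the supplied label otherwise, while the plain variants assume a prior ceph_decode_need(). A hedged sketch of the same pattern on a hypothetical blob (a u32 count followed by that many u64s; not part of the Ceph wire protocol):

	static int decode_u64_array(void **p, void *end, u64 *out, u32 max)
	{
		u32 i, count;

		ceph_decode_32_safe(p, end, count, bad);   /* bounds-checked */
		if (count > max)
			goto bad;
		ceph_decode_need(p, end, count * sizeof(u64), bad);
		for (i = 0; i < count; i++)
			out[i] = ceph_decode_64(p);        /* space verified above */
		return 0;
	bad:
		return -EINVAL;
	}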
84
85/*
86 * return true if *addr is included in the monmap.
87 */
88int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
89{
90 int i;
91
92 for (i = 0; i < m->num_mon; i++)
93 if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
94 return 1;
95 return 0;
96}
97
98/*
99 * Send an auth request.
100 */
101static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
102{
103 monc->pending_auth = 1;
104 monc->m_auth->front.iov_len = len;
105 monc->m_auth->hdr.front_len = cpu_to_le32(len);
106 ceph_msg_get(monc->m_auth); /* keep our ref */
107 ceph_con_send(monc->con, monc->m_auth);
108}
109
110/*
111 * Close monitor session, if any.
112 */
113static void __close_session(struct ceph_mon_client *monc)
114{
115 if (monc->con) {
116 dout("__close_session closing mon%d\n", monc->cur_mon);
117 ceph_con_revoke(monc->con, monc->m_auth);
118 ceph_con_close(monc->con);
119 monc->cur_mon = -1;
120 monc->pending_auth = 0;
121 ceph_auth_reset(monc->auth);
122 }
123}
124
125/*
126 * Open a session with a (new) monitor.
127 */
128static int __open_session(struct ceph_mon_client *monc)
129{
 130	unsigned char r;
131 int ret;
132
133 if (monc->cur_mon < 0) {
134 get_random_bytes(&r, 1);
135 monc->cur_mon = r % monc->monmap->num_mon;
136 dout("open_session num=%d r=%d -> mon%d\n",
137 monc->monmap->num_mon, r, monc->cur_mon);
138 monc->sub_sent = 0;
139 monc->sub_renew_after = jiffies; /* i.e., expired */
 140		monc->want_next_osdmap = !!monc->want_next_osdmap; /* 2 ("asked") -> 1 ("want") */
141
142 dout("open_session mon%d opening\n", monc->cur_mon);
143 monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
144 monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
145 ceph_con_open(monc->con,
146 &monc->monmap->mon_inst[monc->cur_mon].addr);
147
 148		/* initiate authentication handshake */
149 ret = ceph_auth_build_hello(monc->auth,
150 monc->m_auth->front.iov_base,
151 monc->m_auth->front_max);
152 __send_prepared_auth_request(monc, ret);
153 } else {
154 dout("open_session mon%d already open\n", monc->cur_mon);
155 }
156 return 0;
157}
158
159static bool __sub_expired(struct ceph_mon_client *monc)
160{
161 return time_after_eq(jiffies, monc->sub_renew_after);
162}
163
164/*
165 * Reschedule delayed work timer.
166 */
167static void __schedule_delayed(struct ceph_mon_client *monc)
168{
169 unsigned delay;
170
171 if (monc->cur_mon < 0 || __sub_expired(monc))
172 delay = 10 * HZ;
173 else
174 delay = 20 * HZ;
175 dout("__schedule_delayed after %u\n", delay);
176 schedule_delayed_work(&monc->delayed_work, delay);
177}
178
179/*
180 * Send subscribe request for mdsmap and/or osdmap.
181 */
182static void __send_subscribe(struct ceph_mon_client *monc)
183{
184 dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
185 (unsigned)monc->sub_sent, __sub_expired(monc),
186 monc->want_next_osdmap);
187 if ((__sub_expired(monc) && !monc->sub_sent) ||
188 monc->want_next_osdmap == 1) {
189 struct ceph_msg *msg;
190 struct ceph_mon_subscribe_item *i;
191 void *p, *end;
192
 193		msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
 194		if (IS_ERR(msg))	/* ceph_msg_new returns ERR_PTR on failure */
 195			return;
196
197 p = msg->front.iov_base;
198 end = p + msg->front.iov_len;
199
200 dout("__send_subscribe to 'mdsmap' %u+\n",
201 (unsigned)monc->have_mdsmap);
202 if (monc->want_next_osdmap) {
203 dout("__send_subscribe to 'osdmap' %u\n",
204 (unsigned)monc->have_osdmap);
205 ceph_encode_32(&p, 3);
206 ceph_encode_string(&p, end, "osdmap", 6);
207 i = p;
208 i->have = cpu_to_le64(monc->have_osdmap);
209 i->onetime = 1;
210 p += sizeof(*i);
211 monc->want_next_osdmap = 2; /* requested */
212 } else {
213 ceph_encode_32(&p, 2);
214 }
215 ceph_encode_string(&p, end, "mdsmap", 6);
216 i = p;
217 i->have = cpu_to_le64(monc->have_mdsmap);
218 i->onetime = 0;
219 p += sizeof(*i);
220 ceph_encode_string(&p, end, "monmap", 6);
221 i = p;
222 i->have = 0;
223 i->onetime = 0;
224 p += sizeof(*i);
225
226 msg->front.iov_len = p - msg->front.iov_base;
227 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
228 ceph_con_send(monc->con, msg);
229
230 monc->sub_sent = jiffies | 1; /* never 0 */
231 }
232}
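For reference, the front payload built above is a u32 subscription count followed by (length-prefixed name, ceph_mon_subscribe_item) pairs; with want_next_osdmap set, the encoded buffer comes out roughly as:

	u32 3                              /* number of subscriptions */
	"osdmap"  { have: <osdmap epoch>, onetime: 1 }
	"mdsmap"  { have: <mdsmap epoch>, onetime: 0 }
	"monmap"  { have: 0,              onetime: 0 }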
233
234static void handle_subscribe_ack(struct ceph_mon_client *monc,
235 struct ceph_msg *msg)
236{
237 unsigned seconds;
238 struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
239
240 if (msg->front.iov_len < sizeof(*h))
241 goto bad;
242 seconds = le32_to_cpu(h->duration);
243
244 mutex_lock(&monc->mutex);
245 if (monc->hunting) {
246 pr_info("mon%d %s session established\n",
247 monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
248 monc->hunting = false;
249 }
250 dout("handle_subscribe_ack after %d seconds\n", seconds);
251 monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
252 monc->sub_sent = 0;
253 mutex_unlock(&monc->mutex);
254 return;
255bad:
256 pr_err("got corrupt subscribe-ack msg\n");
257 ceph_msg_dump(msg);
258}
259
260/*
261 * Keep track of which maps we have
262 */
263int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
264{
265 mutex_lock(&monc->mutex);
266 monc->have_mdsmap = got;
267 mutex_unlock(&monc->mutex);
268 return 0;
269}
270
271int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
272{
273 mutex_lock(&monc->mutex);
274 monc->have_osdmap = got;
275 monc->want_next_osdmap = 0;
276 mutex_unlock(&monc->mutex);
277 return 0;
278}
279
280/*
281 * Register interest in the next osdmap
282 */
283void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
284{
285 dout("request_next_osdmap have %u\n", monc->have_osdmap);
286 mutex_lock(&monc->mutex);
287 if (!monc->want_next_osdmap)
288 monc->want_next_osdmap = 1;
289 if (monc->want_next_osdmap < 2)
290 __send_subscribe(monc);
291 mutex_unlock(&monc->mutex);
292}
293
294/*
 295 * Open a session with the monitor cluster, if we don't already have one.
296 */
297int ceph_monc_open_session(struct ceph_mon_client *monc)
298{
299 if (!monc->con) {
300 monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
301 if (!monc->con)
302 return -ENOMEM;
303 ceph_con_init(monc->client->msgr, monc->con);
304 monc->con->private = monc;
305 monc->con->ops = &mon_con_ops;
306 }
307
308 mutex_lock(&monc->mutex);
309 __open_session(monc);
310 __schedule_delayed(monc);
311 mutex_unlock(&monc->mutex);
312 return 0;
313}
314
315/*
 316 * The monitor responds with a mount ack to indicate mount success. The
317 * included client ticket allows the client to talk to MDSs and OSDs.
318 */
319static void ceph_monc_handle_map(struct ceph_mon_client *monc,
320 struct ceph_msg *msg)
321{
322 struct ceph_client *client = monc->client;
323 struct ceph_monmap *monmap = NULL, *old = monc->monmap;
324 void *p, *end;
325
326 mutex_lock(&monc->mutex);
327
328 dout("handle_monmap\n");
329 p = msg->front.iov_base;
330 end = p + msg->front.iov_len;
331
332 monmap = ceph_monmap_decode(p, end);
333 if (IS_ERR(monmap)) {
334 pr_err("problem decoding monmap, %d\n",
335 (int)PTR_ERR(monmap));
336 goto out;
337 }
338
339 if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
340 kfree(monmap);
341 goto out;
342 }
343
344 client->monc.monmap = monmap;
345 kfree(old);
346
347out:
348 mutex_unlock(&monc->mutex);
349 wake_up(&client->auth_wq);
350}
351
352/*
353 * statfs
354 */
355static struct ceph_mon_statfs_request *__lookup_statfs(
356 struct ceph_mon_client *monc, u64 tid)
357{
358 struct ceph_mon_statfs_request *req;
359 struct rb_node *n = monc->statfs_request_tree.rb_node;
360
361 while (n) {
362 req = rb_entry(n, struct ceph_mon_statfs_request, node);
363 if (tid < req->tid)
364 n = n->rb_left;
365 else if (tid > req->tid)
366 n = n->rb_right;
367 else
368 return req;
369 }
370 return NULL;
371}
372
373static void __insert_statfs(struct ceph_mon_client *monc,
374 struct ceph_mon_statfs_request *new)
375{
376 struct rb_node **p = &monc->statfs_request_tree.rb_node;
377 struct rb_node *parent = NULL;
378 struct ceph_mon_statfs_request *req = NULL;
379
380 while (*p) {
381 parent = *p;
382 req = rb_entry(parent, struct ceph_mon_statfs_request, node);
383 if (new->tid < req->tid)
384 p = &(*p)->rb_left;
385 else if (new->tid > req->tid)
386 p = &(*p)->rb_right;
387 else
388 BUG();
389 }
390
391 rb_link_node(&new->node, parent, p);
392 rb_insert_color(&new->node, &monc->statfs_request_tree);
393}
394
395static void handle_statfs_reply(struct ceph_mon_client *monc,
396 struct ceph_msg *msg)
397{
398 struct ceph_mon_statfs_request *req;
399 struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
400 u64 tid;
401
402 if (msg->front.iov_len != sizeof(*reply))
403 goto bad;
404 tid = le64_to_cpu(msg->hdr.tid);
405 dout("handle_statfs_reply %p tid %llu\n", msg, tid);
406
407 mutex_lock(&monc->mutex);
408 req = __lookup_statfs(monc, tid);
409 if (req) {
410 *req->buf = reply->st;
411 req->result = 0;
412 }
413 mutex_unlock(&monc->mutex);
414 if (req)
415 complete(&req->completion);
416 return;
417
418bad:
 419	pr_err("corrupt statfs reply, bad length\n");
420 ceph_msg_dump(msg);
421}
422
423/*
424 * (re)send a statfs request
425 */
426static int send_statfs(struct ceph_mon_client *monc,
427 struct ceph_mon_statfs_request *req)
428{
429 struct ceph_msg *msg;
430 struct ceph_mon_statfs *h;
431
432 dout("send_statfs tid %llu\n", req->tid);
433 msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
434 if (IS_ERR(msg))
435 return PTR_ERR(msg);
436 req->request = msg;
437 msg->hdr.tid = cpu_to_le64(req->tid);
438 h = msg->front.iov_base;
439 h->monhdr.have_version = 0;
440 h->monhdr.session_mon = cpu_to_le16(-1);
441 h->monhdr.session_mon_tid = 0;
442 h->fsid = monc->monmap->fsid;
443 ceph_con_send(monc->con, msg);
444 return 0;
445}
446
447/*
448 * Do a synchronous statfs().
449 */
450int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
451{
452 struct ceph_mon_statfs_request req;
453 int err;
454
455 req.buf = buf;
456 init_completion(&req.completion);
457
458 /* allocate memory for reply */
459 err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
460 if (err)
461 return err;
462
463 /* register request */
464 mutex_lock(&monc->mutex);
465 req.tid = ++monc->last_tid;
466 req.last_attempt = jiffies;
467 req.delay = BASE_DELAY_INTERVAL;
468 __insert_statfs(monc, &req);
469 monc->num_statfs_requests++;
470 mutex_unlock(&monc->mutex);
471
472 /* send request and wait */
473 err = send_statfs(monc, &req);
474 if (!err)
475 err = wait_for_completion_interruptible(&req.completion);
476
477 mutex_lock(&monc->mutex);
478 rb_erase(&req.node, &monc->statfs_request_tree);
479 monc->num_statfs_requests--;
480 ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
481 mutex_unlock(&monc->mutex);
482
483 if (!err)
484 err = req.result;
485 return err;
486}
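Because ceph_monc_do_statfs() registers the request, (re)sends it, and then blocks on the completion, callers can treat it as an ordinary synchronous call. A hedged sketch of such a caller (the function name and surrounding filesystem glue are illustrative):

	static int example_cluster_statfs(struct ceph_mon_client *monc)
	{
		struct ceph_statfs st;
		int err;

		err = ceph_monc_do_statfs(monc, &st);
		if (err < 0)
			return err;	/* e.g. -EINTR if the wait was interrupted */
		/* st now holds the monitor-reported totals */
		return 0;
	}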
487
488/*
489 * Resend pending statfs requests.
490 */
491static void __resend_statfs(struct ceph_mon_client *monc)
492{
493 struct ceph_mon_statfs_request *req;
494 struct rb_node *p;
495
496 for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
497 req = rb_entry(p, struct ceph_mon_statfs_request, node);
498 send_statfs(monc, req);
499 }
500}
501
502/*
503 * Delayed work. If we haven't mounted yet, retry. Otherwise,
504 * renew/retry subscription as needed (in case it is timing out, or we
505 * got an ENOMEM). And keep the monitor connection alive.
506 */
507static void delayed_work(struct work_struct *work)
508{
509 struct ceph_mon_client *monc =
510 container_of(work, struct ceph_mon_client, delayed_work.work);
511
512 dout("monc delayed_work\n");
513 mutex_lock(&monc->mutex);
514 if (monc->hunting) {
515 __close_session(monc);
516 __open_session(monc); /* continue hunting */
517 } else {
518 ceph_con_keepalive(monc->con);
519
520 __validate_auth(monc);
521
522 if (monc->auth->ops->is_authenticated(monc->auth))
523 __send_subscribe(monc);
524 }
525 __schedule_delayed(monc);
526 mutex_unlock(&monc->mutex);
527}
528
529/*
530 * On startup, we build a temporary monmap populated with the IPs
531 * provided by mount(2).
532 */
533static int build_initial_monmap(struct ceph_mon_client *monc)
534{
535 struct ceph_mount_args *args = monc->client->mount_args;
536 struct ceph_entity_addr *mon_addr = args->mon_addr;
537 int num_mon = args->num_mon;
538 int i;
539
540 /* build initial monmap */
541 monc->monmap = kzalloc(sizeof(*monc->monmap) +
542 num_mon*sizeof(monc->monmap->mon_inst[0]),
543 GFP_KERNEL);
544 if (!monc->monmap)
545 return -ENOMEM;
546 for (i = 0; i < num_mon; i++) {
547 monc->monmap->mon_inst[i].addr = mon_addr[i];
548 monc->monmap->mon_inst[i].addr.nonce = 0;
549 monc->monmap->mon_inst[i].name.type =
550 CEPH_ENTITY_TYPE_MON;
551 monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
552 }
553 monc->monmap->num_mon = num_mon;
554 monc->have_fsid = false;
555
556 /* release addr memory */
557 kfree(args->mon_addr);
558 args->mon_addr = NULL;
559 args->num_mon = 0;
560 return 0;
561}
562
563int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
564{
565 int err = 0;
566
567 dout("init\n");
568 memset(monc, 0, sizeof(*monc));
569 monc->client = cl;
570 monc->monmap = NULL;
571 mutex_init(&monc->mutex);
572
573 err = build_initial_monmap(monc);
574 if (err)
575 goto out;
576
577 monc->con = NULL;
578
579 /* authentication */
580 monc->auth = ceph_auth_init(cl->mount_args->name,
581 cl->mount_args->secret);
 582	if (IS_ERR(monc->auth)) {
 583		err = PTR_ERR(monc->auth);
		goto out_monmap;	/* don't leak the initial monmap */
	}
584 monc->auth->want_keys =
585 CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
586 CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
587
588 /* msg pools */
589 err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
590 sizeof(struct ceph_mon_subscribe_ack), 1, false);
591 if (err < 0)
592 goto out_monmap;
593 err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
594 sizeof(struct ceph_mon_statfs_reply), 0, false);
595 if (err < 0)
596 goto out_pool1;
597 err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
598 if (err < 0)
599 goto out_pool2;
600
601 monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
602 monc->pending_auth = 0;
603 if (IS_ERR(monc->m_auth)) {
604 err = PTR_ERR(monc->m_auth);
605 monc->m_auth = NULL;
606 goto out_pool3;
607 }
608
609 monc->cur_mon = -1;
610 monc->hunting = true;
611 monc->sub_renew_after = jiffies;
612 monc->sub_sent = 0;
613
614 INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
615 monc->statfs_request_tree = RB_ROOT;
616 monc->num_statfs_requests = 0;
617 monc->last_tid = 0;
618
619 monc->have_mdsmap = 0;
620 monc->have_osdmap = 0;
621 monc->want_next_osdmap = 1;
622 return 0;
623
 624out_pool3:
 625	ceph_msgpool_destroy(&monc->msgpool_auth_reply);
 626out_pool2:
 627	ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
 628out_pool1:
 629	ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
630out_monmap:
631 kfree(monc->monmap);
632out:
633 return err;
634}
635
636void ceph_monc_stop(struct ceph_mon_client *monc)
637{
638 dout("stop\n");
639 cancel_delayed_work_sync(&monc->delayed_work);
640
641 mutex_lock(&monc->mutex);
642 __close_session(monc);
643 if (monc->con) {
644 monc->con->private = NULL;
645 monc->con->ops->put(monc->con);
646 monc->con = NULL;
647 }
648 mutex_unlock(&monc->mutex);
649
650 ceph_auth_destroy(monc->auth);
651
652 ceph_msg_put(monc->m_auth);
653 ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
654 ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
655 ceph_msgpool_destroy(&monc->msgpool_auth_reply);
656
657 kfree(monc->monmap);
658}
659
660static void handle_auth_reply(struct ceph_mon_client *monc,
661 struct ceph_msg *msg)
662{
663 int ret;
664
665 mutex_lock(&monc->mutex);
666 monc->pending_auth = 0;
667 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
668 msg->front.iov_len,
669 monc->m_auth->front.iov_base,
670 monc->m_auth->front_max);
671 if (ret < 0) {
672 monc->client->auth_err = ret;
673 wake_up(&monc->client->auth_wq);
674 } else if (ret > 0) {
675 __send_prepared_auth_request(monc, ret);
676 } else if (monc->auth->ops->is_authenticated(monc->auth)) {
677 dout("authenticated, starting session\n");
678
679 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
680 monc->client->msgr->inst.name.num = monc->auth->global_id;
681
682 __send_subscribe(monc);
683 __resend_statfs(monc);
684 }
685 mutex_unlock(&monc->mutex);
686}
687
688static int __validate_auth(struct ceph_mon_client *monc)
689{
690 int ret;
691
692 if (monc->pending_auth)
693 return 0;
694
695 ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
696 monc->m_auth->front_max);
697 if (ret <= 0)
698 return ret; /* either an error, or no need to authenticate */
699 __send_prepared_auth_request(monc, ret);
700 return 0;
701}
702
703int ceph_monc_validate_auth(struct ceph_mon_client *monc)
704{
705 int ret;
706
707 mutex_lock(&monc->mutex);
708 ret = __validate_auth(monc);
709 mutex_unlock(&monc->mutex);
710 return ret;
711}
712
713/*
714 * handle incoming message
715 */
716static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
717{
718 struct ceph_mon_client *monc = con->private;
719 int type = le16_to_cpu(msg->hdr.type);
720
721 if (!monc)
722 return;
723
724 switch (type) {
725 case CEPH_MSG_AUTH_REPLY:
726 handle_auth_reply(monc, msg);
727 break;
728
729 case CEPH_MSG_MON_SUBSCRIBE_ACK:
730 handle_subscribe_ack(monc, msg);
731 break;
732
733 case CEPH_MSG_STATFS_REPLY:
734 handle_statfs_reply(monc, msg);
735 break;
736
737 case CEPH_MSG_MON_MAP:
738 ceph_monc_handle_map(monc, msg);
739 break;
740
741 case CEPH_MSG_MDS_MAP:
742 ceph_mdsc_handle_map(&monc->client->mdsc, msg);
743 break;
744
745 case CEPH_MSG_OSD_MAP:
746 ceph_osdc_handle_map(&monc->client->osdc, msg);
747 break;
748
749 default:
750 pr_err("received unknown message type %d %s\n", type,
751 ceph_msg_type_name(type));
752 }
753 ceph_msg_put(msg);
754}
755
756/*
757 * Allocate memory for incoming message
758 */
759static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
760 struct ceph_msg_header *hdr,
761 int *skip)
762{
763 struct ceph_mon_client *monc = con->private;
764 int type = le16_to_cpu(hdr->type);
765 int front_len = le32_to_cpu(hdr->front_len);
766 struct ceph_msg *m = NULL;
767
768 *skip = 0;
769
770 switch (type) {
771 case CEPH_MSG_MON_SUBSCRIBE_ACK:
772 m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
773 break;
774 case CEPH_MSG_STATFS_REPLY:
775 m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
776 break;
777 case CEPH_MSG_AUTH_REPLY:
778 m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
779 break;
780 case CEPH_MSG_MON_MAP:
781 case CEPH_MSG_MDS_MAP:
782 case CEPH_MSG_OSD_MAP:
783 m = ceph_msg_new(type, front_len, 0, 0, NULL);
784 break;
785 }
786
 787	if (!m || IS_ERR(m)) {
 788		pr_info("alloc_msg failed for type %d\n", type);
 789		*skip = 1;
 790		m = NULL;
	}
791 return m;
792}
793
794/*
795 * If the monitor connection resets, pick a new monitor and resubmit
796 * any pending requests.
797 */
798static void mon_fault(struct ceph_connection *con)
799{
800 struct ceph_mon_client *monc = con->private;
801
802 if (!monc)
803 return;
804
805 dout("mon_fault\n");
806 mutex_lock(&monc->mutex);
807 if (!con->private)
808 goto out;
809
810 if (monc->con && !monc->hunting)
811 pr_info("mon%d %s session lost, "
812 "hunting for new mon\n", monc->cur_mon,
813 pr_addr(&monc->con->peer_addr.in_addr));
814
815 __close_session(monc);
816 if (!monc->hunting) {
817 /* start hunting */
818 monc->hunting = true;
819 __open_session(monc);
820 } else {
821 /* already hunting, let's wait a bit */
822 __schedule_delayed(monc);
823 }
824out:
825 mutex_unlock(&monc->mutex);
826}
827
 828static const struct ceph_connection_operations mon_con_ops = {
829 .get = ceph_con_get,
830 .put = ceph_con_put,
831 .dispatch = dispatch,
832 .fault = mon_fault,
833 .alloc_msg = mon_alloc_msg,
834};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 000000000000..b958ad5afa06
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
1#ifndef _FS_CEPH_MON_CLIENT_H
2#define _FS_CEPH_MON_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/rbtree.h>
6
7#include "messenger.h"
8#include "msgpool.h"
9
10struct ceph_client;
11struct ceph_mount_args;
12struct ceph_auth_client;
13
14/*
15 * The monitor map enumerates the set of all monitors.
16 */
17struct ceph_monmap {
18 struct ceph_fsid fsid;
19 u32 epoch;
20 u32 num_mon;
21 struct ceph_entity_inst mon_inst[0];
22};
23
24struct ceph_mon_client;
25struct ceph_mon_statfs_request;
26
27
28/*
29 * Generic mechanism for resending monitor requests.
30 */
31typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
32 int newmon);
33
34/* a pending monitor request */
35struct ceph_mon_request {
36 struct ceph_mon_client *monc;
37 struct delayed_work delayed_work;
38 unsigned long delay;
39 ceph_monc_request_func_t do_request;
40};
41
42/*
43 * statfs() is done a bit differently because we need to get data back
44 * to the caller
45 */
46struct ceph_mon_statfs_request {
47 u64 tid;
48 struct rb_node node;
49 int result;
50 struct ceph_statfs *buf;
51 struct completion completion;
52 unsigned long last_attempt, delay; /* jiffies */
53 struct ceph_msg *request; /* original request */
54};
55
56struct ceph_mon_client {
57 struct ceph_client *client;
58 struct ceph_monmap *monmap;
59
60 struct mutex mutex;
61 struct delayed_work delayed_work;
62
63 struct ceph_auth_client *auth;
64 struct ceph_msg *m_auth;
65 int pending_auth;
66
67 bool hunting;
68 int cur_mon; /* last monitor i contacted */
69 unsigned long sub_sent, sub_renew_after;
70 struct ceph_connection *con;
71 bool have_fsid;
72
73 /* msg pools */
74 struct ceph_msgpool msgpool_subscribe_ack;
75 struct ceph_msgpool msgpool_statfs_reply;
76 struct ceph_msgpool msgpool_auth_reply;
77
78 /* pending statfs requests */
79 struct rb_root statfs_request_tree;
80 int num_statfs_requests;
81 u64 last_tid;
82
83 /* mds/osd map */
84 int want_next_osdmap; /* 1 = want, 2 = want+asked */
85 u32 have_osdmap, have_mdsmap;
86
87#ifdef CONFIG_DEBUG_FS
88 struct dentry *debugfs_file;
89#endif
90};
91
92extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
93extern int ceph_monmap_contains(struct ceph_monmap *m,
94 struct ceph_entity_addr *addr);
95
96extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
97extern void ceph_monc_stop(struct ceph_mon_client *monc);
98
99/*
100 * The model here is to indicate that we need a new map of at least
101 * epoch @want, and also call in when we receive a map. We will
102 * periodically rerequest the map from the monitor cluster until we
103 * get what we want.
104 */
105extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
106extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
107
108extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
109
110extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
111 struct ceph_statfs *buf);
112
113extern int ceph_monc_open_session(struct ceph_mon_client *monc);
114
115extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
116
117
118
119#endif
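Tying these declarations together: on a map-update path, the epoch-tracking model described above works out to calls like the following (a sketch; the real callers live in the mds and osd clients, and the helper here is hypothetical):

	static void example_on_new_osdmap(struct ceph_mon_client *monc,
					  u32 new_epoch, bool still_behind)
	{
		ceph_monc_got_osdmap(monc, new_epoch);	/* record what we have */
		if (still_behind)
			ceph_monc_request_next_osdmap(monc); /* subscribe for the next */
	}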
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 000000000000..ca3b44a89f2d
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/sched.h>
5#include <linux/types.h>
6#include <linux/vmalloc.h>
7
8#include "msgpool.h"
9
10/*
11 * We use msg pools to preallocate memory for messages we expect to
12 * receive over the wire, to avoid getting ourselves into OOM
 13 * conditions at unexpected times. We use a few different
14 * strategies:
15 *
16 * - for request/response type interactions, we preallocate the
17 * memory needed for the response when we generate the request.
18 *
19 * - for messages we can receive at any time from the MDS, we preallocate
20 * a pool of messages we can re-use.
21 *
22 * - for writeback, we preallocate some number of messages to use for
23 * requests and their replies, so that we always make forward
24 * progress.
25 *
26 * The msgpool behaves like a mempool_t, but keeps preallocated
27 * ceph_msgs strung together on a list_head instead of using a pointer
28 * vector. This avoids vector reallocation when we adjust the number
29 * of preallocated items (which happens frequently).
30 */
31
32
33/*
34 * Allocate or release as necessary to meet our target pool size.
35 */
36static int __fill_msgpool(struct ceph_msgpool *pool)
37{
38 struct ceph_msg *msg;
39
40 while (pool->num < pool->min) {
41 dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
42 pool->min);
43 spin_unlock(&pool->lock);
44 msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
45 spin_lock(&pool->lock);
46 if (IS_ERR(msg))
47 return PTR_ERR(msg);
48 msg->pool = pool;
49 list_add(&msg->list_head, &pool->msgs);
50 pool->num++;
51 }
52 while (pool->num > pool->min) {
53 msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
54 dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
55 pool->min, msg);
56 list_del_init(&msg->list_head);
57 pool->num--;
58 ceph_msg_kfree(msg);
59 }
60 return 0;
61}
62
63int ceph_msgpool_init(struct ceph_msgpool *pool,
64 int front_len, int min, bool blocking)
65{
66 int ret;
67
68 dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
69 spin_lock_init(&pool->lock);
70 pool->front_len = front_len;
71 INIT_LIST_HEAD(&pool->msgs);
72 pool->num = 0;
73 pool->min = min;
74 pool->blocking = blocking;
75 init_waitqueue_head(&pool->wait);
76
77 spin_lock(&pool->lock);
78 ret = __fill_msgpool(pool);
79 spin_unlock(&pool->lock);
80 return ret;
81}
82
83void ceph_msgpool_destroy(struct ceph_msgpool *pool)
84{
85 dout("msgpool_destroy %p\n", pool);
86 spin_lock(&pool->lock);
87 pool->min = 0;
88 __fill_msgpool(pool);
89 spin_unlock(&pool->lock);
90}
91
92int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
93{
94 int ret;
95
96 spin_lock(&pool->lock);
97 dout("msgpool_resv %p delta %d\n", pool, delta);
98 pool->min += delta;
99 ret = __fill_msgpool(pool);
100 spin_unlock(&pool->lock);
101 return ret;
102}
103
104struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
105{
106 wait_queue_t wait;
107 struct ceph_msg *msg;
108
109 if (front_len && front_len > pool->front_len) {
110 pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
111 pool, front_len, pool->front_len);
112 WARN_ON(1);
113
114 /* try to alloc a fresh message */
115 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
116 if (!IS_ERR(msg))
117 return msg;
118 }
119
120 if (!front_len)
121 front_len = pool->front_len;
122
123 if (pool->blocking) {
124 /* mempool_t behavior; first try to alloc */
125 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
126 if (!IS_ERR(msg))
127 return msg;
128 }
129
130 while (1) {
131 spin_lock(&pool->lock);
132 if (likely(pool->num)) {
133 msg = list_entry(pool->msgs.next, struct ceph_msg,
134 list_head);
135 list_del_init(&msg->list_head);
136 pool->num--;
137 dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
138 pool->num, pool->min);
139 spin_unlock(&pool->lock);
140 return msg;
141 }
142 pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
143 pool->min, pool->blocking ? "waiting" : "may fail");
144 spin_unlock(&pool->lock);
145
146 if (!pool->blocking) {
147 WARN_ON(1);
148
149 /* maybe we can allocate it now? */
150 msg = ceph_msg_new(0, front_len, 0, 0, NULL);
151 if (!IS_ERR(msg))
152 return msg;
153
154 pr_err("msgpool_get %p empty + alloc failed\n", pool);
155 return ERR_PTR(-ENOMEM);
156 }
157
158 init_wait(&wait);
159 prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
160 schedule();
161 finish_wait(&pool->wait, &wait);
162 }
163}
164
165void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
166{
167 spin_lock(&pool->lock);
168 if (pool->num < pool->min) {
169 /* reset msg front_len; user may have changed it */
170 msg->front.iov_len = pool->front_len;
171 msg->hdr.front_len = cpu_to_le32(pool->front_len);
172
173 kref_set(&msg->kref, 1); /* retake a single ref */
174 list_add(&msg->list_head, &pool->msgs);
175 pool->num++;
176 dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
177 pool->num, pool->min);
178 spin_unlock(&pool->lock);
179 wake_up(&pool->wait);
180 } else {
181 dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
182 pool->num, pool->min);
183 spin_unlock(&pool->lock);
184 ceph_msg_kfree(msg);
185 }
186}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 000000000000..bc834bfcd720
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
1#ifndef _FS_CEPH_MSGPOOL
2#define _FS_CEPH_MSGPOOL
3
4#include "messenger.h"
5
6/*
7 * we use memory pools for preallocating messages we may receive, to
8 * avoid unexpected OOM conditions.
9 */
10struct ceph_msgpool {
11 spinlock_t lock;
12 int front_len; /* preallocated payload size */
13 struct list_head msgs; /* msgs in the pool; each has 1 ref */
14 int num, min; /* cur, min # msgs in the pool */
15 bool blocking;
16 wait_queue_head_t wait;
17};
18
19extern int ceph_msgpool_init(struct ceph_msgpool *pool,
 20			     int front_len, int min, bool blocking);
21extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
22extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
23extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
24 int front_len);
25extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
26
27#endif
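Putting the interface together, the intended lifecycle is init (preallocating min messages), get/put around each anticipated reply, and destroy at teardown. A minimal sketch under those assumptions:

	static int example_msgpool_use(void)
	{
		struct ceph_msgpool pool;
		struct ceph_msg *msg;
		int err;

		err = ceph_msgpool_init(&pool, 512, 2, true); /* 2 x 512-byte msgs */
		if (err < 0)
			return err;

		msg = ceph_msgpool_get(&pool, 0);  /* 0: use pool->front_len */
		/* ... hand msg to the messenger as a preallocated reply buffer ... */
		ceph_msgpool_put(&pool, msg);      /* return it to the pool */

		ceph_msgpool_destroy(&pool);
		return 0;
	}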
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 000000000000..8aaab414f3f8
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
1#ifndef __MSGR_H
2#define __MSGR_H
3
4/*
5 * Data types for message passing layer used by Ceph.
6 */
7
8#define CEPH_MON_PORT 6789 /* default monitor port */
9
10/*
11 * client-side processes will try to bind to ports in this
12 * range, simply for the benefit of tools like nmap or wireshark
13 * that would like to identify the protocol.
14 */
15#define CEPH_PORT_FIRST 6789
16#define CEPH_PORT_START 6800 /* non-monitors start here */
17#define CEPH_PORT_LAST 6900
18
19/*
 20 * TCP connection banner. Include a protocol version, and adjust it
 21 * whenever the wire protocol changes. Try to keep this string length
 22 * constant.
23 */
24#define CEPH_BANNER "ceph v027"
25#define CEPH_BANNER_MAX_LEN 30
26
27
28/*
29 * Rollover-safe type and comparator for 32-bit sequence numbers.
 30 * Comparator returns a negative, zero, or positive value.
31 */
32typedef __u32 ceph_seq_t;
33
34static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
35{
36 return (__s32)a - (__s32)b;
37}
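The signed subtraction makes the comparison safe across 32-bit rollover; a quick worked example:

	/* seq 1 sorts "after" seq 0xffffffff across wraparound:
	 *   ceph_seq_cmp(1, 0xffffffff) == (__s32)0x00000002 ==  2  (> 0)
	 *   ceph_seq_cmp(0xffffffff, 1) == (__s32)0xfffffffe == -2  (< 0)
	 */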
38
39
40/*
41 * entity_name -- logical name for a process participating in the
42 * network, e.g. 'mds0' or 'osd3'.
43 */
44struct ceph_entity_name {
45 __u8 type; /* CEPH_ENTITY_TYPE_* */
46 __le64 num;
47} __attribute__ ((packed));
48
49#define CEPH_ENTITY_TYPE_MON 0x01
50#define CEPH_ENTITY_TYPE_MDS 0x02
51#define CEPH_ENTITY_TYPE_OSD 0x04
52#define CEPH_ENTITY_TYPE_CLIENT 0x08
53#define CEPH_ENTITY_TYPE_ADMIN 0x10
54#define CEPH_ENTITY_TYPE_AUTH 0x20
55
56#define CEPH_ENTITY_TYPE_ANY 0xFF
57
58extern const char *ceph_entity_type_name(int type);
59
60/*
61 * entity_addr -- network address
62 */
63struct ceph_entity_addr {
64 __le32 type;
65 __le32 nonce; /* unique id for process (e.g. pid) */
66 struct sockaddr_storage in_addr;
67} __attribute__ ((packed));
68
69struct ceph_entity_inst {
70 struct ceph_entity_name name;
71 struct ceph_entity_addr addr;
72} __attribute__ ((packed));
73
74
75/* used by message exchange protocol */
76#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
77#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
78#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
79 incoming connection */
80#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
81 with higher cseq */
82#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
83 with higher gseq */
84#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
85#define CEPH_MSGR_TAG_MSG 7 /* message */
86#define CEPH_MSGR_TAG_ACK 8 /* message ack */
87#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
88#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
89#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
90#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
91
92
93/*
94 * connection negotiation
95 */
96struct ceph_msg_connect {
97 __le64 features; /* supported feature bits */
98 __le32 host_type; /* CEPH_ENTITY_TYPE_* */
99 __le32 global_seq; /* count connections initiated by this host */
100 __le32 connect_seq; /* count connections initiated in this session */
101 __le32 protocol_version;
102 __le32 authorizer_protocol;
103 __le32 authorizer_len;
104 __u8 flags; /* CEPH_MSG_CONNECT_* */
105} __attribute__ ((packed));
106
107struct ceph_msg_connect_reply {
108 __u8 tag;
109 __le64 features; /* feature bits for this session */
110 __le32 global_seq;
111 __le32 connect_seq;
112 __le32 protocol_version;
113 __le32 authorizer_len;
114 __u8 flags;
115} __attribute__ ((packed));
116
117#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
118
119
120/*
121 * message header
122 */
123struct ceph_msg_header {
124 __le64 seq; /* message seq# for this session */
125 __le64 tid; /* transaction id */
126 __le16 type; /* message type */
127 __le16 priority; /* priority. higher value == higher priority */
128 __le16 version; /* version of message encoding */
129
130 __le32 front_len; /* bytes in main payload */
131 __le32 middle_len;/* bytes in middle payload */
132 __le32 data_len; /* bytes of data payload */
133 __le16 data_off; /* sender: include full offset;
134 receiver: mask against ~PAGE_MASK */
135
136 struct ceph_entity_inst src, orig_src;
137 __le32 reserved;
138 __le32 crc; /* header crc32c */
139} __attribute__ ((packed));
140
141#define CEPH_MSG_PRIO_LOW 64
142#define CEPH_MSG_PRIO_DEFAULT 127
143#define CEPH_MSG_PRIO_HIGH 196
144#define CEPH_MSG_PRIO_HIGHEST 255
145
146/*
147 * follows data payload
148 */
149struct ceph_msg_footer {
150 __le32 front_crc, middle_crc, data_crc;
151 __u8 flags;
152} __attribute__ ((packed));
153
154#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
155#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
156
157
158#endif
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 000000000000..dbe63db9762f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1537 @@
1#include "ceph_debug.h"
2
3#include <linux/err.h>
4#include <linux/highmem.h>
5#include <linux/mm.h>
6#include <linux/pagemap.h>
7#include <linux/slab.h>
8#include <linux/uaccess.h>
9
10#include "super.h"
11#include "osd_client.h"
12#include "messenger.h"
13#include "decode.h"
14#include "auth.h"
15
16#define OSD_OP_FRONT_LEN 4096
17#define OSD_OPREPLY_FRONT_LEN 512
18
 19static const struct ceph_connection_operations osd_con_ops;
20static int __kick_requests(struct ceph_osd_client *osdc,
21 struct ceph_osd *kickosd);
22
23static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
24
25/*
26 * Implement client access to distributed object storage cluster.
27 *
28 * All data objects are stored within a cluster/cloud of OSDs, or
29 * "object storage devices." (Note that Ceph OSDs have _nothing_ to
30 * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
31 * remote daemons serving up and coordinating consistent and safe
32 * access to storage.
33 *
34 * Cluster membership and the mapping of data objects onto storage devices
35 * are described by the osd map.
36 *
37 * We keep track of pending OSD requests (read, write), resubmit
38 * requests to different OSDs when the cluster topology/data layout
39 * change, or retry the affected requests when the communications
40 * channel with an OSD is reset.
41 */
42
43/*
44 * calculate the mapping of a file extent onto an object, and fill out the
45 * request accordingly. shorten extent as necessary if it crosses an
46 * object boundary.
47 *
48 * fill osd op in request message.
49 */
50static void calc_layout(struct ceph_osd_client *osdc,
51 struct ceph_vino vino, struct ceph_file_layout *layout,
52 u64 off, u64 *plen,
53 struct ceph_osd_request *req)
54{
55 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
56 struct ceph_osd_op *op = (void *)(reqhead + 1);
57 u64 orig_len = *plen;
58 u64 objoff, objlen; /* extent in object */
59 u64 bno;
60
61 reqhead->snapid = cpu_to_le64(vino.snap);
62
63 /* object extent? */
64 ceph_calc_file_object_mapping(layout, off, plen, &bno,
65 &objoff, &objlen);
66 if (*plen < orig_len)
67 dout(" skipping last %llu, final file extent %llu~%llu\n",
68 orig_len - *plen, off, *plen);
69
70 sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
71 req->r_oid_len = strlen(req->r_oid);
72
73 op->extent.offset = cpu_to_le64(objoff);
74 op->extent.length = cpu_to_le64(objlen);
75 req->r_num_pages = calc_pages_for(off, *plen);
76
77 dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
78 req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
79}
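Concretely, the sprintf() above yields object names of the form "<ino hex>.<block hex>"; with illustrative numbers, block 2 of inode 0x1234 maps to:

	vino.ino = 0x1234, bno = 2   ->   r_oid = "1234.00000002"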
80
81/*
82 * requests
83 */
84void ceph_osdc_release_request(struct kref *kref)
85{
86 struct ceph_osd_request *req = container_of(kref,
87 struct ceph_osd_request,
88 r_kref);
89
90 if (req->r_request)
91 ceph_msg_put(req->r_request);
92 if (req->r_reply)
93 ceph_msg_put(req->r_reply);
94 if (req->r_con_filling_msg) {
95 dout("release_request revoking pages %p from con %p\n",
96 req->r_pages, req->r_con_filling_msg);
97 ceph_con_revoke_message(req->r_con_filling_msg,
98 req->r_reply);
99 ceph_con_put(req->r_con_filling_msg);
100 }
101 if (req->r_own_pages)
102 ceph_release_page_vector(req->r_pages,
103 req->r_num_pages);
104 ceph_put_snap_context(req->r_snapc);
105 if (req->r_mempool)
106 mempool_free(req, req->r_osdc->req_mempool);
107 else
108 kfree(req);
109}
110
111/*
112 * build new request AND message, calculate layout, and adjust file
113 * extent as needed.
114 *
115 * if the file was recently truncated, we include information about its
116 * old and new size so that the object can be updated appropriately. (we
117 * avoid synchronously deleting truncated objects because it's slow.)
118 *
119 * if @do_sync, include a 'startsync' command so that the osd will flush
120 * data quickly.
121 */
122struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
123 struct ceph_file_layout *layout,
124 struct ceph_vino vino,
125 u64 off, u64 *plen,
126 int opcode, int flags,
127 struct ceph_snap_context *snapc,
128 int do_sync,
129 u32 truncate_seq,
130 u64 truncate_size,
131 struct timespec *mtime,
132 bool use_mempool, int num_reply)
133{
134 struct ceph_osd_request *req;
135 struct ceph_msg *msg;
136 struct ceph_osd_request_head *head;
137 struct ceph_osd_op *op;
138 void *p;
139 int num_op = 1 + do_sync;
140 size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
141 int i;
142
143 if (use_mempool) {
144 req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
145 memset(req, 0, sizeof(*req));
146 } else {
147 req = kzalloc(sizeof(*req), GFP_NOFS);
148 }
149 if (req == NULL)
150 return ERR_PTR(-ENOMEM);
151
152 req->r_osdc = osdc;
153 req->r_mempool = use_mempool;
154 kref_init(&req->r_kref);
155 init_completion(&req->r_completion);
156 init_completion(&req->r_safe_completion);
157 INIT_LIST_HEAD(&req->r_unsafe_item);
158 req->r_flags = flags;
159
160 WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
161
162 /* create reply message */
163 if (use_mempool)
164 msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
165 else
166 msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
167 OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
168 if (IS_ERR(msg)) {
169 ceph_osdc_put_request(req);
170 return ERR_PTR(PTR_ERR(msg));
171 }
172 req->r_reply = msg;
173
174 /* create request message; allow space for oid */
175 msg_size += 40;
176 if (snapc)
177 msg_size += sizeof(u64) * snapc->num_snaps;
178 if (use_mempool)
179 msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
180 else
181 msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
182 if (IS_ERR(msg)) {
183 ceph_osdc_put_request(req);
184 return ERR_PTR(PTR_ERR(msg));
185 }
186 msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
187 memset(msg->front.iov_base, 0, msg->front.iov_len);
188 head = msg->front.iov_base;
189 op = (void *)(head + 1);
190 p = (void *)(op + num_op);
191
192 req->r_request = msg;
193 req->r_snapc = ceph_get_snap_context(snapc);
194
195 head->client_inc = cpu_to_le32(1); /* always, for now. */
196 head->flags = cpu_to_le32(flags);
197 if (flags & CEPH_OSD_FLAG_WRITE)
198 ceph_encode_timespec(&head->mtime, mtime);
199 head->num_ops = cpu_to_le16(num_op);
200 op->op = cpu_to_le16(opcode);
201
202 /* calculate max write size */
203 calc_layout(osdc, vino, layout, off, plen, req);
204 req->r_file_layout = *layout; /* keep a copy */
205
206 if (flags & CEPH_OSD_FLAG_WRITE) {
207 req->r_request->hdr.data_off = cpu_to_le16(off);
208 req->r_request->hdr.data_len = cpu_to_le32(*plen);
209 op->payload_len = cpu_to_le32(*plen);
210 }
211 op->extent.truncate_size = cpu_to_le64(truncate_size);
212 op->extent.truncate_seq = cpu_to_le32(truncate_seq);
213
214 /* fill in oid */
215 head->object_len = cpu_to_le32(req->r_oid_len);
216 memcpy(p, req->r_oid, req->r_oid_len);
217 p += req->r_oid_len;
218
219 if (do_sync) {
220 op++;
221 op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
222 }
223 if (snapc) {
224 head->snap_seq = cpu_to_le64(snapc->seq);
225 head->num_snaps = cpu_to_le32(snapc->num_snaps);
226 for (i = 0; i < snapc->num_snaps; i++) {
227 put_unaligned_le64(snapc->snaps[i], p);
228 p += sizeof(u64);
229 }
230 }
231
232 BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
233 msg_size = p - msg->front.iov_base;
234 msg->front.iov_len = msg_size;
235 msg->hdr.front_len = cpu_to_le32(msg_size);
236 return req;
237}
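A hedged sketch of how a write path might drive this (the helper is hypothetical, and the opcode/flag choices from rados.h are illustrative rather than the exact ones the file/addr code uses):

	static struct ceph_osd_request *
	example_build_write(struct ceph_osd_client *osdc, struct ceph_vino vino,
			    struct ceph_file_layout *layout,
			    struct ceph_snap_context *snapc,
			    u64 off, u64 *plen, struct timespec *mtime)
	{
		return ceph_osdc_new_request(osdc, layout, vino, off, plen,
					     CEPH_OSD_OP_WRITE,
					     CEPH_OSD_FLAG_WRITE |
					     CEPH_OSD_FLAG_ONDISK,
					     snapc, 0,	/* no startsync op */
					     0, 0,	/* no truncate info */
					     mtime, false /* no mempool */, 1);
	}

The caller then attaches its page vector and submits the request; on failure ceph_osdc_new_request() returns an ERR_PTR, matching the IS_ERR() checks used elsewhere in this file.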
238
239/*
240 * We keep osd requests in an rbtree, sorted by ->r_tid.
241 */
242static void __insert_request(struct ceph_osd_client *osdc,
243 struct ceph_osd_request *new)
244{
245 struct rb_node **p = &osdc->requests.rb_node;
246 struct rb_node *parent = NULL;
247 struct ceph_osd_request *req = NULL;
248
249 while (*p) {
250 parent = *p;
251 req = rb_entry(parent, struct ceph_osd_request, r_node);
252 if (new->r_tid < req->r_tid)
253 p = &(*p)->rb_left;
254 else if (new->r_tid > req->r_tid)
255 p = &(*p)->rb_right;
256 else
257 BUG();
258 }
259
260 rb_link_node(&new->r_node, parent, p);
261 rb_insert_color(&new->r_node, &osdc->requests);
262}
263
264static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
265 u64 tid)
266{
267 struct ceph_osd_request *req;
268 struct rb_node *n = osdc->requests.rb_node;
269
270 while (n) {
271 req = rb_entry(n, struct ceph_osd_request, r_node);
272 if (tid < req->r_tid)
273 n = n->rb_left;
274 else if (tid > req->r_tid)
275 n = n->rb_right;
276 else
277 return req;
278 }
279 return NULL;
280}
281
282static struct ceph_osd_request *
283__lookup_request_ge(struct ceph_osd_client *osdc,
284 u64 tid)
285{
286 struct ceph_osd_request *req;
287 struct rb_node *n = osdc->requests.rb_node;
288
289 while (n) {
290 req = rb_entry(n, struct ceph_osd_request, r_node);
291 if (tid < req->r_tid) {
292 if (!n->rb_left)
293 return req;
294 n = n->rb_left;
295 } else if (tid > req->r_tid) {
296 n = n->rb_right;
297 } else {
298 return req;
299 }
300 }
301 return NULL;
302}
303
304
305/*
306 * If the osd connection drops, we need to resubmit all requests.
307 */
308static void osd_reset(struct ceph_connection *con)
309{
310 struct ceph_osd *osd = con->private;
311 struct ceph_osd_client *osdc;
312
313 if (!osd)
314 return;
315 dout("osd_reset osd%d\n", osd->o_osd);
316 osdc = osd->o_osdc;
317 down_read(&osdc->map_sem);
318 kick_requests(osdc, osd);
319 up_read(&osdc->map_sem);
320}
321
322/*
323 * Track open sessions with osds.
324 */
325static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
326{
327 struct ceph_osd *osd;
328
329 osd = kzalloc(sizeof(*osd), GFP_NOFS);
330 if (!osd)
331 return NULL;
332
333 atomic_set(&osd->o_ref, 1);
334 osd->o_osdc = osdc;
335 INIT_LIST_HEAD(&osd->o_requests);
336 INIT_LIST_HEAD(&osd->o_osd_lru);
337 osd->o_incarnation = 1;
338
339 ceph_con_init(osdc->client->msgr, &osd->o_con);
340 osd->o_con.private = osd;
341 osd->o_con.ops = &osd_con_ops;
342 osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
343
344 INIT_LIST_HEAD(&osd->o_keepalive_item);
345 return osd;
346}
347
348static struct ceph_osd *get_osd(struct ceph_osd *osd)
349{
350 if (atomic_inc_not_zero(&osd->o_ref)) {
351 dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
352 atomic_read(&osd->o_ref));
353 return osd;
354 } else {
355 dout("get_osd %p FAIL\n", osd);
356 return NULL;
357 }
358}
359
360static void put_osd(struct ceph_osd *osd)
361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref))
365 kfree(osd);
366}
367
368/*
369 * remove an osd from our map
370 */
371static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
372{
373 dout("__remove_osd %p\n", osd);
374 BUG_ON(!list_empty(&osd->o_requests));
375 rb_erase(&osd->o_node, &osdc->osds);
376 list_del_init(&osd->o_osd_lru);
377 ceph_con_close(&osd->o_con);
378 put_osd(osd);
379}
380
381static void __move_osd_to_lru(struct ceph_osd_client *osdc,
382 struct ceph_osd *osd)
383{
384 dout("__move_osd_to_lru %p\n", osd);
385 BUG_ON(!list_empty(&osd->o_osd_lru));
386 list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
387 osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
388}
389
390static void __remove_osd_from_lru(struct ceph_osd *osd)
391{
392 dout("__remove_osd_from_lru %p\n", osd);
393 if (!list_empty(&osd->o_osd_lru))
394 list_del_init(&osd->o_osd_lru);
395}
396
397static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
398{
399 struct ceph_osd *osd, *nosd;
400
 401	dout("remove_old_osds %p\n", osdc);
402 mutex_lock(&osdc->request_mutex);
403 list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
404 if (!remove_all && time_before(jiffies, osd->lru_ttl))
405 break;
406 __remove_osd(osdc, osd);
407 }
408 mutex_unlock(&osdc->request_mutex);
409}
410
411/*
412 * reset osd connect
413 */
414static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
415{
416 int ret = 0;
417
418 dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
419 if (list_empty(&osd->o_requests)) {
420 __remove_osd(osdc, osd);
421 } else {
422 ceph_con_close(&osd->o_con);
423 ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
424 osd->o_incarnation++;
425 }
426 return ret;
427}
428
429static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
430{
431 struct rb_node **p = &osdc->osds.rb_node;
432 struct rb_node *parent = NULL;
433 struct ceph_osd *osd = NULL;
434
435 while (*p) {
436 parent = *p;
437 osd = rb_entry(parent, struct ceph_osd, o_node);
438 if (new->o_osd < osd->o_osd)
439 p = &(*p)->rb_left;
440 else if (new->o_osd > osd->o_osd)
441 p = &(*p)->rb_right;
442 else
443 BUG();
444 }
445
446 rb_link_node(&new->o_node, parent, p);
447 rb_insert_color(&new->o_node, &osdc->osds);
448}
449
450static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
451{
452 struct ceph_osd *osd;
453 struct rb_node *n = osdc->osds.rb_node;
454
455 while (n) {
456 osd = rb_entry(n, struct ceph_osd, o_node);
457 if (o < osd->o_osd)
458 n = n->rb_left;
459 else if (o > osd->o_osd)
460 n = n->rb_right;
461 else
462 return osd;
463 }
464 return NULL;
465}
466
467static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
468{
469 schedule_delayed_work(&osdc->timeout_work,
470 osdc->client->mount_args->osd_keepalive_timeout * HZ);
471}
472
473static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
474{
475 cancel_delayed_work(&osdc->timeout_work);
476}
477
478/*
479 * Register request, assign tid. If this is the first request, set up
480 * the timeout event.
481 */
482static void register_request(struct ceph_osd_client *osdc,
483 struct ceph_osd_request *req)
484{
485 mutex_lock(&osdc->request_mutex);
486 req->r_tid = ++osdc->last_tid;
487 req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
488 INIT_LIST_HEAD(&req->r_req_lru_item);
489
490 dout("register_request %p tid %lld\n", req, req->r_tid);
491 __insert_request(osdc, req);
492 ceph_osdc_get_request(req);
493 osdc->num_requests++;
494
495 if (osdc->num_requests == 1) {
496 dout(" first request, scheduling timeout\n");
497 __schedule_osd_timeout(osdc);
498 }
499 mutex_unlock(&osdc->request_mutex);
500}
501
502/*
503 * called under osdc->request_mutex
504 */
505static void __unregister_request(struct ceph_osd_client *osdc,
506 struct ceph_osd_request *req)
507{
508 dout("__unregister_request %p tid %lld\n", req, req->r_tid);
509 rb_erase(&req->r_node, &osdc->requests);
510 osdc->num_requests--;
511
512 if (req->r_osd) {
513 /* make sure the original request isn't in flight. */
514 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
515
516 list_del_init(&req->r_osd_item);
517 if (list_empty(&req->r_osd->o_requests))
518 __move_osd_to_lru(osdc, req->r_osd);
519 req->r_osd = NULL;
520 }
521
522 ceph_osdc_put_request(req);
523
524 list_del_init(&req->r_req_lru_item);
525 if (osdc->num_requests == 0) {
526 dout(" no requests, canceling timeout\n");
527 __cancel_osd_timeout(osdc);
528 }
529}
530
531/*
532 * Cancel a previously queued request message
533 */
534static void __cancel_request(struct ceph_osd_request *req)
535{
536 if (req->r_sent) {
537 ceph_con_revoke(&req->r_osd->o_con, req->r_request);
538 req->r_sent = 0;
539 }
540 list_del_init(&req->r_req_lru_item);
541}
542
543/*
544 * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
545 * (as needed), and set the request r_osd appropriately. If there is
546 * no up osd, set r_osd to NULL.
547 *
548 * Return 0 if unchanged, 1 if changed, or negative on error.
549 *
550 * Caller should hold map_sem for read and request_mutex.
551 */
552static int __map_osds(struct ceph_osd_client *osdc,
553 struct ceph_osd_request *req)
554{
555 struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
556 struct ceph_pg pgid;
557 int o = -1;
558 int err;
559
560 dout("map_osds %p tid %lld\n", req, req->r_tid);
561 err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
562 &req->r_file_layout, osdc->osdmap);
563 if (err)
564 return err;
565 pgid = reqhead->layout.ol_pgid;
566 req->r_pgid = pgid;
567
568 o = ceph_calc_pg_primary(osdc->osdmap, pgid);
569
570 if ((req->r_osd && req->r_osd->o_osd == o &&
571 req->r_sent >= req->r_osd->o_incarnation) ||
572 (req->r_osd == NULL && o == -1))
573 return 0; /* no change */
574
575 dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
576 req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
577 req->r_osd ? req->r_osd->o_osd : -1);
578
579 if (req->r_osd) {
580 __cancel_request(req);
581 list_del_init(&req->r_osd_item);
582 req->r_osd = NULL;
583 }
584
585 req->r_osd = __lookup_osd(osdc, o);
586 if (!req->r_osd && o >= 0) {
587 err = -ENOMEM;
588 req->r_osd = create_osd(osdc);
589 if (!req->r_osd)
590 goto out;
591
592 dout("map_osds osd %p is osd%d\n", req->r_osd, o);
593 req->r_osd->o_osd = o;
594 req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
595 __insert_osd(osdc, req->r_osd);
596
597 ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
598 }
599
600 if (req->r_osd) {
601 __remove_osd_from_lru(req->r_osd);
602 list_add(&req->r_osd_item, &req->r_osd->o_requests);
603 }
604 err = 1; /* osd changed */
605
606out:
607 return err;
608}
609
610/*
611 * caller should hold map_sem (for read) and request_mutex
612 */
613static int __send_request(struct ceph_osd_client *osdc,
614 struct ceph_osd_request *req)
615{
616 struct ceph_osd_request_head *reqhead;
617 int err;
618
619 err = __map_osds(osdc, req);
620 if (err < 0)
621 return err;
622 if (req->r_osd == NULL) {
623 dout("send_request %p no up osds in pg\n", req);
624 ceph_monc_request_next_osdmap(&osdc->client->monc);
625 return 0;
626 }
627
628 dout("send_request %p tid %llu to osd%d flags %d\n",
629 req, req->r_tid, req->r_osd->o_osd, req->r_flags);
630
631 reqhead = req->r_request->front.iov_base;
632 reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
633 reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
634 reqhead->reassert_version = req->r_reassert_version;
635
636 req->r_sent_stamp = jiffies;
 637	list_move_tail(&req->r_req_lru_item, &osdc->req_lru);
638
639 ceph_msg_get(req->r_request); /* send consumes a ref */
640 ceph_con_send(&req->r_osd->o_con, req->r_request);
641 req->r_sent = req->r_osd->o_incarnation;
642 return 0;
643}
644
645/*
646 * Timeout callback, called every N seconds when 1 or more osd
 647 * requests have been active for more than N seconds. When this
648 * happens, we ping all OSDs with requests who have timed out to
649 * ensure any communications channel reset is detected. Reset the
650 * request timeouts another N seconds in the future as we go.
651 * Reschedule the timeout event another N seconds in future (unless
652 * there are no open requests).
653 */
654static void handle_timeout(struct work_struct *work)
655{
656 struct ceph_osd_client *osdc =
657 container_of(work, struct ceph_osd_client, timeout_work.work);
658 struct ceph_osd_request *req, *last_req = NULL;
659 struct ceph_osd *osd;
660 unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
661 unsigned long keepalive =
662 osdc->client->mount_args->osd_keepalive_timeout * HZ;
663 unsigned long last_sent = 0;
664 struct rb_node *p;
665 struct list_head slow_osds;
666
667 dout("timeout\n");
668 down_read(&osdc->map_sem);
669
670 ceph_monc_request_next_osdmap(&osdc->client->monc);
671
672 mutex_lock(&osdc->request_mutex);
673 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
674 req = rb_entry(p, struct ceph_osd_request, r_node);
675
676 if (req->r_resend) {
677 int err;
678
679 dout("osdc resending prev failed %lld\n", req->r_tid);
680 err = __send_request(osdc, req);
681 if (err)
682 dout("osdc failed again on %lld\n", req->r_tid);
683 else
684 req->r_resend = false;
685 continue;
686 }
687 }
688
689 /*
690 * reset osds that appear to be _really_ unresponsive. this
 691	 * is a failsafe measure; we really shouldn't be getting to
692 * this point if the system is working properly. the monitors
693 * should mark the osd as failed and we should find out about
694 * it from an updated osd map.
695 */
696 while (!list_empty(&osdc->req_lru)) {
697 req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
698 r_req_lru_item);
699
700 if (time_before(jiffies, req->r_sent_stamp + timeout))
701 break;
702
703 BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
704 last_req = req;
705 last_sent = req->r_sent_stamp;
706
707 osd = req->r_osd;
708 BUG_ON(!osd);
709 pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
710 req->r_tid, osd->o_osd);
711 __kick_requests(osdc, osd);
712 }
713
714 /*
715 * ping osds that are a bit slow. this ensures that if there
716 * is a break in the TCP connection we will notice, and reopen
717 * a connection with that osd (from the fault callback).
718 */
719 INIT_LIST_HEAD(&slow_osds);
720 list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
721 if (time_before(jiffies, req->r_sent_stamp + keepalive))
722 break;
723
724 osd = req->r_osd;
725 BUG_ON(!osd);
726 dout(" tid %llu is slow, will send keepalive on osd%d\n",
727 req->r_tid, osd->o_osd);
728 list_move_tail(&osd->o_keepalive_item, &slow_osds);
729 }
730 while (!list_empty(&slow_osds)) {
731 osd = list_entry(slow_osds.next, struct ceph_osd,
732 o_keepalive_item);
733 list_del_init(&osd->o_keepalive_item);
734 ceph_con_keepalive(&osd->o_con);
735 }
736
737 __schedule_osd_timeout(osdc);
738 mutex_unlock(&osdc->request_mutex);
739
740 up_read(&osdc->map_sem);
741}
742
743static void handle_osds_timeout(struct work_struct *work)
744{
745 struct ceph_osd_client *osdc =
746 container_of(work, struct ceph_osd_client,
747 osds_timeout_work.work);
748 unsigned long delay =
749 osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
750
751 dout("osds timeout\n");
752 down_read(&osdc->map_sem);
753 remove_old_osds(osdc, 0);
754 up_read(&osdc->map_sem);
755
756 schedule_delayed_work(&osdc->osds_timeout_work,
757 round_jiffies_relative(delay));
758}
759
760/*
761 * handle osd op reply. either call the callback if it is specified,
762 * or do the completion to wake up the waiting thread.
763 */
764static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
765 struct ceph_connection *con)
766{
767 struct ceph_osd_reply_head *rhead = msg->front.iov_base;
768 struct ceph_osd_request *req;
769 u64 tid;
770 int numops, object_len, flags;
771
772 tid = le64_to_cpu(msg->hdr.tid);
773 if (msg->front.iov_len < sizeof(*rhead))
774 goto bad;
775 numops = le32_to_cpu(rhead->num_ops);
776 object_len = le32_to_cpu(rhead->object_len);
777 if (msg->front.iov_len != sizeof(*rhead) + object_len +
778 numops * sizeof(struct ceph_osd_op))
779 goto bad;
780 dout("handle_reply %p tid %llu\n", msg, tid);
781
782 /* lookup */
783 mutex_lock(&osdc->request_mutex);
784 req = __lookup_request(osdc, tid);
785 if (req == NULL) {
786 dout("handle_reply tid %llu dne\n", tid);
787 mutex_unlock(&osdc->request_mutex);
788 return;
789 }
790 ceph_osdc_get_request(req);
791 flags = le32_to_cpu(rhead->flags);
792
793 /*
794 * if this connection filled our message, drop our reference now, to
795 * avoid a (safe but slower) revoke later.
796 */
797 if (req->r_con_filling_msg == con && req->r_reply == msg) {
798 dout(" dropping con_filling_msg ref %p\n", con);
799 req->r_con_filling_msg = NULL;
800 ceph_con_put(con);
801 }
802
803 if (!req->r_got_reply) {
804 unsigned bytes;
805
806 req->r_result = le32_to_cpu(rhead->result);
807 bytes = le32_to_cpu(msg->hdr.data_len);
808 dout("handle_reply result %d bytes %d\n", req->r_result,
809 bytes);
810 if (req->r_result == 0)
811 req->r_result = bytes;
812
813 /* in case this is a write and we need to replay, */
814 req->r_reassert_version = rhead->reassert_version;
815
816 req->r_got_reply = 1;
817 } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
818 dout("handle_reply tid %llu dup ack\n", tid);
819 mutex_unlock(&osdc->request_mutex);
820 goto done;
821 }
822
823 dout("handle_reply tid %llu flags %d\n", tid, flags);
824
825 /* either this is a read, or we got the safe response */
826 if ((flags & CEPH_OSD_FLAG_ONDISK) ||
827 ((flags & CEPH_OSD_FLAG_WRITE) == 0))
828 __unregister_request(osdc, req);
829
830 mutex_unlock(&osdc->request_mutex);
831
832 if (req->r_callback)
833 req->r_callback(req, msg);
834 else
835 complete(&req->r_completion);
836
837 if (flags & CEPH_OSD_FLAG_ONDISK) {
838 if (req->r_safe_callback)
839 req->r_safe_callback(req, msg);
840 complete(&req->r_safe_completion); /* fsync waiter */
841 }
842
843done:
844 ceph_osdc_put_request(req);
845 return;
846
847bad:
848 pr_err("corrupt osd_op_reply got %d %d expected %d\n",
849 (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
850 (int)sizeof(*rhead));
851 ceph_msg_dump(msg);
852}
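
A minimal sketch of how the two completion hooks handled above might be wired up for an asynchronous write; the helper names below are hypothetical, and only r_callback, r_safe_callback, and ceph_osdc_start_request() come from this code:

static void my_write_ack(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	/* first reply: the write has been applied in memory on the osds */
}

static void my_write_safe(struct ceph_osd_request *req, struct ceph_msg *msg)
{
	/* only with CEPH_OSD_FLAG_ONDISK: the write is committed to disk */
}

static int submit_async_write(struct ceph_osd_client *osdc,
			      struct ceph_osd_request *req)
{
	req->r_callback = my_write_ack;		/* ack */
	req->r_safe_callback = my_write_safe;	/* commit */
	return ceph_osdc_start_request(osdc, req, false);
}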
853
854
855static int __kick_requests(struct ceph_osd_client *osdc,
856 struct ceph_osd *kickosd)
857{
858 struct ceph_osd_request *req;
859 struct rb_node *p, *n;
860 int needmap = 0;
861 int err;
862
863 dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
864 if (kickosd) {
865 __reset_osd(osdc, kickosd);
866 } else {
867 for (p = rb_first(&osdc->osds); p; p = n) {
868 struct ceph_osd *osd =
869 rb_entry(p, struct ceph_osd, o_node);
870
871 n = rb_next(p);
872 if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
873 memcmp(&osd->o_con.peer_addr,
874 ceph_osd_addr(osdc->osdmap,
875 osd->o_osd),
876 sizeof(struct ceph_entity_addr)) != 0)
877 __reset_osd(osdc, osd);
878 }
879 }
880
881 for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
882 req = rb_entry(p, struct ceph_osd_request, r_node);
883
884 if (req->r_resend) {
885 dout(" r_resend set on tid %llu\n", req->r_tid);
886 __cancel_request(req);
887 goto kick;
888 }
889 if (req->r_osd && kickosd == req->r_osd) {
890 __cancel_request(req);
891 goto kick;
892 }
893
894 err = __map_osds(osdc, req);
895 if (err == 0)
896 continue; /* no change */
897 if (err < 0) {
898 /*
899 * FIXME: really, we should set the request
900 * error and fail if this isn't a 'nofail'
901 * request, but that's a fair bit more
902 * complicated to do. So retry!
903 */
904 dout(" setting r_resend on %llu\n", req->r_tid);
905 req->r_resend = true;
906 continue;
907 }
908 if (req->r_osd == NULL) {
909 dout("tid %llu maps to no valid osd\n", req->r_tid);
910 needmap++; /* request a newer map */
911 continue;
912 }
913
914kick:
915 dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
916 req->r_osd->o_osd);
917 req->r_flags |= CEPH_OSD_FLAG_RETRY;
918 err = __send_request(osdc, req);
919 if (err) {
920 dout(" setting r_resend on %llu\n", req->r_tid);
921 req->r_resend = true;
922 }
923 }
924
925 return needmap;
926}
927
928/*
929 * Resubmit osd requests whose osd or osd address has changed. Request
930 * a new osd map if osds are down, or we are otherwise unable to determine
931 * how to direct a request.
932 *
933 * Close connections to down osds.
934 *
935 * If @who is specified, resubmit requests for that specific osd.
936 *
937 * Caller should hold map_sem for read and request_mutex.
938 */
939static void kick_requests(struct ceph_osd_client *osdc,
940 struct ceph_osd *kickosd)
941{
942 int needmap;
943
944 mutex_lock(&osdc->request_mutex);
945 needmap = __kick_requests(osdc, kickosd);
946 mutex_unlock(&osdc->request_mutex);
947
948 if (needmap) {
949 dout("%d requests for down osds, need new map\n", needmap);
950 ceph_monc_request_next_osdmap(&osdc->client->monc);
951 }
 952}
 953
 954/*
955 * Process updated osd map.
956 *
957 * The message contains any number of incremental and full maps, normally
958 * indicating some sort of topology change in the cluster. Kick requests
959 * off to different OSDs as needed.
960 */
961void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
962{
963 void *p, *end, *next;
964 u32 nr_maps, maplen;
965 u32 epoch;
966 struct ceph_osdmap *newmap = NULL, *oldmap;
967 int err;
968 struct ceph_fsid fsid;
969
970 dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
971 p = msg->front.iov_base;
972 end = p + msg->front.iov_len;
973
974 /* verify fsid */
975 ceph_decode_need(&p, end, sizeof(fsid), bad);
976 ceph_decode_copy(&p, &fsid, sizeof(fsid));
977 if (ceph_check_fsid(osdc->client, &fsid) < 0)
978 return;
979
980 down_write(&osdc->map_sem);
981
982 /* incremental maps */
983 ceph_decode_32_safe(&p, end, nr_maps, bad);
984 dout(" %d inc maps\n", nr_maps);
985 while (nr_maps > 0) {
986 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
987 epoch = ceph_decode_32(&p);
988 maplen = ceph_decode_32(&p);
989 ceph_decode_need(&p, end, maplen, bad);
990 next = p + maplen;
991 if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
992 dout("applying incremental map %u len %d\n",
993 epoch, maplen);
994 newmap = osdmap_apply_incremental(&p, next,
995 osdc->osdmap,
996 osdc->client->msgr);
997 if (IS_ERR(newmap)) {
998 err = PTR_ERR(newmap);
999 goto bad;
1000 }
1001 BUG_ON(!newmap);
1002 if (newmap != osdc->osdmap) {
1003 ceph_osdmap_destroy(osdc->osdmap);
1004 osdc->osdmap = newmap;
1005 }
1006 } else {
1007 dout("ignoring incremental map %u len %d\n",
1008 epoch, maplen);
1009 }
1010 p = next;
1011 nr_maps--;
1012 }
1013 if (newmap)
1014 goto done;
1015
1016 /* full maps */
1017 ceph_decode_32_safe(&p, end, nr_maps, bad);
1018 dout(" %d full maps\n", nr_maps);
1019 while (nr_maps) {
1020 ceph_decode_need(&p, end, 2*sizeof(u32), bad);
1021 epoch = ceph_decode_32(&p);
1022 maplen = ceph_decode_32(&p);
1023 ceph_decode_need(&p, end, maplen, bad);
1024 if (nr_maps > 1) {
1025 dout("skipping non-latest full map %u len %d\n",
1026 epoch, maplen);
1027 } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
1028 dout("skipping full map %u len %d, "
1029 "older than our %u\n", epoch, maplen,
1030 osdc->osdmap->epoch);
1031 } else {
1032 dout("taking full map %u len %d\n", epoch, maplen);
1033 newmap = osdmap_decode(&p, p+maplen);
1034 if (IS_ERR(newmap)) {
1035 err = PTR_ERR(newmap);
1036 goto bad;
1037 }
1038 BUG_ON(!newmap);
1039 oldmap = osdc->osdmap;
1040 osdc->osdmap = newmap;
1041 if (oldmap)
1042 ceph_osdmap_destroy(oldmap);
1043 }
1044 p += maplen;
1045 nr_maps--;
1046 }
1047
1048done:
1049 downgrade_write(&osdc->map_sem);
1050 ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
1051 if (newmap)
1052 kick_requests(osdc, NULL);
1053 up_read(&osdc->map_sem);
1054 return;
1055
1056bad:
1057 pr_err("osdc handle_map corrupt msg\n");
1058 ceph_msg_dump(msg);
1059 up_write(&osdc->map_sem);
1060 return;
1061}
1062
1063
1064/*
1065 * A read request prepares specific pages that data is to be read into.
1066 * When a message is being read off the wire, we call __prepare_pages to
1067 * find those pages.
1068 * 0 = success, -1 failure.
1069 */
1070static int __prepare_pages(struct ceph_connection *con,
1071 struct ceph_msg_header *hdr,
1072 struct ceph_osd_request *req,
1073 u64 tid,
1074 struct ceph_msg *m)
1075{
1076 struct ceph_osd *osd = con->private;
1077 struct ceph_osd_client *osdc;
1078 int ret = -1;
1079 int data_len = le32_to_cpu(hdr->data_len);
1080 unsigned data_off = le16_to_cpu(hdr->data_off);
1081
1082 int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
1083
1084 if (!osd)
1085 return -1;
1086
1087 osdc = osd->o_osdc;
1088
1089 dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
1090 tid, req->r_num_pages, want);
1091 if (unlikely(req->r_num_pages < want))
1092 goto out;
1093 m->pages = req->r_pages;
1094 m->nr_pages = req->r_num_pages;
1095 ret = 0; /* success */
1096out:
1097 BUG_ON(ret < 0 || m->nr_pages < want);
1098
1099 return ret;
1100}
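
A worked example of the page accounting above, assuming calc_pages_for(off, len) counts the distinct PAGE_SIZE pages touched by the byte range: with 4 KB pages, data_off = 512 and data_len = 8192 give want = calc_pages_for(512, 8192) = 3, since the range [512, 8704) straddles pages 0, 1, and 2 even though 8192 bytes alone would fit in two pages.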
1101
1102/*
1103 * Register request, send initial attempt.
1104 */
1105int ceph_osdc_start_request(struct ceph_osd_client *osdc,
1106 struct ceph_osd_request *req,
1107 bool nofail)
1108{
1109 int rc = 0;
1110
1111 req->r_request->pages = req->r_pages;
1112 req->r_request->nr_pages = req->r_num_pages;
1113
1114 register_request(osdc, req);
1115
1116 down_read(&osdc->map_sem);
1117 mutex_lock(&osdc->request_mutex);
1118 /*
1119 * a racing kick_requests() may have sent the message for us
1120 * while we dropped request_mutex above, so only send now if
1121	 * the request still hasn't been touched yet.
1122 */
1123 if (req->r_sent == 0) {
1124 rc = __send_request(osdc, req);
1125 if (rc) {
1126 if (nofail) {
1127				dout("osdc_start_request failed send, "
1128				     "marking %lld\n", req->r_tid);
1129 req->r_resend = true;
1130 rc = 0;
1131 } else {
1132 __unregister_request(osdc, req);
1133 }
1134 }
1135 }
1136 mutex_unlock(&osdc->request_mutex);
1137 up_read(&osdc->map_sem);
1138 return rc;
1139}
1140
1141/*
1142 * wait for a request to complete
1143 */
1144int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
1145 struct ceph_osd_request *req)
1146{
1147 int rc;
1148
1149 rc = wait_for_completion_interruptible(&req->r_completion);
1150 if (rc < 0) {
1151 mutex_lock(&osdc->request_mutex);
1152 __cancel_request(req);
1153 __unregister_request(osdc, req);
1154 mutex_unlock(&osdc->request_mutex);
1155 dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
1156 return rc;
1157 }
1158
1159 dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
1160 return req->r_result;
1161}
1162
1163/*
1164 * sync - wait for all in-flight requests to flush. avoid starvation.
1165 */
1166void ceph_osdc_sync(struct ceph_osd_client *osdc)
1167{
1168 struct ceph_osd_request *req;
1169 u64 last_tid, next_tid = 0;
1170
1171 mutex_lock(&osdc->request_mutex);
1172 last_tid = osdc->last_tid;
1173 while (1) {
1174 req = __lookup_request_ge(osdc, next_tid);
1175 if (!req)
1176 break;
1177 if (req->r_tid > last_tid)
1178 break;
1179
1180 next_tid = req->r_tid + 1;
1181 if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
1182 continue;
1183
1184 ceph_osdc_get_request(req);
1185 mutex_unlock(&osdc->request_mutex);
1186 dout("sync waiting on tid %llu (last is %llu)\n",
1187 req->r_tid, last_tid);
1188 wait_for_completion(&req->r_safe_completion);
1189 mutex_lock(&osdc->request_mutex);
1190 ceph_osdc_put_request(req);
1191 }
1192 mutex_unlock(&osdc->request_mutex);
1193 dout("sync done (thru tid %llu)\n", last_tid);
1194}
1195
1196/*
1197 * init, shutdown
1198 */
1199int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
1200{
1201 int err;
1202
1203 dout("init\n");
1204 osdc->client = client;
1205 osdc->osdmap = NULL;
1206 init_rwsem(&osdc->map_sem);
1207 init_completion(&osdc->map_waiters);
1208 osdc->last_requested_map = 0;
1209 mutex_init(&osdc->request_mutex);
1210 osdc->last_tid = 0;
1211 osdc->osds = RB_ROOT;
1212 INIT_LIST_HEAD(&osdc->osd_lru);
1213 osdc->requests = RB_ROOT;
1214 INIT_LIST_HEAD(&osdc->req_lru);
1215 osdc->num_requests = 0;
1216 INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
1217 INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
1218
1219 schedule_delayed_work(&osdc->osds_timeout_work,
1220 round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
1221
1222 err = -ENOMEM;
1223 osdc->req_mempool = mempool_create_kmalloc_pool(10,
1224 sizeof(struct ceph_osd_request));
1225 if (!osdc->req_mempool)
1226 goto out;
1227
1228 err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
1229 if (err < 0)
1230 goto out_mempool;
1231 err = ceph_msgpool_init(&osdc->msgpool_op_reply,
1232 OSD_OPREPLY_FRONT_LEN, 10, true);
1233 if (err < 0)
1234 goto out_msgpool;
1235 return 0;
1236
1237out_msgpool:
1238 ceph_msgpool_destroy(&osdc->msgpool_op);
1239out_mempool:
1240 mempool_destroy(osdc->req_mempool);
1241out:
1242 return err;
1243}
1244
1245void ceph_osdc_stop(struct ceph_osd_client *osdc)
1246{
1247 cancel_delayed_work_sync(&osdc->timeout_work);
1248 cancel_delayed_work_sync(&osdc->osds_timeout_work);
1249 if (osdc->osdmap) {
1250 ceph_osdmap_destroy(osdc->osdmap);
1251 osdc->osdmap = NULL;
1252 }
1253 remove_old_osds(osdc, 1);
1254 mempool_destroy(osdc->req_mempool);
1255 ceph_msgpool_destroy(&osdc->msgpool_op);
1256 ceph_msgpool_destroy(&osdc->msgpool_op_reply);
1257}
1258
1259/*
1260 * Read some contiguous pages. If we cross a stripe boundary, shorten
1261 * *plen. Return number of bytes read, or error.
1262 */
1263int ceph_osdc_readpages(struct ceph_osd_client *osdc,
1264 struct ceph_vino vino, struct ceph_file_layout *layout,
1265 u64 off, u64 *plen,
1266 u32 truncate_seq, u64 truncate_size,
1267 struct page **pages, int num_pages)
1268{
1269 struct ceph_osd_request *req;
1270 int rc = 0;
1271
1272 dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
1273 vino.snap, off, *plen);
1274 req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
1275 CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
1276 NULL, 0, truncate_seq, truncate_size, NULL,
1277 false, 1);
1278 if (IS_ERR(req))
1279 return PTR_ERR(req);
1280
1281 /* it may be a short read due to an object boundary */
1282 req->r_pages = pages;
1283 num_pages = calc_pages_for(off, *plen);
1284 req->r_num_pages = num_pages;
1285
1286 dout("readpages final extent is %llu~%llu (%d pages)\n",
1287 off, *plen, req->r_num_pages);
1288
1289 rc = ceph_osdc_start_request(osdc, req, false);
1290 if (!rc)
1291 rc = ceph_osdc_wait_request(osdc, req);
1292
1293 ceph_osdc_put_request(req);
1294 dout("readpages result %d\n", rc);
1295 return rc;
1296}
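
A hedged usage sketch for ceph_osdc_readpages(); everything other than the readpages call itself (the wrapper name, the single-page setup) is illustrative:

static int read_one_page(struct ceph_osd_client *osdc, struct ceph_vino vino,
			 struct ceph_file_layout *layout, u64 off,
			 struct page *page)
{
	u64 len = PAGE_CACHE_SIZE;

	/* len may be shortened if the range crosses an object boundary */
	return ceph_osdc_readpages(osdc, vino, layout, off, &len,
				   0, 0,	/* no truncation in effect */
				   &page, 1);
}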
1297
1298/*
1299 * do a synchronous write on N pages
1300 */
1301int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
1302 struct ceph_file_layout *layout,
1303 struct ceph_snap_context *snapc,
1304 u64 off, u64 len,
1305 u32 truncate_seq, u64 truncate_size,
1306 struct timespec *mtime,
1307 struct page **pages, int num_pages,
1308 int flags, int do_sync, bool nofail)
1309{
1310 struct ceph_osd_request *req;
1311 int rc = 0;
1312
1313 BUG_ON(vino.snap != CEPH_NOSNAP);
1314 req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
1315 CEPH_OSD_OP_WRITE,
1316 flags | CEPH_OSD_FLAG_ONDISK |
1317 CEPH_OSD_FLAG_WRITE,
1318 snapc, do_sync,
1319 truncate_seq, truncate_size, mtime,
1320 nofail, 1);
1321 if (IS_ERR(req))
1322 return PTR_ERR(req);
1323
1324 /* it may be a short write due to an object boundary */
1325 req->r_pages = pages;
1326 req->r_num_pages = calc_pages_for(off, len);
1327 dout("writepages %llu~%llu (%d pages)\n", off, len,
1328 req->r_num_pages);
1329
1330 rc = ceph_osdc_start_request(osdc, req, nofail);
1331 if (!rc)
1332 rc = ceph_osdc_wait_request(osdc, req);
1333
1334 ceph_osdc_put_request(req);
1335 if (rc == 0)
1336 rc = len;
1337 dout("writepages result %d\n", rc);
1338 return rc;
1339}
1340
1341/*
1342 * handle incoming message
1343 */
1344static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
1345{
1346 struct ceph_osd *osd = con->private;
1347 struct ceph_osd_client *osdc;
1348 int type = le16_to_cpu(msg->hdr.type);
1349
1350 if (!osd)
1351 return;
1352 osdc = osd->o_osdc;
1353
1354 switch (type) {
1355 case CEPH_MSG_OSD_MAP:
1356 ceph_osdc_handle_map(osdc, msg);
1357 break;
1358 case CEPH_MSG_OSD_OPREPLY:
1359 handle_reply(osdc, msg, con);
1360 break;
1361
1362 default:
1363 pr_err("received unknown message type %d %s\n", type,
1364 ceph_msg_type_name(type));
1365 }
1366 ceph_msg_put(msg);
1367}
1368
1369/*
1370 * lookup and return message for incoming reply
1371 */
1372static struct ceph_msg *get_reply(struct ceph_connection *con,
1373 struct ceph_msg_header *hdr,
1374 int *skip)
1375{
1376 struct ceph_osd *osd = con->private;
1377 struct ceph_osd_client *osdc = osd->o_osdc;
1378 struct ceph_msg *m;
1379 struct ceph_osd_request *req;
1380 int front = le32_to_cpu(hdr->front_len);
1381 int data_len = le32_to_cpu(hdr->data_len);
1382 u64 tid;
1383 int err;
1384
1385 tid = le64_to_cpu(hdr->tid);
1386 mutex_lock(&osdc->request_mutex);
1387 req = __lookup_request(osdc, tid);
1388 if (!req) {
1389 *skip = 1;
1390 m = NULL;
1391 pr_info("get_reply unknown tid %llu from osd%d\n", tid,
1392 osd->o_osd);
1393 goto out;
1394 }
1395
1396 if (req->r_con_filling_msg) {
1397 dout("get_reply revoking msg %p from old con %p\n",
1398 req->r_reply, req->r_con_filling_msg);
1399 ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
1400 ceph_con_put(req->r_con_filling_msg);
1401 }
1402
1403 if (front > req->r_reply->front.iov_len) {
1404 pr_warning("get_reply front %d > preallocated %d\n",
1405 front, (int)req->r_reply->front.iov_len);
1406 m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
1407 if (IS_ERR(m))
1408 goto out;
1409 ceph_msg_put(req->r_reply);
1410 req->r_reply = m;
1411 }
1412 m = ceph_msg_get(req->r_reply);
1413
1414	if (data_len > 0) {
1415		err = __prepare_pages(con, hdr, req, tid, m);
1416		if (err < 0) {
1417			*skip = 1;
1418			ceph_msg_put(m);
1419			m = ERR_PTR(err);
1420			goto out;
1421		}
1422	}
1423	*skip = 0;
1424	req->r_con_filling_msg = ceph_con_get(con);
1424 dout("get_reply tid %lld %p\n", tid, m);
1425
1426out:
1427 mutex_unlock(&osdc->request_mutex);
1428 return m;
1429
1430}
1431
1432static struct ceph_msg *alloc_msg(struct ceph_connection *con,
1433 struct ceph_msg_header *hdr,
1434 int *skip)
1435{
1436 struct ceph_osd *osd = con->private;
1437 int type = le16_to_cpu(hdr->type);
1438 int front = le32_to_cpu(hdr->front_len);
1439
1440 switch (type) {
1441 case CEPH_MSG_OSD_MAP:
1442 return ceph_msg_new(type, front, 0, 0, NULL);
1443 case CEPH_MSG_OSD_OPREPLY:
1444 return get_reply(con, hdr, skip);
1445 default:
1446 pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
1447 osd->o_osd);
1448 *skip = 1;
1449 return NULL;
1450 }
1451}
1452
1453/*
1454 * Wrappers to refcount containing ceph_osd struct
1455 */
1456static struct ceph_connection *get_osd_con(struct ceph_connection *con)
1457{
1458 struct ceph_osd *osd = con->private;
1459 if (get_osd(osd))
1460 return con;
1461 return NULL;
1462}
1463
1464static void put_osd_con(struct ceph_connection *con)
1465{
1466 struct ceph_osd *osd = con->private;
1467 put_osd(osd);
1468}
1469
1470/*
1471 * authentication
1472 */
1473static int get_authorizer(struct ceph_connection *con,
1474 void **buf, int *len, int *proto,
1475 void **reply_buf, int *reply_len, int force_new)
1476{
1477 struct ceph_osd *o = con->private;
1478 struct ceph_osd_client *osdc = o->o_osdc;
1479 struct ceph_auth_client *ac = osdc->client->monc.auth;
1480 int ret = 0;
1481
1482 if (force_new && o->o_authorizer) {
1483 ac->ops->destroy_authorizer(ac, o->o_authorizer);
1484 o->o_authorizer = NULL;
1485 }
1486 if (o->o_authorizer == NULL) {
1487 ret = ac->ops->create_authorizer(
1488 ac, CEPH_ENTITY_TYPE_OSD,
1489 &o->o_authorizer,
1490 &o->o_authorizer_buf,
1491 &o->o_authorizer_buf_len,
1492 &o->o_authorizer_reply_buf,
1493 &o->o_authorizer_reply_buf_len);
1494 if (ret)
1495 return ret;
1496 }
1497
1498 *proto = ac->protocol;
1499 *buf = o->o_authorizer_buf;
1500 *len = o->o_authorizer_buf_len;
1501 *reply_buf = o->o_authorizer_reply_buf;
1502 *reply_len = o->o_authorizer_reply_buf_len;
1503 return 0;
1504}
1505
1506
1507static int verify_authorizer_reply(struct ceph_connection *con, int len)
1508{
1509 struct ceph_osd *o = con->private;
1510 struct ceph_osd_client *osdc = o->o_osdc;
1511 struct ceph_auth_client *ac = osdc->client->monc.auth;
1512
1513 return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
1514}
1515
1516static int invalidate_authorizer(struct ceph_connection *con)
1517{
1518 struct ceph_osd *o = con->private;
1519 struct ceph_osd_client *osdc = o->o_osdc;
1520 struct ceph_auth_client *ac = osdc->client->monc.auth;
1521
1522 if (ac->ops->invalidate_authorizer)
1523 ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
1524
1525 return ceph_monc_validate_auth(&osdc->client->monc);
1526}
1527
1528static const struct ceph_connection_operations osd_con_ops = {
1529 .get = get_osd_con,
1530 .put = put_osd_con,
1531 .dispatch = dispatch,
1532 .get_authorizer = get_authorizer,
1533 .verify_authorizer_reply = verify_authorizer_reply,
1534 .invalidate_authorizer = invalidate_authorizer,
1535 .alloc_msg = alloc_msg,
1536 .fault = osd_reset,
1537};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 000000000000..1b1a3ca43afc
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
1#ifndef _FS_CEPH_OSD_CLIENT_H
2#define _FS_CEPH_OSD_CLIENT_H
3
4#include <linux/completion.h>
5#include <linux/kref.h>
6#include <linux/mempool.h>
7#include <linux/rbtree.h>
8
9#include "types.h"
10#include "osdmap.h"
11#include "messenger.h"
12
13struct ceph_msg;
14struct ceph_snap_context;
15struct ceph_osd_request;
16struct ceph_osd_client;
17struct ceph_authorizer;
18
19/*
20 * completion callback for async writepages
21 */
22typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
23 struct ceph_msg *);
24
25/* a given osd we're communicating with */
26struct ceph_osd {
27 atomic_t o_ref;
28 struct ceph_osd_client *o_osdc;
29 int o_osd;
30 int o_incarnation;
31 struct rb_node o_node;
32 struct ceph_connection o_con;
33 struct list_head o_requests;
34 struct list_head o_osd_lru;
35 struct ceph_authorizer *o_authorizer;
36 void *o_authorizer_buf, *o_authorizer_reply_buf;
37 size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
38 unsigned long lru_ttl;
39 int o_marked_for_keepalive;
40 struct list_head o_keepalive_item;
41};
42
43/* an in-flight request */
44struct ceph_osd_request {
45 u64 r_tid; /* unique for this client */
46 struct rb_node r_node;
47 struct list_head r_req_lru_item;
48 struct list_head r_osd_item;
49 struct ceph_osd *r_osd;
50 struct ceph_pg r_pgid;
51
52 struct ceph_connection *r_con_filling_msg;
53
54 struct ceph_msg *r_request, *r_reply;
55 int r_result;
56 int r_flags; /* any additional flags for the osd */
57 u32 r_sent; /* >0 if r_request is sending/sent */
58 int r_got_reply;
59
60 struct ceph_osd_client *r_osdc;
61 struct kref r_kref;
62 bool r_mempool;
63 struct completion r_completion, r_safe_completion;
64 ceph_osdc_callback_t r_callback, r_safe_callback;
65 struct ceph_eversion r_reassert_version;
66 struct list_head r_unsafe_item;
67
68 struct inode *r_inode; /* for use by callbacks */
69 struct writeback_control *r_wbc; /* ditto */
70
71 char r_oid[40]; /* object name */
72 int r_oid_len;
73 unsigned long r_sent_stamp;
74 bool r_resend; /* msg send failed, needs retry */
75
76 struct ceph_file_layout r_file_layout;
77 struct ceph_snap_context *r_snapc; /* snap context for writes */
78 unsigned r_num_pages; /* size of page array (follows) */
79 struct page **r_pages; /* pages for data payload */
80 int r_pages_from_pool;
81 int r_own_pages; /* if true, i own page list */
82};
83
84struct ceph_osd_client {
85 struct ceph_client *client;
86
87 struct ceph_osdmap *osdmap; /* current map */
88 struct rw_semaphore map_sem;
89 struct completion map_waiters;
90 u64 last_requested_map;
91
92 struct mutex request_mutex;
93 struct rb_root osds; /* osds */
94 struct list_head osd_lru; /* idle osds */
95 u64 timeout_tid; /* tid of timeout triggering rq */
96 u64 last_tid; /* tid of last request */
97 struct rb_root requests; /* pending requests */
98 struct list_head req_lru; /* pending requests lru */
99 int num_requests;
100 struct delayed_work timeout_work;
101 struct delayed_work osds_timeout_work;
102#ifdef CONFIG_DEBUG_FS
103 struct dentry *debugfs_file;
104#endif
105
106 mempool_t *req_mempool;
107
108 struct ceph_msgpool msgpool_op;
109 struct ceph_msgpool msgpool_op_reply;
110};
111
112extern int ceph_osdc_init(struct ceph_osd_client *osdc,
113 struct ceph_client *client);
114extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
115
116extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
117 struct ceph_msg *msg);
118extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
119 struct ceph_msg *msg);
120
121extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
122 struct ceph_file_layout *layout,
123 struct ceph_vino vino,
124 u64 offset, u64 *len, int op, int flags,
125 struct ceph_snap_context *snapc,
126 int do_sync, u32 truncate_seq,
127 u64 truncate_size,
128 struct timespec *mtime,
129 bool use_mempool, int num_reply);
130
131static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
132{
133 kref_get(&req->r_kref);
134}
135extern void ceph_osdc_release_request(struct kref *kref);
136static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
137{
138 kref_put(&req->r_kref, ceph_osdc_release_request);
139}
140
141extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
142 struct ceph_osd_request *req,
143 bool nofail);
144extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
145 struct ceph_osd_request *req);
146extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
147
148extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
149 struct ceph_vino vino,
150 struct ceph_file_layout *layout,
151 u64 off, u64 *plen,
152 u32 truncate_seq, u64 truncate_size,
153 struct page **pages, int nr_pages);
154
155extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
156 struct ceph_vino vino,
157 struct ceph_file_layout *layout,
158 struct ceph_snap_context *sc,
159 u64 off, u64 len,
160 u32 truncate_seq, u64 truncate_size,
161 struct timespec *mtime,
162 struct page **pages, int nr_pages,
163 int flags, int do_sync, bool nofail);
164
165#endif
166
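
Taken together, the declarations above imply the following request lifecycle for a synchronous write. This is a sketch distilled from ceph_osdc_writepages() in osd_client.c; argument values are illustrative:

	struct ceph_osd_request *req;
	int ret;

	req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
				    CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
				    snapc, 0, truncate_seq, truncate_size,
				    mtime, false, 1);
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->r_pages = pages;			/* data payload */
	req->r_num_pages = calc_pages_for(off, len);

	ret = ceph_osdc_start_request(osdc, req, false);
	if (!ret)
		ret = ceph_osdc_wait_request(osdc, req);	/* -> r_result */
	ceph_osdc_put_request(req);		/* drop the caller's ref */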
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 000000000000..b83f2692b835
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1019 @@
1
2#include <asm/div64.h>
3
4#include "super.h"
5#include "osdmap.h"
6#include "crush/hash.h"
7#include "crush/mapper.h"
8#include "decode.h"
9#include "ceph_debug.h"
10
11char *ceph_osdmap_state_str(char *str, int len, int state)
12{
13 int flag = 0;
14
15 if (!len)
16 goto done;
17
18 *str = '\0';
19 if (state) {
20 if (state & CEPH_OSD_EXISTS) {
21 snprintf(str, len, "exists");
22 flag = 1;
23 }
24 if (state & CEPH_OSD_UP) {
25 snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
26 "up");
27 flag = 1;
28 }
29 } else {
30 snprintf(str, len, "doesn't exist");
31 }
32done:
33 return str;
34}
35
36/* maps */
37
38static int calc_bits_of(unsigned t)
39{
40 int b = 0;
41 while (t) {
42 t = t >> 1;
43 b++;
44 }
45 return b;
46}
47
48/*
 49 * the foo_mask is the smallest value 2^n-1 that is >= foo-1.
50 */
51static void calc_pg_masks(struct ceph_pg_pool_info *pi)
52{
53 pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
54 pi->pgp_num_mask =
55 (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
56 pi->lpg_num_mask =
57 (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
58 pi->lpgp_num_mask =
59 (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
60}
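
A worked example of the mask rule above: for pg_num = 12, calc_bits_of(12 - 1) = calc_bits_of(11) = 4 (binary 1011), so pg_num_mask = (1 << 4) - 1 = 15, the smallest 2^n-1 value >= 11; for pg_num = 8, calc_bits_of(7) = 3 and the mask is 7. ceph_stable_mod() later folds a placement seed into the pool's pg range using this mask (see calc_pg_raw() below).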
61
62/*
63 * decode crush map
64 */
65static int crush_decode_uniform_bucket(void **p, void *end,
66 struct crush_bucket_uniform *b)
67{
68 dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
69 ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
70 b->item_weight = ceph_decode_32(p);
71 return 0;
72bad:
73 return -EINVAL;
74}
75
76static int crush_decode_list_bucket(void **p, void *end,
77 struct crush_bucket_list *b)
78{
79 int j;
80 dout("crush_decode_list_bucket %p to %p\n", *p, end);
81 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
82 if (b->item_weights == NULL)
83 return -ENOMEM;
84 b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
85 if (b->sum_weights == NULL)
86 return -ENOMEM;
87 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
88 for (j = 0; j < b->h.size; j++) {
89 b->item_weights[j] = ceph_decode_32(p);
90 b->sum_weights[j] = ceph_decode_32(p);
91 }
92 return 0;
93bad:
94 return -EINVAL;
95}
96
97static int crush_decode_tree_bucket(void **p, void *end,
98 struct crush_bucket_tree *b)
99{
100 int j;
101 dout("crush_decode_tree_bucket %p to %p\n", *p, end);
102 ceph_decode_32_safe(p, end, b->num_nodes, bad);
103 b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
104 if (b->node_weights == NULL)
105 return -ENOMEM;
106 ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
107 for (j = 0; j < b->num_nodes; j++)
108 b->node_weights[j] = ceph_decode_32(p);
109 return 0;
110bad:
111 return -EINVAL;
112}
113
114static int crush_decode_straw_bucket(void **p, void *end,
115 struct crush_bucket_straw *b)
116{
117 int j;
118 dout("crush_decode_straw_bucket %p to %p\n", *p, end);
119 b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
120 if (b->item_weights == NULL)
121 return -ENOMEM;
122 b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
123 if (b->straws == NULL)
124 return -ENOMEM;
125 ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
126 for (j = 0; j < b->h.size; j++) {
127 b->item_weights[j] = ceph_decode_32(p);
128 b->straws[j] = ceph_decode_32(p);
129 }
130 return 0;
131bad:
132 return -EINVAL;
133}
134
135static struct crush_map *crush_decode(void *pbyval, void *end)
136{
137 struct crush_map *c;
138 int err = -EINVAL;
139 int i, j;
140 void **p = &pbyval;
141 void *start = pbyval;
142 u32 magic;
143
144 dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
145
146 c = kzalloc(sizeof(*c), GFP_NOFS);
147 if (c == NULL)
148 return ERR_PTR(-ENOMEM);
149
150 ceph_decode_need(p, end, 4*sizeof(u32), bad);
151 magic = ceph_decode_32(p);
152 if (magic != CRUSH_MAGIC) {
153 pr_err("crush_decode magic %x != current %x\n",
154 (unsigned)magic, (unsigned)CRUSH_MAGIC);
155 goto bad;
156 }
157 c->max_buckets = ceph_decode_32(p);
158 c->max_rules = ceph_decode_32(p);
159 c->max_devices = ceph_decode_32(p);
160
161 c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
162 if (c->device_parents == NULL)
163 goto badmem;
164 c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
165 if (c->bucket_parents == NULL)
166 goto badmem;
167
168 c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
169 if (c->buckets == NULL)
170 goto badmem;
171 c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
172 if (c->rules == NULL)
173 goto badmem;
174
175 /* buckets */
176 for (i = 0; i < c->max_buckets; i++) {
177 int size = 0;
178 u32 alg;
179 struct crush_bucket *b;
180
181 ceph_decode_32_safe(p, end, alg, bad);
182 if (alg == 0) {
183 c->buckets[i] = NULL;
184 continue;
185 }
186 dout("crush_decode bucket %d off %x %p to %p\n",
187 i, (int)(*p-start), *p, end);
188
189 switch (alg) {
190 case CRUSH_BUCKET_UNIFORM:
191 size = sizeof(struct crush_bucket_uniform);
192 break;
193 case CRUSH_BUCKET_LIST:
194 size = sizeof(struct crush_bucket_list);
195 break;
196 case CRUSH_BUCKET_TREE:
197 size = sizeof(struct crush_bucket_tree);
198 break;
199 case CRUSH_BUCKET_STRAW:
200 size = sizeof(struct crush_bucket_straw);
201 break;
202 default:
203 err = -EINVAL;
204 goto bad;
205 }
206 BUG_ON(size == 0);
207 b = c->buckets[i] = kzalloc(size, GFP_NOFS);
208 if (b == NULL)
209 goto badmem;
210
211 ceph_decode_need(p, end, 4*sizeof(u32), bad);
212 b->id = ceph_decode_32(p);
213 b->type = ceph_decode_16(p);
214 b->alg = ceph_decode_8(p);
215 b->hash = ceph_decode_8(p);
216 b->weight = ceph_decode_32(p);
217 b->size = ceph_decode_32(p);
218
219 dout("crush_decode bucket size %d off %x %p to %p\n",
220 b->size, (int)(*p-start), *p, end);
221
222 b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
223 if (b->items == NULL)
224 goto badmem;
225 b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
226 if (b->perm == NULL)
227 goto badmem;
228 b->perm_n = 0;
229
230 ceph_decode_need(p, end, b->size*sizeof(u32), bad);
231 for (j = 0; j < b->size; j++)
232 b->items[j] = ceph_decode_32(p);
233
234 switch (b->alg) {
235 case CRUSH_BUCKET_UNIFORM:
236 err = crush_decode_uniform_bucket(p, end,
237 (struct crush_bucket_uniform *)b);
238 if (err < 0)
239 goto bad;
240 break;
241 case CRUSH_BUCKET_LIST:
242 err = crush_decode_list_bucket(p, end,
243 (struct crush_bucket_list *)b);
244 if (err < 0)
245 goto bad;
246 break;
247 case CRUSH_BUCKET_TREE:
248 err = crush_decode_tree_bucket(p, end,
249 (struct crush_bucket_tree *)b);
250 if (err < 0)
251 goto bad;
252 break;
253 case CRUSH_BUCKET_STRAW:
254 err = crush_decode_straw_bucket(p, end,
255 (struct crush_bucket_straw *)b);
256 if (err < 0)
257 goto bad;
258 break;
259 }
260 }
261
262 /* rules */
263 dout("rule vec is %p\n", c->rules);
264 for (i = 0; i < c->max_rules; i++) {
265 u32 yes;
266 struct crush_rule *r;
267
268 ceph_decode_32_safe(p, end, yes, bad);
269 if (!yes) {
270 dout("crush_decode NO rule %d off %x %p to %p\n",
271 i, (int)(*p-start), *p, end);
272 c->rules[i] = NULL;
273 continue;
274 }
275
276 dout("crush_decode rule %d off %x %p to %p\n",
277 i, (int)(*p-start), *p, end);
278
279 /* len */
280 ceph_decode_32_safe(p, end, yes, bad);
281#if BITS_PER_LONG == 32
282 err = -EINVAL;
283 if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
284 goto bad;
285#endif
286 r = c->rules[i] = kmalloc(sizeof(*r) +
287 yes*sizeof(struct crush_rule_step),
288 GFP_NOFS);
289 if (r == NULL)
290 goto badmem;
291 dout(" rule %d is at %p\n", i, r);
292 r->len = yes;
293 ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
294 ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
295 for (j = 0; j < r->len; j++) {
296 r->steps[j].op = ceph_decode_32(p);
297 r->steps[j].arg1 = ceph_decode_32(p);
298 r->steps[j].arg2 = ceph_decode_32(p);
299 }
300 }
301
302 /* ignore trailing name maps. */
303
304 dout("crush_decode success\n");
305 return c;
306
307badmem:
308 err = -ENOMEM;
309bad:
310 dout("crush_decode fail %d\n", err);
311 crush_destroy(c);
312 return ERR_PTR(err);
313}
314
315
316/*
317 * osd map
318 */
319void ceph_osdmap_destroy(struct ceph_osdmap *map)
320{
321 dout("osdmap_destroy %p\n", map);
322 if (map->crush)
323 crush_destroy(map->crush);
324 while (!RB_EMPTY_ROOT(&map->pg_temp)) {
325 struct ceph_pg_mapping *pg =
326 rb_entry(rb_first(&map->pg_temp),
327 struct ceph_pg_mapping, node);
328 rb_erase(&pg->node, &map->pg_temp);
329 kfree(pg);
330 }
331 while (!RB_EMPTY_ROOT(&map->pg_pools)) {
332 struct ceph_pg_pool_info *pi =
333 rb_entry(rb_first(&map->pg_pools),
334 struct ceph_pg_pool_info, node);
335 rb_erase(&pi->node, &map->pg_pools);
336 kfree(pi);
337 }
338 kfree(map->osd_state);
339 kfree(map->osd_weight);
340 kfree(map->osd_addr);
341 kfree(map);
342}
343
344/*
345 * adjust max osd value. reallocate arrays.
346 */
347static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
348{
349 u8 *state;
350 struct ceph_entity_addr *addr;
351 u32 *weight;
352
353 state = kcalloc(max, sizeof(*state), GFP_NOFS);
354 addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
355 weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
356 if (state == NULL || addr == NULL || weight == NULL) {
357 kfree(state);
358 kfree(addr);
359 kfree(weight);
360 return -ENOMEM;
361 }
362
363 /* copy old? */
364 if (map->osd_state) {
365 memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
366 memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
367 memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
368 kfree(map->osd_state);
369 kfree(map->osd_addr);
370 kfree(map->osd_weight);
371 }
372
373 map->osd_state = state;
374 map->osd_weight = weight;
375 map->osd_addr = addr;
376 map->max_osd = max;
377 return 0;
378}
379
380/*
381 * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
382 * to a set of osds)
383 */
384static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
385{
386 u64 a = *(u64 *)&l;
387 u64 b = *(u64 *)&r;
388
389 if (a < b)
390 return -1;
391 if (a > b)
392 return 1;
393 return 0;
394}
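
Note that pgid_cmp() type-puns struct ceph_pg into a u64, which is safe only while that struct packs into exactly eight bytes. A compile-time guard along these lines (not present in the original) would pin the assumption down:

	BUILD_BUG_ON(sizeof(struct ceph_pg) != sizeof(u64));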
395
396static int __insert_pg_mapping(struct ceph_pg_mapping *new,
397 struct rb_root *root)
398{
399 struct rb_node **p = &root->rb_node;
400 struct rb_node *parent = NULL;
401 struct ceph_pg_mapping *pg = NULL;
402 int c;
403
404 while (*p) {
405 parent = *p;
406 pg = rb_entry(parent, struct ceph_pg_mapping, node);
407 c = pgid_cmp(new->pgid, pg->pgid);
408 if (c < 0)
409 p = &(*p)->rb_left;
410 else if (c > 0)
411 p = &(*p)->rb_right;
412 else
413 return -EEXIST;
414 }
415
416 rb_link_node(&new->node, parent, p);
417 rb_insert_color(&new->node, root);
418 return 0;
419}
420
421static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
422 struct ceph_pg pgid)
423{
424 struct rb_node *n = root->rb_node;
425 struct ceph_pg_mapping *pg;
426 int c;
427
428 while (n) {
429 pg = rb_entry(n, struct ceph_pg_mapping, node);
430 c = pgid_cmp(pgid, pg->pgid);
431 if (c < 0)
432 n = n->rb_left;
433 else if (c > 0)
434 n = n->rb_right;
435 else
436 return pg;
437 }
438 return NULL;
439}
440
441/*
442 * rbtree of pg pool info
443 */
444static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
445{
446 struct rb_node **p = &root->rb_node;
447 struct rb_node *parent = NULL;
448 struct ceph_pg_pool_info *pi = NULL;
449
450 while (*p) {
451 parent = *p;
452 pi = rb_entry(parent, struct ceph_pg_pool_info, node);
453 if (new->id < pi->id)
454 p = &(*p)->rb_left;
455 else if (new->id > pi->id)
456 p = &(*p)->rb_right;
457 else
458 return -EEXIST;
459 }
460
461 rb_link_node(&new->node, parent, p);
462 rb_insert_color(&new->node, root);
463 return 0;
464}
465
466static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
467{
468 struct ceph_pg_pool_info *pi;
469 struct rb_node *n = root->rb_node;
470
471 while (n) {
472 pi = rb_entry(n, struct ceph_pg_pool_info, node);
473 if (id < pi->id)
474 n = n->rb_left;
475 else if (id > pi->id)
476 n = n->rb_right;
477 else
478 return pi;
479 }
480 return NULL;
481}
482
483/*
484 * decode a full map.
485 */
486struct ceph_osdmap *osdmap_decode(void **p, void *end)
487{
488 struct ceph_osdmap *map;
489 u16 version;
490 u32 len, max, i;
491 u8 ev;
492 int err = -EINVAL;
493 void *start = *p;
494 struct ceph_pg_pool_info *pi;
495
496 dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
497
498 map = kzalloc(sizeof(*map), GFP_NOFS);
499 if (map == NULL)
500 return ERR_PTR(-ENOMEM);
501 map->pg_temp = RB_ROOT;
502
503 ceph_decode_16_safe(p, end, version, bad);
504 if (version > CEPH_OSDMAP_VERSION) {
505 pr_warning("got unknown v %d > %d of osdmap\n", version,
506 CEPH_OSDMAP_VERSION);
507 goto bad;
508 }
509
510 ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
511 ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
512 map->epoch = ceph_decode_32(p);
513 ceph_decode_copy(p, &map->created, sizeof(map->created));
514 ceph_decode_copy(p, &map->modified, sizeof(map->modified));
515
516 ceph_decode_32_safe(p, end, max, bad);
517 while (max--) {
518 ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
 519		pi = kmalloc(sizeof(*pi), GFP_NOFS);
 520		if (!pi) {
 521			err = -ENOMEM;
 522			goto bad;
 523		}
522 pi->id = ceph_decode_32(p);
523 ev = ceph_decode_8(p); /* encoding version */
524 if (ev > CEPH_PG_POOL_VERSION) {
525 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
526 ev, CEPH_PG_POOL_VERSION);
527 goto bad;
528 }
529 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
530 __insert_pg_pool(&map->pg_pools, pi);
531 calc_pg_masks(pi);
532 *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
533 *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
534 * sizeof(u64) * 2;
535 }
536 ceph_decode_32_safe(p, end, map->pool_max, bad);
537
538 ceph_decode_32_safe(p, end, map->flags, bad);
539
540 max = ceph_decode_32(p);
541
542 /* (re)alloc osd arrays */
543 err = osdmap_set_max_osd(map, max);
544 if (err < 0)
545 goto bad;
546 dout("osdmap_decode max_osd = %d\n", map->max_osd);
547
548 /* osds */
549 err = -EINVAL;
550 ceph_decode_need(p, end, 3*sizeof(u32) +
551 map->max_osd*(1 + sizeof(*map->osd_weight) +
552 sizeof(*map->osd_addr)), bad);
553 *p += 4; /* skip length field (should match max) */
554 ceph_decode_copy(p, map->osd_state, map->max_osd);
555
556 *p += 4; /* skip length field (should match max) */
557 for (i = 0; i < map->max_osd; i++)
558 map->osd_weight[i] = ceph_decode_32(p);
559
560 *p += 4; /* skip length field (should match max) */
561 ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
562 for (i = 0; i < map->max_osd; i++)
563 ceph_decode_addr(&map->osd_addr[i]);
564
565 /* pg_temp */
566 ceph_decode_32_safe(p, end, len, bad);
567 for (i = 0; i < len; i++) {
568 int n, j;
569 struct ceph_pg pgid;
570 struct ceph_pg_mapping *pg;
571
572 ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
573 ceph_decode_copy(p, &pgid, sizeof(pgid));
574 n = ceph_decode_32(p);
575 ceph_decode_need(p, end, n * sizeof(u32), bad);
576 err = -ENOMEM;
577 pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
578 if (!pg)
579 goto bad;
580 pg->pgid = pgid;
581 pg->len = n;
582 for (j = 0; j < n; j++)
583 pg->osds[j] = ceph_decode_32(p);
584
585 err = __insert_pg_mapping(pg, &map->pg_temp);
586 if (err)
587 goto bad;
 588		dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, n);
589 }
590
591 /* crush */
592 ceph_decode_32_safe(p, end, len, bad);
593 dout("osdmap_decode crush len %d from off 0x%x\n", len,
594 (int)(*p - start));
595 ceph_decode_need(p, end, len, bad);
596 map->crush = crush_decode(*p, end);
597 *p += len;
598 if (IS_ERR(map->crush)) {
599 err = PTR_ERR(map->crush);
600 map->crush = NULL;
601 goto bad;
602 }
603
604 /* ignore the rest of the map */
605 *p = end;
606
607 dout("osdmap_decode done %p %p\n", *p, end);
608 return map;
609
610bad:
611 dout("osdmap_decode fail\n");
612 ceph_osdmap_destroy(map);
613 return ERR_PTR(err);
614}
615
616/*
617 * decode and apply an incremental map update.
618 */
619struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
620 struct ceph_osdmap *map,
621 struct ceph_messenger *msgr)
622{
623 struct crush_map *newcrush = NULL;
624 struct ceph_fsid fsid;
625 u32 epoch = 0;
626 struct ceph_timespec modified;
627 u32 len, pool;
628 __s32 new_pool_max, new_flags, max;
629 void *start = *p;
630 int err = -EINVAL;
631 u16 version;
632 struct rb_node *rbp;
633
634 ceph_decode_16_safe(p, end, version, bad);
635 if (version > CEPH_OSDMAP_INC_VERSION) {
636 pr_warning("got unknown v %d > %d of inc osdmap\n", version,
637 CEPH_OSDMAP_INC_VERSION);
638 goto bad;
639 }
640
641 ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
642 bad);
643 ceph_decode_copy(p, &fsid, sizeof(fsid));
644 epoch = ceph_decode_32(p);
645 BUG_ON(epoch != map->epoch+1);
646 ceph_decode_copy(p, &modified, sizeof(modified));
647 new_pool_max = ceph_decode_32(p);
648 new_flags = ceph_decode_32(p);
649
650 /* full map? */
651 ceph_decode_32_safe(p, end, len, bad);
652 if (len > 0) {
653 dout("apply_incremental full map len %d, %p to %p\n",
654 len, *p, end);
655 return osdmap_decode(p, min(*p+len, end));
656 }
657
658 /* new crush? */
659 ceph_decode_32_safe(p, end, len, bad);
660 if (len > 0) {
661 dout("apply_incremental new crush map len %d, %p to %p\n",
662 len, *p, end);
663 newcrush = crush_decode(*p, min(*p+len, end));
664 if (IS_ERR(newcrush))
 665			return ERR_CAST(newcrush);
666 }
667
668 /* new flags? */
669 if (new_flags >= 0)
670 map->flags = new_flags;
671 if (new_pool_max >= 0)
672 map->pool_max = new_pool_max;
673
674 ceph_decode_need(p, end, 5*sizeof(u32), bad);
675
676 /* new max? */
677 max = ceph_decode_32(p);
678 if (max >= 0) {
679 err = osdmap_set_max_osd(map, max);
680 if (err < 0)
681 goto bad;
682 }
683
684 map->epoch++;
685 map->modified = map->modified;
686 if (newcrush) {
687 if (map->crush)
688 crush_destroy(map->crush);
689 map->crush = newcrush;
690 newcrush = NULL;
691 }
692
693 /* new_pool */
694 ceph_decode_32_safe(p, end, len, bad);
695 while (len--) {
696 __u8 ev;
697 struct ceph_pg_pool_info *pi;
698
699 ceph_decode_32_safe(p, end, pool, bad);
700 ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
701 ev = ceph_decode_8(p); /* encoding version */
702 if (ev > CEPH_PG_POOL_VERSION) {
703 pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
704 ev, CEPH_PG_POOL_VERSION);
705 goto bad;
706 }
707 pi = __lookup_pg_pool(&map->pg_pools, pool);
708 if (!pi) {
709 pi = kmalloc(sizeof(*pi), GFP_NOFS);
710 if (!pi) {
711 err = -ENOMEM;
712 goto bad;
713 }
714 pi->id = pool;
715 __insert_pg_pool(&map->pg_pools, pi);
716 }
717 ceph_decode_copy(p, &pi->v, sizeof(pi->v));
718 calc_pg_masks(pi);
719 }
720
721 /* old_pool */
722 ceph_decode_32_safe(p, end, len, bad);
723 while (len--) {
724 struct ceph_pg_pool_info *pi;
725
726 ceph_decode_32_safe(p, end, pool, bad);
727 pi = __lookup_pg_pool(&map->pg_pools, pool);
728 if (pi) {
729 rb_erase(&pi->node, &map->pg_pools);
730 kfree(pi);
731 }
732 }
733
734 /* new_up */
735 err = -EINVAL;
736 ceph_decode_32_safe(p, end, len, bad);
737 while (len--) {
738 u32 osd;
739 struct ceph_entity_addr addr;
740 ceph_decode_32_safe(p, end, osd, bad);
741 ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
742 ceph_decode_addr(&addr);
743 pr_info("osd%d up\n", osd);
744 BUG_ON(osd >= map->max_osd);
745 map->osd_state[osd] |= CEPH_OSD_UP;
746 map->osd_addr[osd] = addr;
747 }
748
749 /* new_down */
750 ceph_decode_32_safe(p, end, len, bad);
751 while (len--) {
752 u32 osd;
753 ceph_decode_32_safe(p, end, osd, bad);
754 (*p)++; /* clean flag */
755 pr_info("osd%d down\n", osd);
756 if (osd < map->max_osd)
757 map->osd_state[osd] &= ~CEPH_OSD_UP;
758 }
759
760 /* new_weight */
761 ceph_decode_32_safe(p, end, len, bad);
762 while (len--) {
763 u32 osd, off;
764 ceph_decode_need(p, end, sizeof(u32)*2, bad);
765 osd = ceph_decode_32(p);
766 off = ceph_decode_32(p);
767 pr_info("osd%d weight 0x%x %s\n", osd, off,
768 off == CEPH_OSD_IN ? "(in)" :
769 (off == CEPH_OSD_OUT ? "(out)" : ""));
770 if (osd < map->max_osd)
771 map->osd_weight[osd] = off;
772 }
773
774 /* new_pg_temp */
775 rbp = rb_first(&map->pg_temp);
776 ceph_decode_32_safe(p, end, len, bad);
777 while (len--) {
778 struct ceph_pg_mapping *pg;
779 int j;
780 struct ceph_pg pgid;
781 u32 pglen;
782 ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
783 ceph_decode_copy(p, &pgid, sizeof(pgid));
784 pglen = ceph_decode_32(p);
785
786 /* remove any? */
787 while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
788 node)->pgid, pgid) <= 0) {
789 struct rb_node *cur = rbp;
790 rbp = rb_next(rbp);
791 dout(" removed pg_temp %llx\n",
792 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
793 node)->pgid);
794 rb_erase(cur, &map->pg_temp);
795 }
796
797 if (pglen) {
798 /* insert */
799 ceph_decode_need(p, end, pglen*sizeof(u32), bad);
800 pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
801 if (!pg) {
802 err = -ENOMEM;
803 goto bad;
804 }
805 pg->pgid = pgid;
806 pg->len = pglen;
807 for (j = 0; j < pglen; j++)
808 pg->osds[j] = ceph_decode_32(p);
809 err = __insert_pg_mapping(pg, &map->pg_temp);
810 if (err)
811 goto bad;
812 dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
813 pglen);
814 }
815 }
816 while (rbp) {
817 struct rb_node *cur = rbp;
818 rbp = rb_next(rbp);
819 dout(" removed pg_temp %llx\n",
820 *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
821 node)->pgid);
822 rb_erase(cur, &map->pg_temp);
823 }
824
825 /* ignore the rest */
826 *p = end;
827 return map;
828
829bad:
830 pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
831 epoch, (int)(*p - start), *p, start, end);
832 print_hex_dump(KERN_DEBUG, "osdmap: ",
833 DUMP_PREFIX_OFFSET, 16, 1,
834 start, end - start, true);
835 if (newcrush)
836 crush_destroy(newcrush);
837 return ERR_PTR(err);
838}
839
840
841
842
843/*
844 * calculate file layout from given offset, length.
845 * fill in correct oid, logical length, and object extent
846 * offset, length.
847 *
848 * for now, we write only a single su, until we can
849 * pass a stride back to the caller.
850 */
851void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
852 u64 off, u64 *plen,
853 u64 *ono,
854 u64 *oxoff, u64 *oxlen)
855{
856 u32 osize = le32_to_cpu(layout->fl_object_size);
857 u32 su = le32_to_cpu(layout->fl_stripe_unit);
858 u32 sc = le32_to_cpu(layout->fl_stripe_count);
859 u32 bl, stripeno, stripepos, objsetno;
860 u32 su_per_object;
861 u64 t, su_offset;
862
863 dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
864 osize, su);
865 su_per_object = osize / su;
866 dout("osize %u / su %u = su_per_object %u\n", osize, su,
867 su_per_object);
868
869 BUG_ON((su & ~PAGE_MASK) != 0);
870 /* bl = *off / su; */
871 t = off;
872 do_div(t, su);
873 bl = t;
874 dout("off %llu / su %u = bl %u\n", off, su, bl);
875
876 stripeno = bl / sc;
877 stripepos = bl % sc;
878 objsetno = stripeno / su_per_object;
879
880 *ono = objsetno * sc + stripepos;
881 dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
882
883 /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
884 t = off;
885 su_offset = do_div(t, su);
886 *oxoff = su_offset + (stripeno % su_per_object) * su;
887
888 /*
889 * Calculate the length of the extent being written to the selected
890 * object. This is the minimum of the full length requested (plen) or
891 * the remainder of the current stripe being written to.
892 */
893 *oxlen = min_t(u64, *plen, su - su_offset);
894 *plen = *oxlen;
895
896 dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
897}
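
A worked example of the mapping above, with su = 65536 (64 KB), sc = 3 stripes, object_size = 262144 (so su_per_object = 4), off = 1000000, and *plen = 100000:

	bl        = 1000000 / 65536 = 15	(su_offset = 16960)
	stripeno  = 15 / 3 = 5
	stripepos = 15 % 3 = 0
	objsetno  = 5 / 4 = 1
	*ono      = 1 * 3 + 0 = 3
	*oxoff    = 16960 + (5 % 4) * 65536 = 82496
	*oxlen    = min(100000, 65536 - 16960) = 48576

so the I/O lands in object 3 at offset 82496 and is clipped to the remainder of the current stripe unit, as the comment above describes.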
898
899/*
900 * calculate an object layout (i.e. pgid) from an oid,
901 * file_layout, and osdmap
902 */
903int ceph_calc_object_layout(struct ceph_object_layout *ol,
904 const char *oid,
905 struct ceph_file_layout *fl,
906 struct ceph_osdmap *osdmap)
907{
908 unsigned num, num_mask;
909 struct ceph_pg pgid;
910 s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
911 int poolid = le32_to_cpu(fl->fl_pg_pool);
912 struct ceph_pg_pool_info *pool;
913 unsigned ps;
914
915 BUG_ON(!osdmap);
916
917 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
918 if (!pool)
919 return -EIO;
920 ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
921 if (preferred >= 0) {
922 ps += preferred;
923 num = le32_to_cpu(pool->v.lpg_num);
924 num_mask = pool->lpg_num_mask;
925 } else {
926 num = le32_to_cpu(pool->v.pg_num);
927 num_mask = pool->pg_num_mask;
928 }
929
930 pgid.ps = cpu_to_le16(ps);
931 pgid.preferred = cpu_to_le16(preferred);
932 pgid.pool = fl->fl_pg_pool;
933 if (preferred >= 0)
934 dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
935 (int)preferred);
936 else
937 dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
938
939 ol->ol_pgid = pgid;
940 ol->ol_stripe_unit = fl->fl_object_stripe_unit;
941 return 0;
942}
943
944/*
945 * Calculate raw osd vector for the given pgid. Return pointer to osd
946 * array, or NULL on failure.
947 */
948static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
949 int *osds, int *num)
950{
951 struct ceph_pg_mapping *pg;
952 struct ceph_pg_pool_info *pool;
953 int ruleno;
954 unsigned poolid, ps, pps;
955 int preferred;
956
957 /* pg_temp? */
958 pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
959 if (pg) {
960 *num = pg->len;
961 return pg->osds;
962 }
963
964 /* crush */
965 poolid = le32_to_cpu(pgid.pool);
966 ps = le16_to_cpu(pgid.ps);
967 preferred = (s16)le16_to_cpu(pgid.preferred);
968
969 /* don't forcefeed bad device ids to crush */
970 if (preferred >= osdmap->max_osd ||
971 preferred >= osdmap->crush->max_devices)
972 preferred = -1;
973
974 pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
975 if (!pool)
976 return NULL;
977 ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
978 pool->v.type, pool->v.size);
979 if (ruleno < 0) {
980 pr_err("no crush rule pool %d type %d size %d\n",
981 poolid, pool->v.type, pool->v.size);
982 return NULL;
983 }
984
985 if (preferred >= 0)
986 pps = ceph_stable_mod(ps,
987 le32_to_cpu(pool->v.lpgp_num),
988 pool->lpgp_num_mask);
989 else
990 pps = ceph_stable_mod(ps,
991 le32_to_cpu(pool->v.pgp_num),
992 pool->pgp_num_mask);
993 pps += poolid;
994 *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
995 min_t(int, pool->v.size, *num),
996 preferred, osdmap->osd_weight);
997 return osds;
998}
999
1000/*
1001 * Return primary osd for given pgid, or -1 if none.
1002 */
1003int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
1004{
1005 int rawosds[10], *osds;
1006 int i, num = ARRAY_SIZE(rawosds);
1007
1008 osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
1009 if (!osds)
1010 return -1;
1011
1012 /* primary is first up osd */
1013 for (i = 0; i < num; i++)
1014	if (ceph_osd_is_up(osdmap, osds[i]))
1015	return osds[i];
1018 return -1;
1019}
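Taken together, the helpers in this file resolve a file extent down to the OSD that serves it. A minimal sketch of that flow, assuming an already-decoded osdmap and layout; the object-name format used here is made up for illustration (the real client derives object names differently):

/* hypothetical caller, for illustration only */
static int example_map_extent(struct ceph_osdmap *osdmap,
			      struct ceph_file_layout *layout,
			      u64 off, u64 len)
{
	struct ceph_object_layout ol;
	char oid[40];
	u64 bno, oxoff, oxlen;
	int err;

	/* 1. file extent -> object number + offset/length within it */
	ceph_calc_file_object_mapping(layout, off, &len, &bno,
				      &oxoff, &oxlen);

	/* 2. object name -> placement group (pgid) */
	snprintf(oid, sizeof(oid), "obj_%llu", (unsigned long long)bno);
	err = ceph_calc_object_layout(&ol, oid, layout, osdmap);
	if (err < 0)
		return err;

	/* 3. placement group -> first "up" osd in the CRUSH result */
	return ceph_calc_pg_primary(osdmap, ol.ol_pgid);
}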
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 000000000000..1fb55afb2642
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
1#ifndef _FS_CEPH_OSDMAP_H
2#define _FS_CEPH_OSDMAP_H
3
4#include <linux/rbtree.h>
5#include "types.h"
6#include "ceph_fs.h"
7#include "crush/crush.h"
8
9/*
10 * The osd map describes the current membership of the osd cluster and
11 * specifies the mapping of objects to placement groups and placement
12 * groups to (sets of) osds. That is, it completely specifies the
13 * (desired) distribution of all data objects in the system at some
14 * point in time.
15 *
16 * Each map version is identified by an epoch, which increases monotonically.
17 *
18 * The map can be updated either via an incremental map (diff) describing
19 * the change between two successive epochs, or as a fully encoded map.
20 */
21struct ceph_pg_pool_info {
22 struct rb_node node;
23 int id;
24 struct ceph_pg_pool v;
25 int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
26};
27
28struct ceph_pg_mapping {
29 struct rb_node node;
30 struct ceph_pg pgid;
31 int len;
32 int osds[];
33};
34
35struct ceph_osdmap {
36 struct ceph_fsid fsid;
37 u32 epoch;
38 u32 mkfs_epoch;
39 struct ceph_timespec created, modified;
40
41 u32 flags; /* CEPH_OSDMAP_* */
42
43 u32 max_osd; /* size of osd_state, _offload, _addr arrays */
44 u8 *osd_state; /* CEPH_OSD_* */
45 u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
46 struct ceph_entity_addr *osd_addr;
47
48 struct rb_root pg_temp;
49 struct rb_root pg_pools;
50 u32 pool_max;
51
52 /* the CRUSH map specifies the mapping of placement groups to
53 * the list of osds that store+replicate them. */
54 struct crush_map *crush;
55};
56
57/*
58 * file layout helpers
59 */
60#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
61#define ceph_file_layout_stripe_count(l) \
62 ((__s32)le32_to_cpu((l).fl_stripe_count))
63#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
64#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
65#define ceph_file_layout_object_su(l) \
66 ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
67#define ceph_file_layout_pg_preferred(l) \
68 ((__s32)le32_to_cpu((l).fl_pg_preferred))
69#define ceph_file_layout_pg_pool(l) \
70 ((__s32)le32_to_cpu((l).fl_pg_pool))
71
72static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
73{
74 return le32_to_cpu(l->fl_stripe_unit) *
75 le32_to_cpu(l->fl_stripe_count);
76}
77
78/* "period" == bytes before i start on a new set of objects */
79static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
80{
81 return le32_to_cpu(l->fl_object_size) *
82 le32_to_cpu(l->fl_stripe_count);
83}
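/*
 * Worked example (illustrative values): stripe_unit = 1 MB,
 * stripe_count = 4, object_size = 4 MB.  Then the stripe width is
 * 1 MB * 4 = 4 MB of file data per stripe, and the period is
 * 4 MB * 4 = 16 MB of file data before striping wraps around to a
 * fresh set of 4 objects.
 */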
84
85
86static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
87{
88 return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
89}
90
91static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
92{
93 return map && (map->flags & flag);
94}
95
96extern char *ceph_osdmap_state_str(char *str, int len, int state);
97
98static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
99 int osd)
100{
101 if (osd >= map->max_osd)
102 return NULL;
103 return &map->osd_addr[osd];
104}
105
106extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
107extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
108 struct ceph_osdmap *map,
109 struct ceph_messenger *msgr);
110extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
111
112/* calculate mapping of a file extent to an object */
113extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
114 u64 off, u64 *plen,
115 u64 *bno, u64 *oxoff, u64 *oxlen);
116
117/* calculate mapping of object to a placement group */
118extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
119 const char *oid,
120 struct ceph_file_layout *fl,
121 struct ceph_osdmap *osdmap);
122extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
123 struct ceph_pg pgid);
124
125#endif
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 000000000000..370e93695474
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
1
2#include <linux/pagemap.h>
3#include <linux/highmem.h>
4
5#include "pagelist.h"
6
7int ceph_pagelist_release(struct ceph_pagelist *pl)
8{
9 if (pl->mapped_tail)
10 kunmap(pl->mapped_tail);
11 while (!list_empty(&pl->head)) {
12 struct page *page = list_first_entry(&pl->head, struct page,
13 lru);
14 list_del(&page->lru);
15 __free_page(page);
16 }
17 return 0;
18}
19
20static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
21{
22 struct page *page = alloc_page(GFP_NOFS);
23 if (!page)
24 return -ENOMEM;
25 pl->room += PAGE_SIZE;
26 list_add_tail(&page->lru, &pl->head);
27 if (pl->mapped_tail)
28 kunmap(pl->mapped_tail);
29 pl->mapped_tail = kmap(page);
30 return 0;
31}
32
33int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
34{
35 while (pl->room < len) {
36 size_t bit = pl->room;
37 int ret;
38
39 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
40 buf, bit);
41 pl->length += bit;
42 pl->room -= bit;
43 buf += bit;
44 len -= bit;
45 ret = ceph_pagelist_addpage(pl);
46 if (ret)
47 return ret;
48 }
49
50 memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
51 pl->length += len;
52 pl->room -= len;
53 return 0;
54}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 000000000000..e8a4187e1087
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
1#ifndef __FS_CEPH_PAGELIST_H
2#define __FS_CEPH_PAGELIST_H
3
4#include <linux/list.h>
5
6struct ceph_pagelist {
7 struct list_head head;
8 void *mapped_tail;
9 size_t length;
10 size_t room;
11};
12
13static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
14{
15 INIT_LIST_HEAD(&pl->head);
16 pl->mapped_tail = NULL;
17 pl->length = 0;
18 pl->room = 0;
19}
20extern int ceph_pagelist_release(struct ceph_pagelist *pl);
21
22extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
23
24static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
25{
26 __le64 ev = cpu_to_le64(v);
27 return ceph_pagelist_append(pl, &ev, sizeof(ev));
28}
29static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
30{
31 __le32 ev = cpu_to_le32(v);
32 return ceph_pagelist_append(pl, &ev, sizeof(ev));
33}
34static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
35{
36 __le16 ev = cpu_to_le16(v);
37 return ceph_pagelist_append(pl, &ev, sizeof(ev));
38}
39static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
40{
41 return ceph_pagelist_append(pl, &v, 1);
42}
43static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
44 char *s, size_t len)
45{
46 int ret = ceph_pagelist_encode_32(pl, len);
47 if (ret)
48 return ret;
49 if (len)
50 return ceph_pagelist_append(pl, s, len);
51 return 0;
52}
53
54#endif
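A minimal usage sketch of the pagelist API above (the payload values are arbitrary). Appends can fail with -ENOMEM when a backing page cannot be allocated; ceph_pagelist_release() frees whatever pages were accumulated either way:

/* illustrative only: build a small little-endian payload */
static int example_encode(void)
{
	struct ceph_pagelist pl;
	char name[] = "hello";
	int err;

	ceph_pagelist_init(&pl);
	err = ceph_pagelist_encode_32(&pl, 42);
	if (!err)
		err = ceph_pagelist_encode_string(&pl, name, 5);
	ceph_pagelist_release(&pl);	/* frees all pages, success or not */
	return err;
}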
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 000000000000..26ac8b89a676
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
1#ifndef __RADOS_H
2#define __RADOS_H
3
4/*
5 * Data types for the Ceph distributed object storage layer RADOS
6 * (Reliable Autonomic Distributed Object Store).
7 */
8
9#include "msgr.h"
10
11/*
12 * osdmap encoding versions
13 */
14#define CEPH_OSDMAP_INC_VERSION 4
15#define CEPH_OSDMAP_VERSION 4
16
17/*
18 * fs id
19 */
20struct ceph_fsid {
21 unsigned char fsid[16];
22};
23
24static inline int ceph_fsid_compare(const struct ceph_fsid *a,
25 const struct ceph_fsid *b)
26{
27 return memcmp(a, b, sizeof(*a));
28}
29
30/*
31 * ino, object, etc.
32 */
33typedef __le64 ceph_snapid_t;
34#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
35#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
36#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
37
38struct ceph_timespec {
39 __le32 tv_sec;
40 __le32 tv_nsec;
41} __attribute__ ((packed));
42
43
44/*
45 * object layout - how objects are mapped into PGs
46 */
47#define CEPH_OBJECT_LAYOUT_HASH 1
48#define CEPH_OBJECT_LAYOUT_LINEAR 2
49#define CEPH_OBJECT_LAYOUT_HASHINO 3
50
51/*
52 * pg layout -- how PGs are mapped onto (sets of) OSDs
53 */
54#define CEPH_PG_LAYOUT_CRUSH 0
55#define CEPH_PG_LAYOUT_HASH 1
56#define CEPH_PG_LAYOUT_LINEAR 2
57#define CEPH_PG_LAYOUT_HYBRID 3
58
59
60/*
61 * placement group.
62 * we encode this into one __le64.
63 */
64struct ceph_pg {
65 __le16 preferred; /* preferred primary osd */
66 __le16 ps; /* placement seed */
67 __le32 pool; /* object pool */
68} __attribute__ ((packed));
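/*
 * Note: the three packed fields above total 8 bytes (2 + 2 + 4),
 * which is what allows the pg to be carried as the single __le64
 * mentioned in the comment.
 */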
69
70/*
71 * pg_pool is a set of pgs storing a pool of objects
72 *
73 * pg_num -- base number of pseudorandomly placed pgs
74 *
75 * pgp_num -- effective number when calculating pg placement. this
76 * is used for pg_num increases. new pgs result in data being "split"
 77 * into new pgs. for this to proceed smoothly, new pgs are initially
78 * colocated with their parents; that is, pgp_num doesn't increase
79 * until the new pgs have successfully split. only _then_ are the new
80 * pgs placed independently.
81 *
82 * lpg_num -- localized pg count (per device). replicas are randomly
83 * selected.
84 *
85 * lpgp_num -- as above.
86 */
87#define CEPH_PG_TYPE_REP 1
88#define CEPH_PG_TYPE_RAID4 2
89#define CEPH_PG_POOL_VERSION 2
90struct ceph_pg_pool {
91 __u8 type; /* CEPH_PG_TYPE_* */
92 __u8 size; /* number of osds in each pg */
93 __u8 crush_ruleset; /* crush placement rule */
94 __u8 object_hash; /* hash mapping object name to ps */
95 __le32 pg_num, pgp_num; /* number of pg's */
96 __le32 lpg_num, lpgp_num; /* number of localized pg's */
97 __le32 last_change; /* most recent epoch changed */
98 __le64 snap_seq; /* seq for per-pool snapshot */
99 __le32 snap_epoch; /* epoch of last snap */
100 __le32 num_snaps;
101 __le32 num_removed_snap_intervals;
102 __le64 uid;
103} __attribute__ ((packed));
104
105/*
106 * stable_mod func is used to control number of placement groups.
107 * similar to straight-up modulo, but produces a stable mapping as b
108 * increases over time. b is the number of bins, and bmask is the
109 * containing power of 2 minus 1.
110 *
111 * b <= bmask and bmask=(2**n)-1
112 * e.g., b=12 -> bmask=15, b=123 -> bmask=127
113 */
114static inline int ceph_stable_mod(int x, int b, int bmask)
115{
116 if ((x & bmask) < b)
117 return x & bmask;
118 else
119 return x & (bmask >> 1);
120}
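/*
 * Worked example: b = 12 bins, so bmask = 15.
 *   x = 20: 20 & 15 = 4,  and 4 < 12,  so bin 4.
 *   x = 29: 29 & 15 = 13, but 13 >= 12, so fall back to
 *           29 & (15 >> 1) = 29 & 7 = 5, i.e. bin 5.
 * Raising b to 16 later only remaps the fallback values (13..15),
 * leaving bins 0..11 stable.
 */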
121
122/*
123 * object layout - how a given object should be stored.
124 */
125struct ceph_object_layout {
126 struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
127 __le32 ol_stripe_unit; /* for per-object parity, if any */
128} __attribute__ ((packed));
129
130/*
131 * compound epoch+version, used by storage layer to serialize mutations
132 */
133struct ceph_eversion {
134 __le32 epoch;
135 __le64 version;
136} __attribute__ ((packed));
137
138/*
139 * osd map bits
140 */
141
142/* status bits */
143#define CEPH_OSD_EXISTS 1
144#define CEPH_OSD_UP 2
145
146/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
147#define CEPH_OSD_IN 0x10000
148#define CEPH_OSD_OUT 0
149
150
151/*
152 * osd map flag bits
153 */
154#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
155#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
156#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
157#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
158#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
159
160/*
161 * osd ops
162 */
163#define CEPH_OSD_OP_MODE 0xf000
164#define CEPH_OSD_OP_MODE_RD 0x1000
165#define CEPH_OSD_OP_MODE_WR 0x2000
166#define CEPH_OSD_OP_MODE_RMW 0x3000
167#define CEPH_OSD_OP_MODE_SUB 0x4000
168
169#define CEPH_OSD_OP_TYPE 0x0f00
170#define CEPH_OSD_OP_TYPE_LOCK 0x0100
171#define CEPH_OSD_OP_TYPE_DATA 0x0200
172#define CEPH_OSD_OP_TYPE_ATTR 0x0300
173#define CEPH_OSD_OP_TYPE_EXEC 0x0400
174#define CEPH_OSD_OP_TYPE_PG 0x0500
175
176enum {
177 /** data **/
178 /* read */
179 CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
180 CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
181
182 /* fancy read */
183 CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
184
185 /* write */
186 CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
187 CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
188 CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
189 CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
190 CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
191
192 /* fancy write */
193 CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
194 CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
195 CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
196 CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
197
198 CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
199 CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
200 CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
201
202 CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
203
204 /** attrs **/
205 /* read */
206 CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
207 CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
208
209 /* write */
210 CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
211 CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
212 CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
213 CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
214
215 /** subop **/
216 CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
217 CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
218 CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
219 CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
220 CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
221
222 /** lock **/
223 CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
224 CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
225 CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
226 CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
227 CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
228 CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
229
230 /** exec **/
231 CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
232
233 /** pg **/
234 CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
235};
236
237static inline int ceph_osd_op_type_lock(int op)
238{
239 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
240}
241static inline int ceph_osd_op_type_data(int op)
242{
243 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
244}
245static inline int ceph_osd_op_type_attr(int op)
246{
247 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
248}
249static inline int ceph_osd_op_type_exec(int op)
250{
251 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
252}
253static inline int ceph_osd_op_type_pg(int op)
254{
255 return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
256}
257
258static inline int ceph_osd_op_mode_subop(int op)
259{
260 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
261}
262static inline int ceph_osd_op_mode_read(int op)
263{
264 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
265}
266static inline int ceph_osd_op_mode_modify(int op)
267{
268 return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
269}
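/*
 * Decoding example: CEPH_OSD_OP_READ is 0x1000 | 0x0200 | 1 = 0x1201,
 * so ceph_osd_op_mode_read() and ceph_osd_op_type_data() are both
 * true for it; CEPH_OSD_OP_SETXATTR (0x2000 | 0x0300 | 1 = 0x2301)
 * is a modify op of type attr.
 */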
270
271#define CEPH_OSD_TMAP_HDR 'h'
272#define CEPH_OSD_TMAP_SET 's'
273#define CEPH_OSD_TMAP_RM 'r'
274
275extern const char *ceph_osd_op_name(int op);
276
277
278/*
279 * osd op flags
280 *
281 * An op may be READ, WRITE, or READ|WRITE.
282 */
283enum {
284 CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
285 CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
286 CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
287 CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
288 CEPH_OSD_FLAG_READ = 16, /* op may read */
289 CEPH_OSD_FLAG_WRITE = 32, /* op may write */
290 CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
291 CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
292 CEPH_OSD_FLAG_BALANCE_READS = 256,
293 CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
294 CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
295 CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
296};
297
298enum {
299 CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
300};
301
 302#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc */
303#define EBLACKLISTED ESHUTDOWN /* blacklisted */
304
305/*
306 * an individual object operation. each may be accompanied by some data
307 * payload
308 */
309struct ceph_osd_op {
310 __le16 op; /* CEPH_OSD_OP_* */
311 __le32 flags; /* CEPH_OSD_FLAG_* */
312 union {
313 struct {
314 __le64 offset, length;
315 __le64 truncate_size;
316 __le32 truncate_seq;
317 } __attribute__ ((packed)) extent;
318 struct {
319 __le32 name_len;
320 __le32 value_len;
321 } __attribute__ ((packed)) xattr;
322 struct {
323 __u8 class_len;
324 __u8 method_len;
325 __u8 argc;
326 __le32 indata_len;
327 } __attribute__ ((packed)) cls;
328 struct {
329 __le64 cookie, count;
330 } __attribute__ ((packed)) pgls;
331 };
332 __le32 payload_len;
333} __attribute__ ((packed));
334
335/*
336 * osd request message header. each request may include multiple
337 * ceph_osd_op object operations.
338 */
339struct ceph_osd_request_head {
340 __le32 client_inc; /* client incarnation */
341 struct ceph_object_layout layout; /* pgid */
342 __le32 osdmap_epoch; /* client's osdmap epoch */
343
344 __le32 flags;
345
346 struct ceph_timespec mtime; /* for mutations only */
347 struct ceph_eversion reassert_version; /* if we are replaying op */
348
349 __le32 object_len; /* length of object name */
350
351 __le64 snapid; /* snapid to read */
352 __le64 snap_seq; /* writer's snap context */
353 __le32 num_snaps;
354
355 __le16 num_ops;
356 struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
357} __attribute__ ((packed));
358
359struct ceph_osd_reply_head {
360 __le32 client_inc; /* client incarnation */
361 __le32 flags;
362 struct ceph_object_layout layout;
363 __le32 osdmap_epoch;
364 struct ceph_eversion reassert_version; /* for replaying uncommitted */
365
366 __le32 result; /* result code */
367
368 __le32 object_len; /* length of object name */
369 __le32 num_ops;
 370 struct ceph_osd_op ops[]; /* ops[], object */
371} __attribute__ ((packed));
372
373
374#endif
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 000000000000..bf2a5f3846a4
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,904 @@
1#include "ceph_debug.h"
2
3#include <linux/sort.h>
4
5#include "super.h"
6#include "decode.h"
7
8/*
9 * Snapshots in ceph are driven in large part by cooperation from the
10 * client. In contrast to local file systems or file servers that
11 * implement snapshots at a single point in the system, ceph's
12 * distributed access to storage requires clients to help decide
13 * whether a write logically occurs before or after a recently created
14 * snapshot.
15 *
 16 * This provides a perfect instantaneous client-wide snapshot. Between
17 * clients, however, snapshots may appear to be applied at slightly
18 * different points in time, depending on delays in delivering the
19 * snapshot notification.
20 *
21 * Snapshots are _not_ file system-wide. Instead, each snapshot
22 * applies to the subdirectory nested beneath some directory. This
23 * effectively divides the hierarchy into multiple "realms," where all
24 * of the files contained by each realm share the same set of
25 * snapshots. An individual realm's snap set contains snapshots
26 * explicitly created on that realm, as well as any snaps in its
 27 * parent's snap set _after_ the point at which the parent became its
 28 * parent (due to, say, a rename). Similarly, snaps taken by prior
 29 * parents while they were the parent are included.
30 *
 31 * The client is spared most of this detail, fortunately... it need only
 32 * maintain a hierarchy of realms reflecting the current parent/child
33 * realm relationship, and for each realm has an explicit list of snaps
34 * inherited from prior parents.
35 *
 36 * A snap_realm struct is maintained for every realm containing an inode
37 * with an open cap in the system. (The needed snap realm information is
38 * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
39 * version number is used to ensure that as realm parameters change (new
40 * snapshot, new parent, etc.) the client's realm hierarchy is updated.
41 *
42 * The realm hierarchy drives the generation of a 'snap context' for each
43 * realm, which simply lists the resulting set of snaps for the realm. This
44 * is attached to any writes sent to OSDs.
45 */
46/*
47 * Unfortunately error handling is a bit mixed here. If we get a snap
48 * update, but don't have enough memory to update our realm hierarchy,
49 * it's not clear what we can do about it (besides complaining to the
50 * console).
51 */
52
53
54/*
55 * increase ref count for the realm
56 *
57 * caller must hold snap_rwsem for write.
58 */
59void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
60 struct ceph_snap_realm *realm)
61{
62 dout("get_realm %p %d -> %d\n", realm,
63 atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
64 /*
65 * since we _only_ increment realm refs or empty the empty
66 * list with snap_rwsem held, adjusting the empty list here is
67 * safe. we do need to protect against concurrent empty list
68 * additions, however.
69 */
70 if (atomic_read(&realm->nref) == 0) {
71 spin_lock(&mdsc->snap_empty_lock);
72 list_del_init(&realm->empty_item);
73 spin_unlock(&mdsc->snap_empty_lock);
74 }
75
76 atomic_inc(&realm->nref);
77}
78
79static void __insert_snap_realm(struct rb_root *root,
80 struct ceph_snap_realm *new)
81{
82 struct rb_node **p = &root->rb_node;
83 struct rb_node *parent = NULL;
84 struct ceph_snap_realm *r = NULL;
85
86 while (*p) {
87 parent = *p;
88 r = rb_entry(parent, struct ceph_snap_realm, node);
89 if (new->ino < r->ino)
90 p = &(*p)->rb_left;
91 else if (new->ino > r->ino)
92 p = &(*p)->rb_right;
93 else
94 BUG();
95 }
96
97 rb_link_node(&new->node, parent, p);
98 rb_insert_color(&new->node, root);
99}
100
101/*
102 * create and get the realm rooted at @ino and bump its ref count.
103 *
104 * caller must hold snap_rwsem for write.
105 */
106static struct ceph_snap_realm *ceph_create_snap_realm(
107 struct ceph_mds_client *mdsc,
108 u64 ino)
109{
110 struct ceph_snap_realm *realm;
111
112 realm = kzalloc(sizeof(*realm), GFP_NOFS);
113 if (!realm)
114 return ERR_PTR(-ENOMEM);
115
116 atomic_set(&realm->nref, 0); /* tree does not take a ref */
117 realm->ino = ino;
118 INIT_LIST_HEAD(&realm->children);
119 INIT_LIST_HEAD(&realm->child_item);
120 INIT_LIST_HEAD(&realm->empty_item);
121 INIT_LIST_HEAD(&realm->inodes_with_caps);
122 spin_lock_init(&realm->inodes_with_caps_lock);
123 __insert_snap_realm(&mdsc->snap_realms, realm);
124 dout("create_snap_realm %llx %p\n", realm->ino, realm);
125 return realm;
126}
127
128/*
129 * lookup the realm rooted at @ino.
130 *
131 * caller must hold snap_rwsem for write.
132 */
133struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
134 u64 ino)
135{
136 struct rb_node *n = mdsc->snap_realms.rb_node;
137 struct ceph_snap_realm *r;
138
139 while (n) {
140 r = rb_entry(n, struct ceph_snap_realm, node);
141 if (ino < r->ino)
142 n = n->rb_left;
143 else if (ino > r->ino)
144 n = n->rb_right;
145 else {
146 dout("lookup_snap_realm %llx %p\n", r->ino, r);
147 return r;
148 }
149 }
150 return NULL;
151}
152
153static void __put_snap_realm(struct ceph_mds_client *mdsc,
154 struct ceph_snap_realm *realm);
155
156/*
157 * called with snap_rwsem (write)
158 */
159static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
160 struct ceph_snap_realm *realm)
161{
162 dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
163
164 rb_erase(&realm->node, &mdsc->snap_realms);
165
166 if (realm->parent) {
167 list_del_init(&realm->child_item);
168 __put_snap_realm(mdsc, realm->parent);
169 }
170
171 kfree(realm->prior_parent_snaps);
172 kfree(realm->snaps);
173 ceph_put_snap_context(realm->cached_context);
174 kfree(realm);
175}
176
177/*
178 * caller holds snap_rwsem (write)
179 */
180static void __put_snap_realm(struct ceph_mds_client *mdsc,
181 struct ceph_snap_realm *realm)
182{
183 dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
184 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
185 if (atomic_dec_and_test(&realm->nref))
186 __destroy_snap_realm(mdsc, realm);
187}
188
189/*
190 * caller needn't hold any locks
191 */
192void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
193 struct ceph_snap_realm *realm)
194{
195 dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
196 atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
197 if (!atomic_dec_and_test(&realm->nref))
198 return;
199
200 if (down_write_trylock(&mdsc->snap_rwsem)) {
201 __destroy_snap_realm(mdsc, realm);
202 up_write(&mdsc->snap_rwsem);
203 } else {
204 spin_lock(&mdsc->snap_empty_lock);
205 list_add(&mdsc->snap_empty, &realm->empty_item);
206 spin_unlock(&mdsc->snap_empty_lock);
207 }
208}
209
210/*
211 * Clean up any realms whose ref counts have dropped to zero. Note
 212 * that this does not include realms that were created but not yet
213 * used.
214 *
215 * Called under snap_rwsem (write)
216 */
217static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
218{
219 struct ceph_snap_realm *realm;
220
221 spin_lock(&mdsc->snap_empty_lock);
222 while (!list_empty(&mdsc->snap_empty)) {
223 realm = list_first_entry(&mdsc->snap_empty,
224 struct ceph_snap_realm, empty_item);
225 list_del(&realm->empty_item);
226 spin_unlock(&mdsc->snap_empty_lock);
227 __destroy_snap_realm(mdsc, realm);
228 spin_lock(&mdsc->snap_empty_lock);
229 }
230 spin_unlock(&mdsc->snap_empty_lock);
231}
232
233void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
234{
235 down_write(&mdsc->snap_rwsem);
236 __cleanup_empty_realms(mdsc);
237 up_write(&mdsc->snap_rwsem);
238}
239
240/*
 241 * adjust the parent realm of a given @realm. adjust the child list,
 242 * parent pointers, and ref counts appropriately.
 243 *
 244 * return 1 if parent was changed, 0 if unchanged, <0 on error.
245 *
246 * caller must hold snap_rwsem for write.
247 */
248static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
249 struct ceph_snap_realm *realm,
250 u64 parentino)
251{
252 struct ceph_snap_realm *parent;
253
254 if (realm->parent_ino == parentino)
255 return 0;
256
257 parent = ceph_lookup_snap_realm(mdsc, parentino);
258 if (!parent) {
259 parent = ceph_create_snap_realm(mdsc, parentino);
260 if (IS_ERR(parent))
261 return PTR_ERR(parent);
262 }
263 dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
264 realm->ino, realm, realm->parent_ino, realm->parent,
265 parentino, parent);
266 if (realm->parent) {
267 list_del_init(&realm->child_item);
268 ceph_put_snap_realm(mdsc, realm->parent);
269 }
270 realm->parent_ino = parentino;
271 realm->parent = parent;
272 ceph_get_snap_realm(mdsc, parent);
273 list_add(&realm->child_item, &parent->children);
274 return 1;
275}
276
277
278static int cmpu64_rev(const void *a, const void *b)
279{
280 if (*(u64 *)a < *(u64 *)b)
281 return 1;
282 if (*(u64 *)a > *(u64 *)b)
283 return -1;
284 return 0;
285}
286
287/*
288 * build the snap context for a given realm.
289 */
290static int build_snap_context(struct ceph_snap_realm *realm)
291{
292 struct ceph_snap_realm *parent = realm->parent;
293 struct ceph_snap_context *snapc;
294 int err = 0;
295 int i;
296 int num = realm->num_prior_parent_snaps + realm->num_snaps;
297
298 /*
299 * build parent context, if it hasn't been built.
300 * conservatively estimate that all parent snaps might be
301 * included by us.
302 */
303 if (parent) {
304 if (!parent->cached_context) {
305 err = build_snap_context(parent);
306 if (err)
307 goto fail;
308 }
309 num += parent->cached_context->num_snaps;
310 }
311
 312 /* do i actually need to update? not if my context seq
 313 matches the realm seq, and my parent's does too. (this works
 314 because rebuild_snap_realms() works _downward_ in the
 315 hierarchy after each update.) */
316 if (realm->cached_context &&
317 realm->cached_context->seq <= realm->seq &&
318 (!parent ||
319 realm->cached_context->seq <= parent->cached_context->seq)) {
320 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
321 " (unchanged)\n",
322 realm->ino, realm, realm->cached_context,
323 realm->cached_context->seq,
324 realm->cached_context->num_snaps);
325 return 0;
326 }
327
328 /* alloc new snap context */
329 err = -ENOMEM;
330 if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
331 goto fail;
332 snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
333 if (!snapc)
334 goto fail;
335 atomic_set(&snapc->nref, 1);
336
337 /* build (reverse sorted) snap vector */
338 num = 0;
339 snapc->seq = realm->seq;
340 if (parent) {
 341 /* include any of parent's snaps occurring _after_ my
342 parent became my parent */
343 for (i = 0; i < parent->cached_context->num_snaps; i++)
344 if (parent->cached_context->snaps[i] >=
345 realm->parent_since)
346 snapc->snaps[num++] =
347 parent->cached_context->snaps[i];
348 if (parent->cached_context->seq > snapc->seq)
349 snapc->seq = parent->cached_context->seq;
350 }
351 memcpy(snapc->snaps + num, realm->snaps,
352 sizeof(u64)*realm->num_snaps);
353 num += realm->num_snaps;
354 memcpy(snapc->snaps + num, realm->prior_parent_snaps,
355 sizeof(u64)*realm->num_prior_parent_snaps);
356 num += realm->num_prior_parent_snaps;
357
358 sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
359 snapc->num_snaps = num;
360 dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
361 realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
362
363 if (realm->cached_context)
364 ceph_put_snap_context(realm->cached_context);
365 realm->cached_context = snapc;
366 return 0;
367
368fail:
369 /*
370 * if we fail, clear old (incorrect) cached_context... hopefully
371 * we'll have better luck building it later
372 */
373 if (realm->cached_context) {
374 ceph_put_snap_context(realm->cached_context);
375 realm->cached_context = NULL;
376 }
377 pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
378 realm, err);
379 return err;
380}
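/*
 * Worked example (made-up snapids): a realm with snaps {8}, prior
 * parent snaps {3}, parent_since = 5, and a parent whose context is
 * {10, 6, 2}.  The parent contributes only 10 and 6 (>= parent_since),
 * so the merged, reverse-sorted context is {10, 8, 6, 3}, with seq the
 * max of the realm's and parent's seqs.
 */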
381
382/*
383 * rebuild snap context for the given realm and all of its children.
384 */
385static void rebuild_snap_realms(struct ceph_snap_realm *realm)
386{
387 struct ceph_snap_realm *child;
388
389 dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
390 build_snap_context(realm);
391
392 list_for_each_entry(child, &realm->children, child_item)
393 rebuild_snap_realms(child);
394}
395
396
397/*
398 * helper to allocate and decode an array of snapids. free prior
399 * instance, if any.
400 */
401static int dup_array(u64 **dst, __le64 *src, int num)
402{
403 int i;
404
405 kfree(*dst);
406 if (num) {
407 *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
408 if (!*dst)
409 return -ENOMEM;
410 for (i = 0; i < num; i++)
411 (*dst)[i] = get_unaligned_le64(src + i);
412 } else {
413 *dst = NULL;
414 }
415 return 0;
416}
417
418
419/*
420 * When a snapshot is applied, the size/mtime inode metadata is queued
421 * in a ceph_cap_snap (one for each snapshot) until writeback
422 * completes and the metadata can be flushed back to the MDS.
423 *
424 * However, if a (sync) write is currently in-progress when we apply
425 * the snapshot, we have to wait until the write succeeds or fails
426 * (and a final size/mtime is known). In this case the
427 * cap_snap->writing = 1, and is said to be "pending." When the write
428 * finishes, we __ceph_finish_cap_snap().
429 *
430 * Caller must hold snap_rwsem for read (i.e., the realm topology won't
431 * change).
432 */
433void ceph_queue_cap_snap(struct ceph_inode_info *ci,
434 struct ceph_snap_context *snapc)
435{
436 struct inode *inode = &ci->vfs_inode;
437 struct ceph_cap_snap *capsnap;
438 int used;
439
440 capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
441 if (!capsnap) {
442 pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
443 return;
444 }
445
446 spin_lock(&inode->i_lock);
447 used = __ceph_caps_used(ci);
448 if (__ceph_have_pending_cap_snap(ci)) {
449 /* there is no point in queuing multiple "pending" cap_snaps,
450 as no new writes are allowed to start when pending, so any
451 writes in progress now were started before the previous
452 cap_snap. lucky us. */
453 dout("queue_cap_snap %p snapc %p seq %llu used %d"
454 " already pending\n", inode, snapc, snapc->seq, used);
455 kfree(capsnap);
456 } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
457 igrab(inode);
458
459 atomic_set(&capsnap->nref, 1);
460 capsnap->ci = ci;
461 INIT_LIST_HEAD(&capsnap->ci_item);
462 INIT_LIST_HEAD(&capsnap->flushing_item);
463
464 capsnap->follows = snapc->seq - 1;
465 capsnap->context = ceph_get_snap_context(snapc);
466 capsnap->issued = __ceph_caps_issued(ci, NULL);
467 capsnap->dirty = __ceph_caps_dirty(ci);
468
469 capsnap->mode = inode->i_mode;
470 capsnap->uid = inode->i_uid;
471 capsnap->gid = inode->i_gid;
472
473 /* fixme? */
474 capsnap->xattr_blob = NULL;
475 capsnap->xattr_len = 0;
476
477 /* dirty page count moved from _head to this cap_snap;
 478 all subsequent page dirties occur _after_ this
479 snapshot. */
480 capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
481 ci->i_wrbuffer_ref_head = 0;
482 ceph_put_snap_context(ci->i_head_snapc);
483 ci->i_head_snapc = NULL;
484 list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
485
486 if (used & CEPH_CAP_FILE_WR) {
487 dout("queue_cap_snap %p cap_snap %p snapc %p"
488 " seq %llu used WR, now pending\n", inode,
489 capsnap, snapc, snapc->seq);
490 capsnap->writing = 1;
491 } else {
492 /* note mtime, size NOW. */
493 __ceph_finish_cap_snap(ci, capsnap);
494 }
495 } else {
496 dout("queue_cap_snap %p nothing dirty|writing\n", inode);
497 kfree(capsnap);
498 }
499
500 spin_unlock(&inode->i_lock);
501}
502
503/*
504 * Finalize the size, mtime for a cap_snap.. that is, settle on final values
505 * to be used for the snapshot, to be flushed back to the mds.
506 *
507 * If capsnap can now be flushed, add to snap_flush list, and return 1.
508 *
509 * Caller must hold i_lock.
510 */
511int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
512 struct ceph_cap_snap *capsnap)
513{
514 struct inode *inode = &ci->vfs_inode;
515 struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
516
517 BUG_ON(capsnap->writing);
518 capsnap->size = inode->i_size;
519 capsnap->mtime = inode->i_mtime;
520 capsnap->atime = inode->i_atime;
521 capsnap->ctime = inode->i_ctime;
522 capsnap->time_warp_seq = ci->i_time_warp_seq;
523 if (capsnap->dirty_pages) {
524 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
525 "still has %d dirty pages\n", inode, capsnap,
526 capsnap->context, capsnap->context->seq,
527 capsnap->size, capsnap->dirty_pages);
528 return 0;
529 }
530 dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
531 inode, capsnap, capsnap->context,
532 capsnap->context->seq, capsnap->size);
533
534 spin_lock(&mdsc->snap_flush_lock);
535 list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
536 spin_unlock(&mdsc->snap_flush_lock);
537 return 1; /* caller may want to ceph_flush_snaps */
538}
539
540
541/*
542 * Parse and apply a snapblob "snap trace" from the MDS. This specifies
543 * the snap realm parameters from a given realm and all of its ancestors,
544 * up to the root.
545 *
546 * Caller must hold snap_rwsem for write.
547 */
548int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
549 void *p, void *e, bool deletion)
550{
551 struct ceph_mds_snap_realm *ri; /* encoded */
552 __le64 *snaps; /* encoded */
553 __le64 *prior_parent_snaps; /* encoded */
554 struct ceph_snap_realm *realm;
555 int invalidate = 0;
556 int err = -ENOMEM;
557
558 dout("update_snap_trace deletion=%d\n", deletion);
559more:
560 ceph_decode_need(&p, e, sizeof(*ri), bad);
561 ri = p;
562 p += sizeof(*ri);
563 ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
564 le32_to_cpu(ri->num_prior_parent_snaps)), bad);
565 snaps = p;
566 p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
567 prior_parent_snaps = p;
568 p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
569
570 realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
571 if (!realm) {
572 realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
573 if (IS_ERR(realm)) {
574 err = PTR_ERR(realm);
575 goto fail;
576 }
577 }
578
579 if (le64_to_cpu(ri->seq) > realm->seq) {
580 dout("update_snap_trace updating %llx %p %lld -> %lld\n",
581 realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
582 /*
583 * if the realm seq has changed, queue a cap_snap for every
584 * inode with open caps. we do this _before_ we update
585 * the realm info so that we prepare for writeback under the
586 * _previous_ snap context.
587 *
588 * ...unless it's a snap deletion!
589 */
590 if (!deletion) {
591 struct ceph_inode_info *ci;
592 struct inode *lastinode = NULL;
593
594 spin_lock(&realm->inodes_with_caps_lock);
595 list_for_each_entry(ci, &realm->inodes_with_caps,
596 i_snap_realm_item) {
597 struct inode *inode = igrab(&ci->vfs_inode);
598 if (!inode)
599 continue;
600 spin_unlock(&realm->inodes_with_caps_lock);
601 if (lastinode)
602 iput(lastinode);
603 lastinode = inode;
604 ceph_queue_cap_snap(ci, realm->cached_context);
605 spin_lock(&realm->inodes_with_caps_lock);
606 }
607 spin_unlock(&realm->inodes_with_caps_lock);
608 if (lastinode)
609 iput(lastinode);
610 dout("update_snap_trace cap_snaps queued\n");
611 }
612
613 } else {
614 dout("update_snap_trace %llx %p seq %lld unchanged\n",
615 realm->ino, realm, realm->seq);
616 }
617
618 /* ensure the parent is correct */
619 err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
620 if (err < 0)
621 goto fail;
622 invalidate += err;
623
624 if (le64_to_cpu(ri->seq) > realm->seq) {
625 /* update realm parameters, snap lists */
626 realm->seq = le64_to_cpu(ri->seq);
627 realm->created = le64_to_cpu(ri->created);
628 realm->parent_since = le64_to_cpu(ri->parent_since);
629
630 realm->num_snaps = le32_to_cpu(ri->num_snaps);
631 err = dup_array(&realm->snaps, snaps, realm->num_snaps);
632 if (err < 0)
633 goto fail;
634
635 realm->num_prior_parent_snaps =
636 le32_to_cpu(ri->num_prior_parent_snaps);
637 err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
638 realm->num_prior_parent_snaps);
639 if (err < 0)
640 goto fail;
641
642 invalidate = 1;
643 } else if (!realm->cached_context) {
644 invalidate = 1;
645 }
646
647 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
648 realm, invalidate, p, e);
649
650 if (p < e)
651 goto more;
652
653 /* invalidate when we reach the _end_ (root) of the trace */
654 if (invalidate)
655 rebuild_snap_realms(realm);
656
657 __cleanup_empty_realms(mdsc);
658 return 0;
659
660bad:
661 err = -EINVAL;
662fail:
663 pr_err("update_snap_trace error %d\n", err);
664 return err;
665}
666
667
668/*
669 * Send any cap_snaps that are queued for flush. Try to carry
670 * s_mutex across multiple snap flushes to avoid locking overhead.
671 *
672 * Caller holds no locks.
673 */
674static void flush_snaps(struct ceph_mds_client *mdsc)
675{
676 struct ceph_inode_info *ci;
677 struct inode *inode;
678 struct ceph_mds_session *session = NULL;
679
680 dout("flush_snaps\n");
681 spin_lock(&mdsc->snap_flush_lock);
682 while (!list_empty(&mdsc->snap_flush_list)) {
683 ci = list_first_entry(&mdsc->snap_flush_list,
684 struct ceph_inode_info, i_snap_flush_item);
685 inode = &ci->vfs_inode;
686 igrab(inode);
687 spin_unlock(&mdsc->snap_flush_lock);
688 spin_lock(&inode->i_lock);
689 __ceph_flush_snaps(ci, &session);
690 spin_unlock(&inode->i_lock);
691 iput(inode);
692 spin_lock(&mdsc->snap_flush_lock);
693 }
694 spin_unlock(&mdsc->snap_flush_lock);
695
696 if (session) {
697 mutex_unlock(&session->s_mutex);
698 ceph_put_mds_session(session);
699 }
700 dout("flush_snaps done\n");
701}
702
703
704/*
705 * Handle a snap notification from the MDS.
706 *
707 * This can take two basic forms: the simplest is just a snap creation
708 * or deletion notification on an existing realm. This should update the
709 * realm and its children.
710 *
711 * The more difficult case is realm creation, due to snap creation at a
712 * new point in the file hierarchy, or due to a rename that moves a file or
713 * directory into another realm.
714 */
715void ceph_handle_snap(struct ceph_mds_client *mdsc,
716 struct ceph_mds_session *session,
717 struct ceph_msg *msg)
718{
719 struct super_block *sb = mdsc->client->sb;
720 int mds = session->s_mds;
721 u64 split;
722 int op;
723 int trace_len;
724 struct ceph_snap_realm *realm = NULL;
725 void *p = msg->front.iov_base;
726 void *e = p + msg->front.iov_len;
727 struct ceph_mds_snap_head *h;
728 int num_split_inos, num_split_realms;
729 __le64 *split_inos = NULL, *split_realms = NULL;
730 int i;
731 int locked_rwsem = 0;
732
733 /* decode */
734 if (msg->front.iov_len < sizeof(*h))
735 goto bad;
736 h = p;
737 op = le32_to_cpu(h->op);
738 split = le64_to_cpu(h->split); /* non-zero if we are splitting an
739 * existing realm */
740 num_split_inos = le32_to_cpu(h->num_split_inos);
741 num_split_realms = le32_to_cpu(h->num_split_realms);
742 trace_len = le32_to_cpu(h->trace_len);
743 p += sizeof(*h);
744
745 dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
746 ceph_snap_op_name(op), split, trace_len);
747
748 mutex_lock(&session->s_mutex);
749 session->s_seq++;
750 mutex_unlock(&session->s_mutex);
751
752 down_write(&mdsc->snap_rwsem);
753 locked_rwsem = 1;
754
755 if (op == CEPH_SNAP_OP_SPLIT) {
756 struct ceph_mds_snap_realm *ri;
757
758 /*
759 * A "split" breaks part of an existing realm off into
760 * a new realm. The MDS provides a list of inodes
761 * (with caps) and child realms that belong to the new
762 * child.
763 */
764 split_inos = p;
765 p += sizeof(u64) * num_split_inos;
766 split_realms = p;
767 p += sizeof(u64) * num_split_realms;
768 ceph_decode_need(&p, e, sizeof(*ri), bad);
769 /* we will peek at realm info here, but will _not_
770 * advance p, as the realm update will occur below in
771 * ceph_update_snap_trace. */
772 ri = p;
773
774 realm = ceph_lookup_snap_realm(mdsc, split);
775 if (!realm) {
776 realm = ceph_create_snap_realm(mdsc, split);
777 if (IS_ERR(realm))
778 goto out;
779 }
780 ceph_get_snap_realm(mdsc, realm);
781
782 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
783 for (i = 0; i < num_split_inos; i++) {
784 struct ceph_vino vino = {
785 .ino = le64_to_cpu(split_inos[i]),
786 .snap = CEPH_NOSNAP,
787 };
788 struct inode *inode = ceph_find_inode(sb, vino);
789 struct ceph_inode_info *ci;
790
791 if (!inode)
792 continue;
793 ci = ceph_inode(inode);
794
795 spin_lock(&inode->i_lock);
796 if (!ci->i_snap_realm)
797 goto skip_inode;
798 /*
799 * If this inode belongs to a realm that was
800 * created after our new realm, we experienced
 801 * a race (due to another split notification
802 * arriving from a different MDS). So skip
803 * this inode.
804 */
805 if (ci->i_snap_realm->created >
806 le64_to_cpu(ri->created)) {
807 dout(" leaving %p in newer realm %llx %p\n",
808 inode, ci->i_snap_realm->ino,
809 ci->i_snap_realm);
810 goto skip_inode;
811 }
812 dout(" will move %p to split realm %llx %p\n",
813 inode, realm->ino, realm);
814 /*
815 * Remove the inode from the realm's inode
816 * list, but don't add it to the new realm
817 * yet. We don't want the cap_snap to be
818 * queued (again) by ceph_update_snap_trace()
819 * below. Queue it _now_, under the old context.
820 */
821 list_del_init(&ci->i_snap_realm_item);
822 spin_unlock(&inode->i_lock);
823
824 ceph_queue_cap_snap(ci,
825 ci->i_snap_realm->cached_context);
826
827 iput(inode);
828 continue;
829
830skip_inode:
831 spin_unlock(&inode->i_lock);
832 iput(inode);
833 }
834
835 /* we may have taken some of the old realm's children. */
836 for (i = 0; i < num_split_realms; i++) {
837 struct ceph_snap_realm *child =
838 ceph_lookup_snap_realm(mdsc,
839 le64_to_cpu(split_realms[i]));
840 if (!child)
841 continue;
842 adjust_snap_realm_parent(mdsc, child, realm->ino);
843 }
844 }
845
846 /*
847 * update using the provided snap trace. if we are deleting a
848 * snap, we can avoid queueing cap_snaps.
849 */
850 ceph_update_snap_trace(mdsc, p, e,
851 op == CEPH_SNAP_OP_DESTROY);
852
853 if (op == CEPH_SNAP_OP_SPLIT) {
854 /*
855 * ok, _now_ add the inodes into the new realm.
856 */
857 for (i = 0; i < num_split_inos; i++) {
858 struct ceph_vino vino = {
859 .ino = le64_to_cpu(split_inos[i]),
860 .snap = CEPH_NOSNAP,
861 };
862 struct inode *inode = ceph_find_inode(sb, vino);
863 struct ceph_inode_info *ci;
864
865 if (!inode)
866 continue;
867 ci = ceph_inode(inode);
868 spin_lock(&inode->i_lock);
869 if (!ci->i_snap_realm)
870 goto split_skip_inode;
871 ceph_put_snap_realm(mdsc, ci->i_snap_realm);
872 spin_lock(&realm->inodes_with_caps_lock);
873 list_add(&ci->i_snap_realm_item,
874 &realm->inodes_with_caps);
875 ci->i_snap_realm = realm;
876 spin_unlock(&realm->inodes_with_caps_lock);
877 ceph_get_snap_realm(mdsc, realm);
878split_skip_inode:
879 spin_unlock(&inode->i_lock);
880 iput(inode);
881 }
882
883 /* we took a reference when we created the realm, above */
884 ceph_put_snap_realm(mdsc, realm);
885 }
886
887 __cleanup_empty_realms(mdsc);
888
889 up_write(&mdsc->snap_rwsem);
890
891 flush_snaps(mdsc);
892 return;
893
894bad:
895 pr_err("corrupt snap message from mds%d\n", mds);
896 ceph_msg_dump(msg);
897out:
898 if (locked_rwsem)
899 up_write(&mdsc->snap_rwsem);
900 return;
901}
902
903
904
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 000000000000..4290a6e860b0
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
1
2#include "ceph_debug.h"
3
4#include <linux/backing-dev.h>
5#include <linux/fs.h>
6#include <linux/inet.h>
7#include <linux/in6.h>
8#include <linux/module.h>
9#include <linux/mount.h>
10#include <linux/parser.h>
11#include <linux/rwsem.h>
12#include <linux/sched.h>
13#include <linux/seq_file.h>
14#include <linux/statfs.h>
15#include <linux/string.h>
16#include <linux/version.h>
17#include <linux/vmalloc.h>
18
19#include "decode.h"
20#include "super.h"
21#include "mon_client.h"
22#include "auth.h"
23
24/*
25 * Ceph superblock operations
26 *
27 * Handle the basics of mounting, unmounting.
28 */
29
30
31/*
32 * find filename portion of a path (/foo/bar/baz -> baz)
33 */
34const char *ceph_file_part(const char *s, int len)
35{
36 const char *e = s + len;
37
38 while (e != s && *(e-1) != '/')
39 e--;
40 return e;
41}
42
43
44/*
45 * super ops
46 */
47static void ceph_put_super(struct super_block *s)
48{
49 struct ceph_client *cl = ceph_client(s);
50
51 dout("put_super\n");
52 ceph_mdsc_close_sessions(&cl->mdsc);
53 return;
54}
55
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
57{
58 struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
59 struct ceph_monmap *monmap = client->monc.monmap;
60 struct ceph_statfs st;
61 u64 fsid;
62 int err;
63
64 dout("statfs\n");
65 err = ceph_monc_do_statfs(&client->monc, &st);
66 if (err < 0)
67 return err;
68
69 /* fill in kstatfs */
70 buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
71
72 /*
73 * express utilization in terms of large blocks to avoid
74 * overflow on 32-bit machines.
75 */
76 buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
77 buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
78 buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
79 (CEPH_BLOCK_SHIFT-10);
80 buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
81
82 buf->f_files = le64_to_cpu(st.num_objects);
83 buf->f_ffree = -1;
84 buf->f_namelen = PATH_MAX;
85 buf->f_frsize = PAGE_CACHE_SIZE;
86
87 /* leave fsid little-endian, regardless of host endianness */
88 fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
89 buf->f_fsid.val[0] = fsid & 0xffffffff;
90 buf->f_fsid.val[1] = fsid >> 32;
91
92 return 0;
93}
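/*
 * Arithmetic note: st.kb counts KB, so shifting right by
 * (CEPH_BLOCK_SHIFT - 10) converts KB into CEPH_BLOCK-sized units.
 * E.g. if CEPH_BLOCK_SHIFT were 22 (4 MB blocks, an assumed value),
 * 8388608 KB (8 GB) >> 12 = 2048 blocks.
 */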
94
95
96static int ceph_syncfs(struct super_block *sb, int wait)
97{
98 dout("sync_fs %d\n", wait);
99 ceph_osdc_sync(&ceph_client(sb)->osdc);
100 ceph_mdsc_sync(&ceph_client(sb)->mdsc);
101 dout("sync_fs %d done\n", wait);
102 return 0;
103}
104
105
106/**
107 * ceph_show_options - Show mount options in /proc/mounts
108 * @m: seq_file to write to
109 * @mnt: mount descriptor
110 */
111static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
112{
113 struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
114 struct ceph_mount_args *args = client->mount_args;
115
116 if (args->flags & CEPH_OPT_FSID)
 117 seq_printf(m, ",fsidmajor=%llu,fsidminor=%llu",
118 le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
119 le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
120 if (args->flags & CEPH_OPT_NOSHARE)
121 seq_puts(m, ",noshare");
122 if (args->flags & CEPH_OPT_DIRSTAT)
123 seq_puts(m, ",dirstat");
124 if ((args->flags & CEPH_OPT_RBYTES) == 0)
125 seq_puts(m, ",norbytes");
126 if (args->flags & CEPH_OPT_NOCRC)
127 seq_puts(m, ",nocrc");
128 if (args->flags & CEPH_OPT_NOASYNCREADDIR)
129 seq_puts(m, ",noasyncreaddir");
130 if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
131 seq_printf(m, ",snapdirname=%s", args->snapdir_name);
132 if (args->name)
133 seq_printf(m, ",name=%s", args->name);
134 if (args->secret)
135 seq_puts(m, ",secret=<hidden>");
136 return 0;
137}
138
139/*
140 * caches
141 */
142struct kmem_cache *ceph_inode_cachep;
143struct kmem_cache *ceph_cap_cachep;
144struct kmem_cache *ceph_dentry_cachep;
145struct kmem_cache *ceph_file_cachep;
146
147static void ceph_inode_init_once(void *foo)
148{
149 struct ceph_inode_info *ci = foo;
150 inode_init_once(&ci->vfs_inode);
151}
152
153static int default_congestion_kb(void)
154{
155 int congestion_kb;
156
157 /*
158 * Copied from NFS
159 *
160 * congestion size, scale with available memory.
161 *
162 * 64MB: 8192k
163 * 128MB: 11585k
164 * 256MB: 16384k
165 * 512MB: 23170k
166 * 1GB: 32768k
167 * 2GB: 46340k
168 * 4GB: 65536k
169 * 8GB: 92681k
170 * 16GB: 131072k
171 *
172 * This allows larger machines to have larger/more transfers.
173 * Limit the default to 256M
174 */
175 congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
176 if (congestion_kb > 256*1024)
177 congestion_kb = 256*1024;
178
179 return congestion_kb;
180}
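/*
 * Checking one row of the table above: on a 1 GB machine with 4 KB
 * pages, totalram_pages = 262144, int_sqrt(262144) = 512, and
 * 16 * 512 = 8192, shifted left by (PAGE_SHIFT - 10) = 2 gives
 * 32768k, matching the "1GB: 32768k" entry.
 */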
181
182static int __init init_caches(void)
183{
184 ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
185 sizeof(struct ceph_inode_info),
186 __alignof__(struct ceph_inode_info),
187 (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
188 ceph_inode_init_once);
189 if (ceph_inode_cachep == NULL)
190 return -ENOMEM;
191
192 ceph_cap_cachep = KMEM_CACHE(ceph_cap,
193 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
194 if (ceph_cap_cachep == NULL)
195 goto bad_cap;
196
197 ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
198 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
199 if (ceph_dentry_cachep == NULL)
200 goto bad_dentry;
201
202 ceph_file_cachep = KMEM_CACHE(ceph_file_info,
203 SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
204 if (ceph_file_cachep == NULL)
205 goto bad_file;
206
207 return 0;
208
209bad_file:
210 kmem_cache_destroy(ceph_dentry_cachep);
211bad_dentry:
212 kmem_cache_destroy(ceph_cap_cachep);
213bad_cap:
214 kmem_cache_destroy(ceph_inode_cachep);
215 return -ENOMEM;
216}
217
218static void destroy_caches(void)
219{
220 kmem_cache_destroy(ceph_inode_cachep);
221 kmem_cache_destroy(ceph_cap_cachep);
222 kmem_cache_destroy(ceph_dentry_cachep);
223 kmem_cache_destroy(ceph_file_cachep);
224}
225
226
227/*
 228 * ceph_umount_begin - initiate forced umount. Tear down the
229 * mount, skipping steps that may hang while waiting for server(s).
230 */
231static void ceph_umount_begin(struct super_block *sb)
232{
233 struct ceph_client *client = ceph_sb_to_client(sb);
234
235 dout("ceph_umount_begin - starting forced umount\n");
236 if (!client)
237 return;
238 client->mount_state = CEPH_MOUNT_SHUTDOWN;
239 return;
240}
241
242static const struct super_operations ceph_super_ops = {
243 .alloc_inode = ceph_alloc_inode,
244 .destroy_inode = ceph_destroy_inode,
245 .write_inode = ceph_write_inode,
246 .sync_fs = ceph_syncfs,
247 .put_super = ceph_put_super,
248 .show_options = ceph_show_options,
249 .statfs = ceph_statfs,
250 .umount_begin = ceph_umount_begin,
251};
252
253
254const char *ceph_msg_type_name(int type)
255{
256 switch (type) {
257 case CEPH_MSG_SHUTDOWN: return "shutdown";
258 case CEPH_MSG_PING: return "ping";
259 case CEPH_MSG_AUTH: return "auth";
260 case CEPH_MSG_AUTH_REPLY: return "auth_reply";
261 case CEPH_MSG_MON_MAP: return "mon_map";
262 case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
263 case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
264 case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
265 case CEPH_MSG_STATFS: return "statfs";
266 case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
267 case CEPH_MSG_MDS_MAP: return "mds_map";
268 case CEPH_MSG_CLIENT_SESSION: return "client_session";
269 case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
270 case CEPH_MSG_CLIENT_REQUEST: return "client_request";
271 case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
272 case CEPH_MSG_CLIENT_REPLY: return "client_reply";
273 case CEPH_MSG_CLIENT_CAPS: return "client_caps";
274 case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
275 case CEPH_MSG_CLIENT_SNAP: return "client_snap";
276 case CEPH_MSG_CLIENT_LEASE: return "client_lease";
277 case CEPH_MSG_OSD_MAP: return "osd_map";
278 case CEPH_MSG_OSD_OP: return "osd_op";
279 case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
280 default: return "unknown";
281 }
282}
283
284
285/*
286 * mount options
287 */
288enum {
289 Opt_fsidmajor,
290 Opt_fsidminor,
291 Opt_monport,
292 Opt_wsize,
293 Opt_rsize,
294 Opt_osdtimeout,
295 Opt_osdkeepalivetimeout,
296 Opt_mount_timeout,
297 Opt_osd_idle_ttl,
298 Opt_caps_wanted_delay_min,
299 Opt_caps_wanted_delay_max,
300 Opt_readdir_max_entries,
301 Opt_congestion_kb,
302 Opt_last_int,
303 /* int args above */
304 Opt_snapdirname,
305 Opt_name,
306 Opt_secret,
307 Opt_last_string,
308 /* string args above */
309 Opt_ip,
310 Opt_noshare,
311 Opt_dirstat,
312 Opt_nodirstat,
313 Opt_rbytes,
314 Opt_norbytes,
315 Opt_nocrc,
316 Opt_noasyncreaddir,
317};
318
319static match_table_t arg_tokens = {
320 {Opt_fsidmajor, "fsidmajor=%ld"},
321 {Opt_fsidminor, "fsidminor=%ld"},
322 {Opt_monport, "monport=%d"},
323 {Opt_wsize, "wsize=%d"},
324 {Opt_rsize, "rsize=%d"},
325 {Opt_osdtimeout, "osdtimeout=%d"},
326 {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
327 {Opt_mount_timeout, "mount_timeout=%d"},
328 {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
329 {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
330 {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
331 {Opt_readdir_max_entries, "readdir_max_entries=%d"},
332 {Opt_congestion_kb, "write_congestion_kb=%d"},
333 /* int args above */
334 {Opt_snapdirname, "snapdirname=%s"},
335 {Opt_name, "name=%s"},
336 {Opt_secret, "secret=%s"},
337 /* string args above */
338 {Opt_ip, "ip=%s"},
339 {Opt_noshare, "noshare"},
340 {Opt_dirstat, "dirstat"},
341 {Opt_nodirstat, "nodirstat"},
342 {Opt_rbytes, "rbytes"},
343 {Opt_norbytes, "norbytes"},
344 {Opt_nocrc, "nocrc"},
345 {Opt_noasyncreaddir, "noasyncreaddir"},
346 {-1, NULL}
347};
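/*
 * Example invocation (illustrative addresses and values):
 *   mount -t ceph 192.168.0.1:6789,192.168.0.2:6789:/ /mnt/ceph \
 *         -o name=admin,secret=<key>,rsize=524288,noasyncreaddir
 * int options parse via the %d tokens above; bare flags take no
 * argument.
 */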
348
349
350static struct ceph_mount_args *parse_mount_args(int flags, char *options,
351 const char *dev_name,
352 const char **path)
353{
354 struct ceph_mount_args *args;
355 const char *c;
356 int err = -ENOMEM;
357 substring_t argstr[MAX_OPT_ARGS];
358
359 args = kzalloc(sizeof(*args), GFP_KERNEL);
360 if (!args)
361 return ERR_PTR(-ENOMEM);
362 args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
363 GFP_KERNEL);
364 if (!args->mon_addr)
365 goto out;
366
367 dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
368
369 /* start with defaults */
370 args->sb_flags = flags;
371 args->flags = CEPH_OPT_DEFAULT;
372 args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
373 args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
374 args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
375 args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
376 args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
377 args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
378 args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
379 args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
380 args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
381 args->max_readdir = 1024;
382 args->congestion_kb = default_congestion_kb();
383
384 /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
385 err = -EINVAL;
386 if (!dev_name)
387 goto out;
388 *path = strstr(dev_name, ":/");
389 if (*path == NULL) {
390 pr_err("device name is missing path (no :/ in %s)\n",
391 dev_name);
392 goto out;
393 }
394
395 /* get mon ip(s) */
396 err = ceph_parse_ips(dev_name, *path, args->mon_addr,
397 CEPH_MAX_MON, &args->num_mon);
398 if (err < 0)
399 goto out;
400
401 /* path on server */
402 *path += 2;
403 dout("server path '%s'\n", *path);
404
405 /* parse mount options */
406 while ((c = strsep(&options, ",")) != NULL) {
407 int token, intval, ret;
408 if (!*c)
409 continue;
410 err = -EINVAL;
411 token = match_token((char *)c, arg_tokens, argstr);
412 if (token < 0) {
413 pr_err("bad mount option at '%s'\n", c);
414 goto out;
415 }
416 if (token < Opt_last_int) {
417 ret = match_int(&argstr[0], &intval);
418 if (ret < 0) {
419 pr_err("bad mount option arg (not int) "
420 "at '%s'\n", c);
421 continue;
422 }
423 dout("got int token %d val %d\n", token, intval);
424 } else if (token > Opt_last_int && token < Opt_last_string) {
425 dout("got string token %d val %s\n", token,
426 argstr[0].from);
427 } else {
428 dout("got token %d\n", token);
429 }
430 switch (token) {
431 case Opt_fsidmajor:
432 *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
433 break;
434 case Opt_fsidminor:
435 *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
436 break;
437 case Opt_ip:
438 err = ceph_parse_ips(argstr[0].from,
439 argstr[0].to,
440 &args->my_addr,
441 1, NULL);
442 if (err < 0)
443 goto out;
444 args->flags |= CEPH_OPT_MYIP;
445 break;
446
447 case Opt_snapdirname:
448 kfree(args->snapdir_name);
449 args->snapdir_name = kstrndup(argstr[0].from,
450 argstr[0].to-argstr[0].from,
451 GFP_KERNEL);
452 break;
453 case Opt_name:
454 args->name = kstrndup(argstr[0].from,
455 argstr[0].to-argstr[0].from,
456 GFP_KERNEL);
457 break;
458 case Opt_secret:
459 args->secret = kstrndup(argstr[0].from,
460 argstr[0].to-argstr[0].from,
461 GFP_KERNEL);
462 break;
463
464 /* misc */
465 case Opt_wsize:
466 args->wsize = intval;
467 break;
468 case Opt_rsize:
469 args->rsize = intval;
470 break;
471 case Opt_osdtimeout:
472 args->osd_timeout = intval;
473 break;
474 case Opt_osdkeepalivetimeout:
475 args->osd_keepalive_timeout = intval;
476 break;
 477 case Opt_mount_timeout:
 478 args->mount_timeout = intval;
 479 break;
 case Opt_osd_idle_ttl:
 args->osd_idle_ttl = intval;
 break;
480 case Opt_caps_wanted_delay_min:
481 args->caps_wanted_delay_min = intval;
482 break;
483 case Opt_caps_wanted_delay_max:
484 args->caps_wanted_delay_max = intval;
485 break;
486 case Opt_readdir_max_entries:
487 args->max_readdir = intval;
488 break;
489 case Opt_congestion_kb:
490 args->congestion_kb = intval;
491 break;
492
493 case Opt_noshare:
494 args->flags |= CEPH_OPT_NOSHARE;
495 break;
496
497 case Opt_dirstat:
498 args->flags |= CEPH_OPT_DIRSTAT;
499 break;
500 case Opt_nodirstat:
501 args->flags &= ~CEPH_OPT_DIRSTAT;
502 break;
503 case Opt_rbytes:
504 args->flags |= CEPH_OPT_RBYTES;
505 break;
506 case Opt_norbytes:
507 args->flags &= ~CEPH_OPT_RBYTES;
508 break;
509 case Opt_nocrc:
510 args->flags |= CEPH_OPT_NOCRC;
511 break;
512 case Opt_noasyncreaddir:
513 args->flags |= CEPH_OPT_NOASYNCREADDIR;
514 break;
515
516 default:
517 BUG_ON(token);
518 }
519 }
520 return args;
521
522out:
 523 kfree(args->mon_addr);
 kfree(args->snapdir_name); /* kfree(NULL) is a no-op for unset strings */
 kfree(args->name);
 kfree(args->secret);
 524 kfree(args);
 525 return ERR_PTR(err);
526}
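/*
 * A userspace sketch (not kernel code) of the dev_name split done in
 * parse_mount_args() above: "ip1[:port1][,ip2[:port2]...]:/subdir" is
 * cut at the first ":/", leaving the monitor list on the left and the
 * server-side path on the right.
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *dev_name = "192.168.0.1:6789,192.168.0.2:/home/data";
        const char *path = strstr(dev_name, ":/");

        if (!path) {
                fprintf(stderr, "device name is missing path (no :/)\n");
                return 1;
        }
        printf("monitors: %.*s\n", (int)(path - dev_name), dev_name);
        printf("server path: %s\n", path + 2);  /* step over the ":/" */
        return 0;
}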
527
528static void destroy_mount_args(struct ceph_mount_args *args)
529{
530 dout("destroy_mount_args %p\n", args);
531 kfree(args->snapdir_name);
532 args->snapdir_name = NULL;
533 kfree(args->name);
534 args->name = NULL;
535 kfree(args->secret);
536 args->secret = NULL;
537 kfree(args);
538}
539
540/*
541 * create a fresh client instance
542 */
543static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
544{
545 struct ceph_client *client;
546 int err = -ENOMEM;
547
548 client = kzalloc(sizeof(*client), GFP_KERNEL);
549 if (client == NULL)
550 return ERR_PTR(-ENOMEM);
551
552 mutex_init(&client->mount_mutex);
553
554 init_waitqueue_head(&client->auth_wq);
555
556 client->sb = NULL;
557 client->mount_state = CEPH_MOUNT_MOUNTING;
558 client->mount_args = args;
559
560 client->msgr = NULL;
561
562 client->auth_err = 0;
563 atomic_long_set(&client->writeback_count, 0);
564
565 err = bdi_init(&client->backing_dev_info);
566 if (err < 0)
567 goto fail;
568
569 err = -ENOMEM;
570 client->wb_wq = create_workqueue("ceph-writeback");
571 if (client->wb_wq == NULL)
572 goto fail_bdi;
573 client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
574 if (client->pg_inv_wq == NULL)
575 goto fail_wb_wq;
576 client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
577 if (client->trunc_wq == NULL)
578 goto fail_pg_inv_wq;
579
580 /* set up mempools */
581 err = -ENOMEM;
582 client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
583 client->mount_args->wsize >> PAGE_CACHE_SHIFT);
584 if (!client->wb_pagevec_pool)
585 goto fail_trunc_wq;
586
587 /* caps */
588 client->min_caps = args->max_readdir;
589 ceph_adjust_min_caps(client->min_caps);
590
591 /* subsystems */
592 err = ceph_monc_init(&client->monc, client);
593 if (err < 0)
594 goto fail_mempool;
595 err = ceph_osdc_init(&client->osdc, client);
596 if (err < 0)
597 goto fail_monc;
598 err = ceph_mdsc_init(&client->mdsc, client);
599 if (err < 0)
600 goto fail_osdc;
601 return client;
602
603fail_osdc:
604 ceph_osdc_stop(&client->osdc);
605fail_monc:
606 ceph_monc_stop(&client->monc);
607fail_mempool:
608 mempool_destroy(client->wb_pagevec_pool);
609fail_trunc_wq:
610 destroy_workqueue(client->trunc_wq);
611fail_pg_inv_wq:
612 destroy_workqueue(client->pg_inv_wq);
613fail_wb_wq:
614 destroy_workqueue(client->wb_wq);
615fail_bdi:
616 bdi_destroy(&client->backing_dev_info);
617fail:
618 kfree(client);
619 return ERR_PTR(err);
620}
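/*
 * ceph_create_client() above follows the kernel's stacked-goto unwind
 * idiom: each resource that is set up gains a teardown label, and a
 * failure jumps to the label for the most recent success so resources
 * are released in reverse order. A minimal self-contained sketch of
 * the shape (the acquire/release helpers are hypothetical):
 */
#include <stdio.h>

static int acquire_a(void)  { puts("a acquired"); return 0; }
static void release_a(void) { puts("a released"); }
static int acquire_b(void)  { puts("b failed");   return -1; }

static int setup_example(void)
{
        int err;

        err = acquire_a();
        if (err)
                goto out;
        err = acquire_b();
        if (err)
                goto fail_a;    /* unwind only what already succeeded */
        return 0;

fail_a:
        release_a();
out:
        return err;
}

int main(void)
{
        return setup_example() ? 1 : 0;
}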
621
622static void ceph_destroy_client(struct ceph_client *client)
623{
624 dout("destroy_client %p\n", client);
625
626 /* unmount */
627 ceph_mdsc_stop(&client->mdsc);
628 ceph_monc_stop(&client->monc);
629 ceph_osdc_stop(&client->osdc);
630
631 ceph_adjust_min_caps(-client->min_caps);
632
633 ceph_debugfs_client_cleanup(client);
634 destroy_workqueue(client->wb_wq);
635 destroy_workqueue(client->pg_inv_wq);
636 destroy_workqueue(client->trunc_wq);
637
638 if (client->msgr)
639 ceph_messenger_destroy(client->msgr);
640 mempool_destroy(client->wb_pagevec_pool);
641
642 destroy_mount_args(client->mount_args);
643
644 kfree(client);
645 dout("destroy_client %p done\n", client);
646}
647
648/*
649 * Initially learn our fsid, or verify an fsid matches.
650 */
651int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
652{
653 if (client->have_fsid) {
654 if (ceph_fsid_compare(&client->fsid, fsid)) {
655 pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
656 PR_FSID(&client->fsid), PR_FSID(fsid));
657 return -1;
658 }
659 } else {
660 pr_info("client%lld fsid " FSID_FORMAT "\n",
661 client->monc.auth->global_id, PR_FSID(fsid));
662 memcpy(&client->fsid, fsid, sizeof(*fsid));
663 ceph_debugfs_client_init(client);
664 client->have_fsid = true;
665 }
666 return 0;
667}
668
669/*
670 * true if we have the mon map (and have thus joined the cluster)
671 */
672static int have_mon_map(struct ceph_client *client)
673{
674 return client->monc.monmap && client->monc.monmap->epoch;
675}
676
677/*
678 * Bootstrap mount by opening the root directory. Note the mount
679 * @started time from caller, and time out if this takes too long.
680 */
681static struct dentry *open_root_dentry(struct ceph_client *client,
682 const char *path,
683 unsigned long started)
684{
685 struct ceph_mds_client *mdsc = &client->mdsc;
686 struct ceph_mds_request *req = NULL;
687 int err;
688 struct dentry *root;
689
690 /* open dir */
691 dout("open_root_inode opening '%s'\n", path);
692 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
693 if (IS_ERR(req))
 694 return ERR_CAST(req);
695 req->r_path1 = kstrdup(path, GFP_NOFS);
696 req->r_ino1.ino = CEPH_INO_ROOT;
697 req->r_ino1.snap = CEPH_NOSNAP;
698 req->r_started = started;
699 req->r_timeout = client->mount_args->mount_timeout * HZ;
700 req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
701 req->r_num_caps = 2;
702 err = ceph_mdsc_do_request(mdsc, NULL, req);
703 if (err == 0) {
704 dout("open_root_inode success\n");
705 if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
706 client->sb->s_root == NULL)
707 root = d_alloc_root(req->r_target_inode);
708 else
709 root = d_obtain_alias(req->r_target_inode);
710 req->r_target_inode = NULL;
711 dout("open_root_inode success, root dentry is %p\n", root);
712 } else {
713 root = ERR_PTR(err);
714 }
715 ceph_mdsc_put_request(req);
716 return root;
717}
718
719/*
720 * mount: join the ceph cluster, and open root directory.
721 */
722static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
723 const char *path)
724{
725 struct ceph_entity_addr *myaddr = NULL;
726 int err;
727 unsigned long timeout = client->mount_args->mount_timeout * HZ;
728 unsigned long started = jiffies; /* note the start time */
729 struct dentry *root;
730
731 dout("mount start\n");
732 mutex_lock(&client->mount_mutex);
733
734 /* initialize the messenger */
735 if (client->msgr == NULL) {
736 if (ceph_test_opt(client, MYIP))
737 myaddr = &client->mount_args->my_addr;
738 client->msgr = ceph_messenger_create(myaddr);
739 if (IS_ERR(client->msgr)) {
740 err = PTR_ERR(client->msgr);
741 client->msgr = NULL;
742 goto out;
743 }
744 client->msgr->nocrc = ceph_test_opt(client, NOCRC);
745 }
746
747 /* open session, and wait for mon, mds, and osd maps */
748 err = ceph_monc_open_session(&client->monc);
749 if (err < 0)
750 goto out;
751
752 while (!have_mon_map(client)) {
753 err = -EIO;
754 if (timeout && time_after_eq(jiffies, started + timeout))
755 goto out;
756
757 /* wait */
758 dout("mount waiting for mon_map\n");
759 err = wait_event_interruptible_timeout(client->auth_wq,
760 have_mon_map(client) || (client->auth_err < 0),
761 timeout);
762 if (err == -EINTR || err == -ERESTARTSYS)
763 goto out;
764 if (client->auth_err < 0) {
765 err = client->auth_err;
766 goto out;
767 }
768 }
769
770 dout("mount opening root\n");
771 root = open_root_dentry(client, "", started);
772 if (IS_ERR(root)) {
773 err = PTR_ERR(root);
774 goto out;
775 }
776 if (client->sb->s_root)
777 dput(root);
778 else
779 client->sb->s_root = root;
780
781 if (path[0] == 0) {
782 dget(root);
783 } else {
784 dout("mount opening base mountpoint\n");
785 root = open_root_dentry(client, path, started);
786 if (IS_ERR(root)) {
787 err = PTR_ERR(root);
788 dput(client->sb->s_root);
789 client->sb->s_root = NULL;
790 goto out;
791 }
792 }
793
794 mnt->mnt_root = root;
795 mnt->mnt_sb = client->sb;
796
797 client->mount_state = CEPH_MOUNT_MOUNTED;
798 dout("mount success\n");
799 err = 0;
800
801out:
802 mutex_unlock(&client->mount_mutex);
803 return err;
804}
805
806static int ceph_set_super(struct super_block *s, void *data)
807{
808 struct ceph_client *client = data;
809 int ret;
810
811 dout("set_super %p data %p\n", s, data);
812
813 s->s_flags = client->mount_args->sb_flags;
814 s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
815
816 s->s_fs_info = client;
817 client->sb = s;
818
819 s->s_op = &ceph_super_ops;
820 s->s_export_op = &ceph_export_ops;
821
822 s->s_time_gran = 1000; /* 1000 ns == 1 us */
823
 824 ret = set_anon_super(s, NULL); /* second argument is unused by set_anon_super */
825 if (ret != 0)
826 goto fail;
827
828 return ret;
829
830fail:
831 s->s_fs_info = NULL;
832 client->sb = NULL;
833 return ret;
834}
835
836/*
837 * share superblock if same fs AND options
838 */
839static int ceph_compare_super(struct super_block *sb, void *data)
840{
841 struct ceph_client *new = data;
842 struct ceph_mount_args *args = new->mount_args;
843 struct ceph_client *other = ceph_sb_to_client(sb);
844 int i;
845
846 dout("ceph_compare_super %p\n", sb);
847 if (args->flags & CEPH_OPT_FSID) {
848 if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
849 dout("fsid doesn't match\n");
850 return 0;
851 }
852 } else {
853 /* do we share (a) monitor? */
854 for (i = 0; i < new->monc.monmap->num_mon; i++)
855 if (ceph_monmap_contains(other->monc.monmap,
856 &new->monc.monmap->mon_inst[i].addr))
857 break;
858 if (i == new->monc.monmap->num_mon) {
859 dout("mon ip not part of monmap\n");
860 return 0;
861 }
862 dout("mon ip matches existing sb %p\n", sb);
863 }
864 if (args->sb_flags != other->mount_args->sb_flags) {
865 dout("flags differ\n");
866 return 0;
867 }
868 return 1;
869}
870
871/*
872 * construct our own bdi so we can control readahead, etc.
873 */
874static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
875{
876 int err;
877
878 sb->s_bdi = &client->backing_dev_info;
879
880 /* set ra_pages based on rsize mount option? */
881 if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
882 client->backing_dev_info.ra_pages =
883 (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
884 >> PAGE_SHIFT;
885 err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
886 return err;
887}
888
889static int ceph_get_sb(struct file_system_type *fs_type,
890 int flags, const char *dev_name, void *data,
891 struct vfsmount *mnt)
892{
893 struct super_block *sb;
894 struct ceph_client *client;
895 int err;
896 int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
897 const char *path = NULL;
898 struct ceph_mount_args *args;
899
900 dout("ceph_get_sb\n");
901 args = parse_mount_args(flags, data, dev_name, &path);
902 if (IS_ERR(args)) {
903 err = PTR_ERR(args);
904 goto out_final;
905 }
906
907 /* create client (which we may/may not use) */
908 client = ceph_create_client(args);
909 if (IS_ERR(client)) {
910 err = PTR_ERR(client);
911 goto out_final;
912 }
913
914 if (client->mount_args->flags & CEPH_OPT_NOSHARE)
915 compare_super = NULL;
916 sb = sget(fs_type, compare_super, ceph_set_super, client);
917 if (IS_ERR(sb)) {
918 err = PTR_ERR(sb);
919 goto out;
920 }
921
922 if (ceph_client(sb) != client) {
923 ceph_destroy_client(client);
924 client = ceph_client(sb);
925 dout("get_sb got existing client %p\n", client);
926 } else {
927 dout("get_sb using new client %p\n", client);
928 err = ceph_register_bdi(sb, client);
929 if (err < 0)
930 goto out_splat;
931 }
932
933 err = ceph_mount(client, mnt, path);
934 if (err < 0)
935 goto out_splat;
936 dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
937 mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
938 return 0;
939
940out_splat:
941 ceph_mdsc_close_sessions(&client->mdsc);
942 up_write(&sb->s_umount);
943 deactivate_super(sb);
944 goto out_final;
945
946out:
947 ceph_destroy_client(client);
948out_final:
949 dout("ceph_get_sb fail %d\n", err);
950 return err;
951}
952
953static void ceph_kill_sb(struct super_block *s)
954{
955 struct ceph_client *client = ceph_sb_to_client(s);
956 dout("kill_sb %p\n", s);
957 ceph_mdsc_pre_umount(&client->mdsc);
958 kill_anon_super(s); /* will call put_super after sb is r/o */
959 if (s->s_bdi == &client->backing_dev_info)
960 bdi_unregister(&client->backing_dev_info);
961 bdi_destroy(&client->backing_dev_info);
962 ceph_destroy_client(client);
963}
964
965static struct file_system_type ceph_fs_type = {
966 .owner = THIS_MODULE,
967 .name = "ceph",
968 .get_sb = ceph_get_sb,
969 .kill_sb = ceph_kill_sb,
970 .fs_flags = FS_RENAME_DOES_D_MOVE,
971};
972
973#define _STRINGIFY(x) #x
974#define STRINGIFY(x) _STRINGIFY(x)
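/*
 * The two-level STRINGIFY indirection matters: the outer macro lets the
 * preprocessor expand its argument before the inner # operator turns it
 * into a string literal. A small standalone demonstration:
 */
#include <stdio.h>

#define _DEMO_STR(x) #x
#define DEMO_STR(x)  _DEMO_STR(x)
#define VERSION 42

int main(void)
{
        /* prints: 42 vs VERSION */
        printf("%s vs %s\n", DEMO_STR(VERSION), _DEMO_STR(VERSION));
        return 0;
}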
975
976static int __init init_ceph(void)
977{
978 int ret = 0;
979
980 ret = ceph_debugfs_init();
981 if (ret < 0)
982 goto out;
983
984 ret = ceph_msgr_init();
985 if (ret < 0)
986 goto out_debugfs;
987
988 ret = init_caches();
989 if (ret)
990 goto out_msgr;
991
992 ceph_caps_init();
993
994 ret = register_filesystem(&ceph_fs_type);
995 if (ret)
996 goto out_icache;
997
998 pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
999 CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
1000 CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
1001 return 0;
1002
1003out_icache:
1004 destroy_caches();
1005out_msgr:
1006 ceph_msgr_exit();
1007out_debugfs:
1008 ceph_debugfs_cleanup();
1009out:
1010 return ret;
1011}
1012
1013static void __exit exit_ceph(void)
1014{
1015 dout("exit_ceph\n");
1016 unregister_filesystem(&ceph_fs_type);
1017 ceph_caps_finalize();
1018 destroy_caches();
1019 ceph_msgr_exit();
1020 ceph_debugfs_cleanup();
1021}
1022
1023module_init(init_ceph);
1024module_exit(exit_ceph);
1025
1026MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
1027MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
1028MODULE_AUTHOR("Patience Warnick <patience@newdream.net>");
1029MODULE_DESCRIPTION("Ceph filesystem for Linux");
1030MODULE_LICENSE("GPL");
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 000000000000..65d12036b670
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,901 @@
1#ifndef _FS_CEPH_SUPER_H
2#define _FS_CEPH_SUPER_H
3
4#include "ceph_debug.h"
5
6#include <asm/unaligned.h>
7#include <linux/backing-dev.h>
8#include <linux/completion.h>
9#include <linux/exportfs.h>
10#include <linux/fs.h>
11#include <linux/mempool.h>
12#include <linux/pagemap.h>
13#include <linux/wait.h>
14#include <linux/writeback.h>
15
16#include "types.h"
17#include "messenger.h"
18#include "msgpool.h"
19#include "mon_client.h"
20#include "mds_client.h"
21#include "osd_client.h"
22#include "ceph_fs.h"
23
24/* f_type in struct statfs */
25#define CEPH_SUPER_MAGIC 0x00c36400
26
27/* large granularity for statfs utilization stats to facilitate
28 * large volume sizes on 32-bit machines. */
29#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
30#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
31
32/*
33 * mount options
34 */
35#define CEPH_OPT_FSID (1<<0)
36#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
37#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
38#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
39#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
40#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
41#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
42
43#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
44
45#define ceph_set_opt(client, opt) \
46 (client)->mount_args->flags |= CEPH_OPT_##opt;
47#define ceph_test_opt(client, opt) \
48 (!!((client)->mount_args->flags & CEPH_OPT_##opt))
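/*
 * ceph_test_opt(client, NOCRC) token-pastes to a test of the
 * CEPH_OPT_NOCRC bit, and the !! normalizes the result to 0 or 1.
 * A userspace sketch of the same pattern:
 */
#include <stdio.h>

#define DEMO_OPT_NOCRC (1 << 6)
#define demo_test_opt(flags, opt) (!!((flags) & DEMO_OPT_##opt))

int main(void)
{
        int flags = DEMO_OPT_NOCRC;

        printf("%d\n", demo_test_opt(flags, NOCRC));    /* prints 1 */
        return 0;
}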
49
50
51struct ceph_mount_args {
52 int sb_flags;
53 int num_mon;
54 struct ceph_entity_addr *mon_addr;
55 int flags;
56 int mount_timeout;
57 int osd_idle_ttl;
58 int caps_wanted_delay_min, caps_wanted_delay_max;
59 struct ceph_fsid fsid;
60 struct ceph_entity_addr my_addr;
61 int wsize;
62 int rsize; /* max readahead */
63 int max_readdir; /* max readdir size */
 64 int congestion_kb; /* writeback congestion threshold, in KB */
65 int osd_timeout;
66 int osd_keepalive_timeout;
67 char *snapdir_name; /* default ".snap" */
68 char *name;
69 char *secret;
70 int cap_release_safety;
71};
72
73/*
74 * defaults
75 */
76#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
77#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
78#define CEPH_OSD_KEEPALIVE_DEFAULT 5
79#define CEPH_OSD_IDLE_TTL_DEFAULT 60
80#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
81
82#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
83#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
84
85#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
86#define CEPH_AUTH_NAME_DEFAULT "guest"
87
88/*
89 * Delay telling the MDS we no longer want caps, in case we reopen
90 * the file. Delay a minimum amount of time, even if we send a cap
 91 * message for some other reason. Otherwise, take the opportunity to
92 * update the mds to avoid sending another message later.
93 */
94#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
95#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
96
97
98/* mount state */
99enum {
100 CEPH_MOUNT_MOUNTING,
101 CEPH_MOUNT_MOUNTED,
102 CEPH_MOUNT_UNMOUNTING,
103 CEPH_MOUNT_UNMOUNTED,
104 CEPH_MOUNT_SHUTDOWN,
105};
106
107/*
108 * subtract jiffies
109 */
110static inline unsigned long time_sub(unsigned long a, unsigned long b)
111{
112 BUG_ON(time_after(b, a));
113 return (long)a - (long)b;
114}
115
116/*
117 * per-filesystem client state
118 *
119 * possibly shared by multiple mount points, if they are
120 * mounting the same ceph filesystem/cluster.
121 */
122struct ceph_client {
123 struct ceph_fsid fsid;
124 bool have_fsid;
125
126 struct mutex mount_mutex; /* serialize mount attempts */
127 struct ceph_mount_args *mount_args;
128
129 struct super_block *sb;
130
131 unsigned long mount_state;
132 wait_queue_head_t auth_wq;
133
134 int auth_err;
135
136 int min_caps; /* min caps i added */
137
138 struct ceph_messenger *msgr; /* messenger instance */
139 struct ceph_mon_client monc;
140 struct ceph_mds_client mdsc;
141 struct ceph_osd_client osdc;
142
143 /* writeback */
144 mempool_t *wb_pagevec_pool;
145 struct workqueue_struct *wb_wq;
146 struct workqueue_struct *pg_inv_wq;
147 struct workqueue_struct *trunc_wq;
148 atomic_long_t writeback_count;
149
150 struct backing_dev_info backing_dev_info;
151
152#ifdef CONFIG_DEBUG_FS
153 struct dentry *debugfs_monmap;
154 struct dentry *debugfs_mdsmap, *debugfs_osdmap;
155 struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
156 struct dentry *debugfs_congestion_kb;
157 struct dentry *debugfs_bdi;
158#endif
159};
160
161static inline struct ceph_client *ceph_client(struct super_block *sb)
162{
163 return sb->s_fs_info;
164}
165
166
167/*
168 * File i/o capability. This tracks shared state with the metadata
169 * server that allows us to cache or writeback attributes or to read
170 * and write data. For any given inode, we should have one or more
171 * capabilities, one issued by each metadata server, and our
172 * cumulative access is the OR of all issued capabilities.
173 *
174 * Each cap is referenced by the inode's i_caps rbtree and by per-mds
175 * session capability lists.
176 */
177struct ceph_cap {
178 struct ceph_inode_info *ci;
179 struct rb_node ci_node; /* per-ci cap tree */
180 struct ceph_mds_session *session;
181 struct list_head session_caps; /* per-session caplist */
182 int mds;
183 u64 cap_id; /* unique cap id (mds provided) */
184 int issued; /* latest, from the mds */
185 int implemented; /* implemented superset of issued (for revocation) */
186 int mds_wanted;
187 u32 seq, issue_seq, mseq;
188 u32 cap_gen; /* active/stale cycle */
189 unsigned long last_used;
190 struct list_head caps_item;
191};
192
193#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
194#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
195#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
196
197/*
198 * Snapped cap state that is pending flush to mds. When a snapshot occurs,
199 * we first complete any in-process sync writes and writeback any dirty
200 * data before flushing the snapped state (tracked here) back to the MDS.
201 */
202struct ceph_cap_snap {
203 atomic_t nref;
204 struct ceph_inode_info *ci;
205 struct list_head ci_item, flushing_item;
206
207 u64 follows, flush_tid;
208 int issued, dirty;
209 struct ceph_snap_context *context;
210
211 mode_t mode;
212 uid_t uid;
213 gid_t gid;
214
215 void *xattr_blob;
216 int xattr_len;
217 u64 xattr_version;
218
219 u64 size;
220 struct timespec mtime, atime, ctime;
221 u64 time_warp_seq;
222 int writing; /* a sync write is still in progress */
223 int dirty_pages; /* dirty pages awaiting writeback */
224};
225
226static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
227{
228 if (atomic_dec_and_test(&capsnap->nref))
229 kfree(capsnap);
230}
231
232/*
233 * The frag tree describes how a directory is fragmented, potentially across
234 * multiple metadata servers. It is also used to indicate points where
235 * metadata authority is delegated, and whether/where metadata is replicated.
236 *
237 * A _leaf_ frag will be present in the i_fragtree IFF there is
238 * delegation info. That is, if mds >= 0 || ndist > 0.
239 */
240#define CEPH_MAX_DIRFRAG_REP 4
241
242struct ceph_inode_frag {
243 struct rb_node node;
244
245 /* fragtree state */
246 u32 frag;
247 int split_by; /* i.e. 2^(split_by) children */
248
249 /* delegation and replication info */
250 int mds; /* -1 if same authority as parent */
251 int ndist; /* >0 if replicated */
252 int dist[CEPH_MAX_DIRFRAG_REP];
253};
254
255/*
256 * We cache inode xattrs as an encoded blob until they are first used,
257 * at which point we parse them into an rbtree.
258 */
259struct ceph_inode_xattr {
260 struct rb_node node;
261
262 const char *name;
263 int name_len;
264 const char *val;
265 int val_len;
266 int dirty;
267
268 int should_free_name;
269 int should_free_val;
270};
271
272struct ceph_inode_xattrs_info {
273 /*
274 * (still encoded) xattr blob. we avoid the overhead of parsing
275 * this until someone actually calls getxattr, etc.
276 *
277 * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
278 * NULL means we don't know.
279 */
280 struct ceph_buffer *blob, *prealloc_blob;
281
282 struct rb_root index;
283 bool dirty;
284 int count;
285 int names_size;
286 int vals_size;
287 u64 version, index_version;
288};
289
290/*
291 * Ceph inode.
292 */
293#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
294#define CEPH_I_NODELAY 4 /* do not delay cap release */
295#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
296#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
297
298struct ceph_inode_info {
299 struct ceph_vino i_vino; /* ceph ino + snap */
300
301 u64 i_version;
302 u32 i_time_warp_seq;
303
304 unsigned i_ceph_flags;
305 unsigned long i_release_count;
306
307 struct ceph_file_layout i_layout;
308 char *i_symlink;
309
310 /* for dirs */
311 struct timespec i_rctime;
312 u64 i_rbytes, i_rfiles, i_rsubdirs;
313 u64 i_files, i_subdirs;
314 u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
315
316 struct rb_root i_fragtree;
317 struct mutex i_fragtree_mutex;
318
319 struct ceph_inode_xattrs_info i_xattrs;
320
321 /* capabilities. protected _both_ by i_lock and cap->session's
322 * s_mutex. */
323 struct rb_root i_caps; /* cap list */
324 struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
325 unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
326 struct list_head i_dirty_item, i_flushing_item;
327 u64 i_cap_flush_seq;
328 /* we need to track cap writeback on a per-cap-bit basis, to allow
329 * overlapping, pipelined cap flushes to the mds. we can probably
330 * reduce the tid to 8 bits if we're concerned about inode size. */
331 u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
332 wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
333 unsigned long i_hold_caps_min; /* jiffies */
334 unsigned long i_hold_caps_max; /* jiffies */
335 struct list_head i_cap_delay_list; /* for delayed cap release to mds */
336 int i_cap_exporting_mds; /* to handle cap migration between */
337 unsigned i_cap_exporting_mseq; /* mds's. */
338 unsigned i_cap_exporting_issued;
339 struct ceph_cap_reservation i_cap_migration_resv;
340 struct list_head i_cap_snaps; /* snapped state pending flush to mds */
341 struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
342 unsigned i_snap_caps; /* cap bits for snapped files */
343
344 int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
345
346 u32 i_truncate_seq; /* last truncate to smaller size */
347 u64 i_truncate_size; /* and the size we last truncated down to */
348 int i_truncate_pending; /* still need to call vmtruncate */
349
350 u64 i_max_size; /* max file size authorized by mds */
351 u64 i_reported_size; /* (max_)size reported to or requested of mds */
 352 u64 i_wanted_max_size; /* offset we'd like to write to */
353 u64 i_requested_max_size; /* max_size we've requested */
354
355 /* held references to caps */
356 int i_pin_ref;
357 int i_rd_ref, i_rdcache_ref, i_wr_ref;
358 int i_wrbuffer_ref, i_wrbuffer_ref_head;
359 u32 i_shared_gen; /* increment each time we get FILE_SHARED */
360 u32 i_rdcache_gen; /* we increment this each time we get
361 FILE_CACHE. If it's non-zero, we
362 _may_ have cached pages. */
363 u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
364
365 struct list_head i_unsafe_writes; /* uncommitted sync writes */
366 struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
367 spinlock_t i_unsafe_lock;
368
369 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
370 int i_snap_realm_counter; /* snap realm (if caps) */
371 struct list_head i_snap_realm_item;
372 struct list_head i_snap_flush_item;
373
374 struct work_struct i_wb_work; /* writeback work */
375 struct work_struct i_pg_inv_work; /* page invalidation work */
376
377 struct work_struct i_vmtruncate_work;
378
379 struct inode vfs_inode; /* at end */
380};
381
382static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
383{
384 return container_of(inode, struct ceph_inode_info, vfs_inode);
385}
386
387static inline void ceph_i_clear(struct inode *inode, unsigned mask)
388{
389 struct ceph_inode_info *ci = ceph_inode(inode);
390
391 spin_lock(&inode->i_lock);
392 ci->i_ceph_flags &= ~mask;
393 spin_unlock(&inode->i_lock);
394}
395
396static inline void ceph_i_set(struct inode *inode, unsigned mask)
397{
398 struct ceph_inode_info *ci = ceph_inode(inode);
399
400 spin_lock(&inode->i_lock);
401 ci->i_ceph_flags |= mask;
402 spin_unlock(&inode->i_lock);
403}
404
405static inline bool ceph_i_test(struct inode *inode, unsigned mask)
406{
407 struct ceph_inode_info *ci = ceph_inode(inode);
408 bool r;
409
410 smp_mb();
411 r = (ci->i_ceph_flags & mask) == mask;
412 return r;
413}
414
415
416/* find a specific frag @f */
417extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
418 u32 f);
419
420/*
421 * choose fragment for value @v. copy frag content to pfrag, if leaf
422 * exists
423 */
424extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
425 struct ceph_inode_frag *pfrag,
426 int *found);
427
428/*
429 * Ceph dentry state
430 */
431struct ceph_dentry_info {
432 struct ceph_mds_session *lease_session;
433 u32 lease_gen, lease_shared_gen;
434 u32 lease_seq;
435 unsigned long lease_renew_after, lease_renew_from;
436 struct list_head lru;
437 struct dentry *dentry;
438 u64 time;
439 u64 offset;
440};
441
442static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
443{
444 return (struct ceph_dentry_info *)dentry->d_fsdata;
445}
446
447static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
448{
449 return ((loff_t)frag << 32) | (loff_t)off;
450}
451
452/*
453 * ino_t is <64 bits on many architectures, blech.
454 *
455 * don't include snap in ino hash, at least for now.
456 */
457static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
458{
459 ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
460#if BITS_PER_LONG == 32
461 ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
462 if (!ino)
463 ino = 1;
464#endif
465 return ino;
466}
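/*
 * Sketch of the 32-bit fold above: on a 32-bit ino_t the high word of
 * the 64-bit ino is XORed into the low word, which preserves more
 * distinctions than plain truncation would, and 0 is remapped to 1
 * since 0 is not a usable ino. Userspace illustration only:
 */
#include <stdio.h>
#include <stdint.h>

static uint32_t fold_ino(uint64_t ino64)
{
        uint32_t ino = (uint32_t)ino64;

        ino ^= (uint32_t)(ino64 >> 32); /* mix in the high 32 bits */
        if (!ino)
                ino = 1;                /* 0 is reserved */
        return ino;
}

int main(void)
{
        /* low word == high word, so the XOR cancels to 0 and we get 1 */
        printf("%#x\n", (unsigned)fold_ino(0x100000001ULL));
        return 0;
}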
467
468static inline int ceph_set_ino_cb(struct inode *inode, void *data)
469{
470 ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
471 inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
472 return 0;
473}
474
475static inline struct ceph_vino ceph_vino(struct inode *inode)
476{
477 return ceph_inode(inode)->i_vino;
478}
479
480/* for printf-style formatting */
481#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
482
483static inline u64 ceph_ino(struct inode *inode)
484{
485 return ceph_inode(inode)->i_vino.ino;
486}
487static inline u64 ceph_snap(struct inode *inode)
488{
489 return ceph_inode(inode)->i_vino.snap;
490}
491
492static inline int ceph_ino_compare(struct inode *inode, void *data)
493{
494 struct ceph_vino *pvino = (struct ceph_vino *)data;
495 struct ceph_inode_info *ci = ceph_inode(inode);
496 return ci->i_vino.ino == pvino->ino &&
497 ci->i_vino.snap == pvino->snap;
498}
499
500static inline struct inode *ceph_find_inode(struct super_block *sb,
501 struct ceph_vino vino)
502{
503 ino_t t = ceph_vino_to_ino(vino);
504 return ilookup5(sb, t, ceph_ino_compare, &vino);
505}
506
507
508/*
509 * caps helpers
510 */
511static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
512{
513 return !RB_EMPTY_ROOT(&ci->i_caps);
514}
515
516extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
517extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
518extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
519 struct ceph_cap *cap);
520
521static inline int ceph_caps_issued(struct ceph_inode_info *ci)
522{
523 int issued;
524 spin_lock(&ci->vfs_inode.i_lock);
525 issued = __ceph_caps_issued(ci, NULL);
526 spin_unlock(&ci->vfs_inode.i_lock);
527 return issued;
528}
529
530static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
531 int touch)
532{
533 int r;
534 spin_lock(&ci->vfs_inode.i_lock);
535 r = __ceph_caps_issued_mask(ci, mask, touch);
536 spin_unlock(&ci->vfs_inode.i_lock);
537 return r;
538}
539
540static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
541{
542 return ci->i_dirty_caps | ci->i_flushing_caps;
543}
544extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
545
546extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
547extern int __ceph_caps_used(struct ceph_inode_info *ci);
548
549extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
550
551/*
552 * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
553 */
554static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
555{
556 int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
557 if (w & CEPH_CAP_FILE_BUFFER)
558 w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
559 return w;
560}
561
562/* what the mds thinks we want */
563extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
564
565extern void ceph_caps_init(void);
566extern void ceph_caps_finalize(void);
567extern void ceph_adjust_min_caps(int delta);
568extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
569extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
570extern void ceph_reservation_status(struct ceph_client *client,
571 int *total, int *avail, int *used,
572 int *reserved, int *min);
573
574static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
575{
576 return (struct ceph_client *)inode->i_sb->s_fs_info;
577}
578
579static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
580{
581 return (struct ceph_client *)sb->s_fs_info;
582}
583
584
585/*
586 * we keep buffered readdir results attached to file->private_data
587 */
588struct ceph_file_info {
589 int fmode; /* initialized on open */
590
591 /* readdir: position within the dir */
592 u32 frag;
593 struct ceph_mds_request *last_readdir;
594 int at_end;
595
596 /* readdir: position within a frag */
597 unsigned offset; /* offset of last chunk, adjusted for . and .. */
598 u64 next_offset; /* offset of next chunk (last_name's + 1) */
599 char *last_name; /* last entry in previous chunk */
600 struct dentry *dentry; /* next dentry (for dcache readdir) */
601 unsigned long dir_release_count;
602
 603 /* used for -o dirstat read() on directories */
604 char *dir_info;
605 int dir_info_len;
606};
607
608
609
610/*
611 * snapshots
612 */
613
614/*
615 * A "snap context" is the set of existing snapshots when we
616 * write data. It is used by the OSD to guide its COW behavior.
617 *
618 * The ceph_snap_context is refcounted, and attached to each dirty
619 * page, indicating which context the dirty data belonged when it was
620 * dirtied.
621 */
622struct ceph_snap_context {
623 atomic_t nref;
624 u64 seq;
625 int num_snaps;
626 u64 snaps[];
627};
628
629static inline struct ceph_snap_context *
630ceph_get_snap_context(struct ceph_snap_context *sc)
631{
632 /*
633 printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
634 atomic_read(&sc->nref)+1);
635 */
636 if (sc)
637 atomic_inc(&sc->nref);
638 return sc;
639}
640
641static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
642{
643 if (!sc)
644 return;
645 /*
646 printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
647 atomic_read(&sc->nref)-1);
648 */
649 if (atomic_dec_and_test(&sc->nref)) {
650 /*printk(" deleting snap_context %p\n", sc);*/
651 kfree(sc);
652 }
653}
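/*
 * The get/put pair above is the classic atomic-refcount lifetime rule:
 * get bumps nref, put frees on the 1 -> 0 transition. The same idea
 * with C11 atomics in userspace (a sketch; the kernel side uses
 * atomic_t and kfree):
 */
#include <stdatomic.h>
#include <stdlib.h>

struct demo_snapc {
        atomic_int nref;
        /* ... snapshot payload would live here ... */
};

static struct demo_snapc *demo_get(struct demo_snapc *sc)
{
        if (sc)
                atomic_fetch_add(&sc->nref, 1);
        return sc;
}

static void demo_put(struct demo_snapc *sc)
{
        if (!sc)
                return;
        /* fetch_sub returns the prior value; 1 means we held the last ref */
        if (atomic_fetch_sub(&sc->nref, 1) == 1)
                free(sc);
}

int main(void)
{
        struct demo_snapc *sc = calloc(1, sizeof(*sc));

        if (!sc)
                return 1;
        atomic_init(&sc->nref, 1);      /* creator holds the first reference */
        demo_get(sc);                   /* a second holder */
        demo_put(sc);
        demo_put(sc);                   /* last put frees */
        return 0;
}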
654
655/*
656 * A "snap realm" describes a subset of the file hierarchy sharing
657 * the same set of snapshots that apply to it. The realms themselves
658 * are organized into a hierarchy, such that children inherit (some of)
659 * the snapshots of their parents.
660 *
661 * All inodes within the realm that have capabilities are linked into a
662 * per-realm list.
663 */
664struct ceph_snap_realm {
665 u64 ino;
666 atomic_t nref;
667 struct rb_node node;
668
669 u64 created, seq;
670 u64 parent_ino;
671 u64 parent_since; /* snapid when our current parent became so */
672
673 u64 *prior_parent_snaps; /* snaps inherited from any parents we */
674 int num_prior_parent_snaps; /* had prior to parent_since */
675 u64 *snaps; /* snaps specific to this realm */
676 int num_snaps;
677
678 struct ceph_snap_realm *parent;
679 struct list_head children; /* list of child realms */
680 struct list_head child_item;
681
682 struct list_head empty_item; /* if i have ref==0 */
683
684 /* the current set of snaps for this realm */
685 struct ceph_snap_context *cached_context;
686
687 struct list_head inodes_with_caps;
688 spinlock_t inodes_with_caps_lock;
689};
690
691
692
693/*
694 * calculate the number of pages a given length and offset map onto,
695 * if we align the data.
696 */
697static inline int calc_pages_for(u64 off, u64 len)
698{
699 return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
700 (off >> PAGE_CACHE_SHIFT);
701}
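/*
 * Worked example for calc_pages_for() above, assuming 4 KB pages: a
 * 5000-byte span at offset 1000 covers bytes 1000..5999, i.e. page
 * indices 0 and 1, so the helper yields 2. Userspace sketch:
 */
#include <stdio.h>
#include <stdint.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1ULL << DEMO_PAGE_SHIFT)

static int demo_calc_pages_for(uint64_t off, uint64_t len)
{
        return (int)(((off + len + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT) -
                     (off >> DEMO_PAGE_SHIFT));
}

int main(void)
{
        printf("%d\n", demo_calc_pages_for(1000, 5000));        /* prints 2 */
        return 0;
}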
702
703
704
705/* snap.c */
706struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
707 u64 ino);
708extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
709 struct ceph_snap_realm *realm);
710extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
711 struct ceph_snap_realm *realm);
712extern int ceph_update_snap_trace(struct ceph_mds_client *m,
713 void *p, void *e, bool deletion);
714extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
715 struct ceph_mds_session *session,
716 struct ceph_msg *msg);
717extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
718 struct ceph_snap_context *snapc);
719extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
720 struct ceph_cap_snap *capsnap);
721extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
722
723/*
724 * a cap_snap is "pending" if it is still awaiting an in-progress
725 * sync write (that may/may not still update size, mtime, etc.).
726 */
727static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
728{
729 return !list_empty(&ci->i_cap_snaps) &&
730 list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
731 ci_item)->writing;
732}
733
734
735/* super.c */
736extern struct kmem_cache *ceph_inode_cachep;
737extern struct kmem_cache *ceph_cap_cachep;
738extern struct kmem_cache *ceph_dentry_cachep;
739extern struct kmem_cache *ceph_file_cachep;
740
741extern const char *ceph_msg_type_name(int type);
742extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
743
744#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
745 "%02x%02x%02x%02x%02x%02x"
746#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
747 (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
748 (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
749 (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
750
751/* inode.c */
752extern const struct inode_operations ceph_file_iops;
753
754extern struct inode *ceph_alloc_inode(struct super_block *sb);
755extern void ceph_destroy_inode(struct inode *inode);
756
757extern struct inode *ceph_get_inode(struct super_block *sb,
758 struct ceph_vino vino);
759extern struct inode *ceph_get_snapdir(struct inode *parent);
760extern int ceph_fill_file_size(struct inode *inode, int issued,
761 u32 truncate_seq, u64 truncate_size, u64 size);
762extern void ceph_fill_file_time(struct inode *inode, int issued,
763 u64 time_warp_seq, struct timespec *ctime,
764 struct timespec *mtime, struct timespec *atime);
765extern int ceph_fill_trace(struct super_block *sb,
766 struct ceph_mds_request *req,
767 struct ceph_mds_session *session);
768extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
769 struct ceph_mds_session *session);
770
771extern int ceph_inode_holds_cap(struct inode *inode, int mask);
772
773extern int ceph_inode_set_size(struct inode *inode, loff_t size);
774extern void __ceph_do_pending_vmtruncate(struct inode *inode);
775extern void ceph_queue_vmtruncate(struct inode *inode);
776
777extern void ceph_queue_invalidate(struct inode *inode);
778extern void ceph_queue_writeback(struct inode *inode);
779
780extern int ceph_do_getattr(struct inode *inode, int mask);
781extern int ceph_permission(struct inode *inode, int mask);
782extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
783extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
784 struct kstat *stat);
785
786/* xattr.c */
787extern int ceph_setxattr(struct dentry *, const char *, const void *,
788 size_t, int);
789extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
790extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
791extern int ceph_removexattr(struct dentry *, const char *);
792extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
793extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
794
795/* caps.c */
796extern const char *ceph_cap_string(int c);
797extern void ceph_handle_caps(struct ceph_mds_session *session,
798 struct ceph_msg *msg);
799extern int ceph_add_cap(struct inode *inode,
800 struct ceph_mds_session *session, u64 cap_id,
801 int fmode, unsigned issued, unsigned wanted,
802 unsigned cap, unsigned seq, u64 realmino, int flags,
803 struct ceph_cap_reservation *caps_reservation);
804extern void __ceph_remove_cap(struct ceph_cap *cap);
805static inline void ceph_remove_cap(struct ceph_cap *cap)
806{
807 struct inode *inode = &cap->ci->vfs_inode;
808 spin_lock(&inode->i_lock);
809 __ceph_remove_cap(cap);
810 spin_unlock(&inode->i_lock);
811}
812extern void ceph_put_cap(struct ceph_cap *cap);
813
814extern void ceph_queue_caps_release(struct inode *inode);
815extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
816extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
817extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
818 struct ceph_mds_session *session);
819extern int ceph_get_cap_mds(struct inode *inode);
820extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
821extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
822extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
823 struct ceph_snap_context *snapc);
824extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
825 struct ceph_mds_session **psession);
826extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
827 struct ceph_mds_session *session);
828extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
829extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
830
831extern int ceph_encode_inode_release(void **p, struct inode *inode,
832 int mds, int drop, int unless, int force);
833extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
834 int mds, int drop, int unless);
835
836extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
837 int *got, loff_t endoff);
838
839/* for counting open files by mode */
840static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
841{
842 ci->i_nr_by_mode[mode]++;
843}
844extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
845
846/* addr.c */
847extern const struct address_space_operations ceph_aops;
848extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
849
850/* file.c */
851extern const struct file_operations ceph_file_fops;
852extern const struct address_space_operations ceph_aops;
853extern int ceph_open(struct inode *inode, struct file *file);
854extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
855 struct nameidata *nd, int mode,
856 int locked_dir);
857extern int ceph_release(struct inode *inode, struct file *filp);
858extern void ceph_release_page_vector(struct page **pages, int num_pages);
859
860/* dir.c */
861extern const struct file_operations ceph_dir_fops;
862extern const struct inode_operations ceph_dir_iops;
863extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
864 ceph_snapdir_dentry_ops;
865
866extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
867extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
868 struct dentry *dentry, int err);
869
870extern void ceph_dentry_lru_add(struct dentry *dn);
871extern void ceph_dentry_lru_touch(struct dentry *dn);
872extern void ceph_dentry_lru_del(struct dentry *dn);
873
874/*
875 * our d_ops vary depending on whether the inode is live,
876 * snapshotted (read-only), or a virtual ".snap" directory.
877 */
878int ceph_init_dentry(struct dentry *dentry);
879
880
881/* ioctl.c */
882extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
883
884/* export.c */
885extern const struct export_operations ceph_export_ops;
886
887/* debugfs.c */
888extern int ceph_debugfs_init(void);
889extern void ceph_debugfs_cleanup(void);
890extern int ceph_debugfs_client_init(struct ceph_client *client);
891extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
892
893static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
894{
895 if (dentry && dentry->d_parent)
896 return dentry->d_parent->d_inode;
897
898 return NULL;
899}
900
901#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 000000000000..28b35a005ec2
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
1#ifndef _FS_CEPH_TYPES_H
2#define _FS_CEPH_TYPES_H
3
4/* needed before including ceph_fs.h */
5#include <linux/in.h>
6#include <linux/types.h>
7#include <linux/fcntl.h>
8#include <linux/string.h>
9
10#include "ceph_fs.h"
11#include "ceph_frag.h"
12#include "ceph_hash.h"
13
14/*
15 * Identify inodes by both their ino AND snapshot id (a u64).
16 */
17struct ceph_vino {
18 u64 ino;
19 u64 snap;
20};
21
22
23/* context for the caps reservation mechanism */
24struct ceph_cap_reservation {
25 int count;
26};
27
28
29#endif
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
new file mode 100644
index 000000000000..37d6ce645691
--- /dev/null
+++ b/fs/ceph/xattr.c
@@ -0,0 +1,844 @@
1#include "ceph_debug.h"
2#include "super.h"
3#include "decode.h"
4
5#include <linux/xattr.h>
6
7static bool ceph_is_valid_xattr(const char *name)
8{
9 return !strncmp(name, XATTR_SECURITY_PREFIX,
10 XATTR_SECURITY_PREFIX_LEN) ||
11 !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
12 !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
13}
14
15/*
16 * These define virtual xattrs exposing the recursive directory
17 * statistics and layout metadata.
18 */
19struct ceph_vxattr_cb {
20 bool readonly;
21 char *name;
22 size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
23 size_t size);
24};
25
26/* directories */
27
28static size_t ceph_vxattrcb_entries(struct ceph_inode_info *ci, char *val,
29 size_t size)
30{
31 return snprintf(val, size, "%lld", ci->i_files + ci->i_subdirs);
32}
33
34static size_t ceph_vxattrcb_files(struct ceph_inode_info *ci, char *val,
35 size_t size)
36{
37 return snprintf(val, size, "%lld", ci->i_files);
38}
39
40static size_t ceph_vxattrcb_subdirs(struct ceph_inode_info *ci, char *val,
41 size_t size)
42{
43 return snprintf(val, size, "%lld", ci->i_subdirs);
44}
45
46static size_t ceph_vxattrcb_rentries(struct ceph_inode_info *ci, char *val,
47 size_t size)
48{
49 return snprintf(val, size, "%lld", ci->i_rfiles + ci->i_rsubdirs);
50}
51
52static size_t ceph_vxattrcb_rfiles(struct ceph_inode_info *ci, char *val,
53 size_t size)
54{
55 return snprintf(val, size, "%lld", ci->i_rfiles);
56}
57
58static size_t ceph_vxattrcb_rsubdirs(struct ceph_inode_info *ci, char *val,
59 size_t size)
60{
61 return snprintf(val, size, "%lld", ci->i_rsubdirs);
62}
63
64static size_t ceph_vxattrcb_rbytes(struct ceph_inode_info *ci, char *val,
65 size_t size)
66{
67 return snprintf(val, size, "%lld", ci->i_rbytes);
68}
69
70static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
71 size_t size)
72{
73 return snprintf(val, size, "%ld.%ld", (long)ci->i_rctime.tv_sec,
74 (long)ci->i_rctime.tv_nsec);
75}
76
77static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
78 { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
79 { true, "user.ceph.dir.files", ceph_vxattrcb_files},
80 { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
81 { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
82 { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
83 { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
84 { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
85 { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
86 { true, NULL, NULL }
87};
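/*
 * The virtual xattrs above are read with the ordinary getxattr(2)
 * syscall; no ceph-specific interface is needed on the caller's side.
 * A sketch (the mount point path is hypothetical):
 */
#include <stdio.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(void)
{
        char buf[64];
        ssize_t n = getxattr("/mnt/ceph/somedir", "user.ceph.dir.rbytes",
                             buf, sizeof(buf) - 1);

        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        buf[n] = '\0';
        printf("recursive bytes: %s\n", buf);
        return 0;
}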
88
89/* files */
90
91static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
92 size_t size)
93{
94 int ret;
95
96 ret = snprintf(val, size,
97 "chunk_bytes=%lld\nstripe_count=%lld\nobject_size=%lld\n",
98 (unsigned long long)ceph_file_layout_su(ci->i_layout),
99 (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
100 (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
101 if (ceph_file_layout_pg_preferred(ci->i_layout))
 102 ret += snprintf(val + ret, size - ret, "preferred_osd=%llu\n",
103 (unsigned long long)ceph_file_layout_pg_preferred(
104 ci->i_layout));
105 return ret;
106}
107
108static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
109 { true, "user.ceph.layout", ceph_vxattrcb_layout},
 110 { true, NULL, NULL }
111};
112
113static struct ceph_vxattr_cb *ceph_inode_vxattrs(struct inode *inode)
114{
115 if (S_ISDIR(inode->i_mode))
116 return ceph_dir_vxattrs;
117 else if (S_ISREG(inode->i_mode))
118 return ceph_file_vxattrs;
119 return NULL;
120}
121
122static struct ceph_vxattr_cb *ceph_match_vxattr(struct ceph_vxattr_cb *vxattr,
123 const char *name)
124{
125 do {
126 if (strcmp(vxattr->name, name) == 0)
127 return vxattr;
128 vxattr++;
129 } while (vxattr->name);
130 return NULL;
131}
132
133static int __set_xattr(struct ceph_inode_info *ci,
134 const char *name, int name_len,
135 const char *val, int val_len,
136 int dirty,
137 int should_free_name, int should_free_val,
138 struct ceph_inode_xattr **newxattr)
139{
140 struct rb_node **p;
141 struct rb_node *parent = NULL;
142 struct ceph_inode_xattr *xattr = NULL;
143 int c;
144 int new = 0;
145
146 p = &ci->i_xattrs.index.rb_node;
147 while (*p) {
148 parent = *p;
149 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
150 c = strncmp(name, xattr->name, min(name_len, xattr->name_len));
151 if (c < 0)
152 p = &(*p)->rb_left;
153 else if (c > 0)
154 p = &(*p)->rb_right;
155 else {
156 if (name_len == xattr->name_len)
157 break;
158 else if (name_len < xattr->name_len)
159 p = &(*p)->rb_left;
160 else
161 p = &(*p)->rb_right;
162 }
163 xattr = NULL;
164 }
165
166 if (!xattr) {
167 new = 1;
168 xattr = *newxattr;
169 xattr->name = name;
170 xattr->name_len = name_len;
171 xattr->should_free_name = should_free_name;
172
173 ci->i_xattrs.count++;
174 dout("__set_xattr count=%d\n", ci->i_xattrs.count);
175 } else {
176 kfree(*newxattr);
177 *newxattr = NULL;
178 if (xattr->should_free_val)
179 kfree((void *)xattr->val);
180
181 if (should_free_name) {
182 kfree((void *)name);
183 name = xattr->name;
184 }
185 ci->i_xattrs.names_size -= xattr->name_len;
186 ci->i_xattrs.vals_size -= xattr->val_len;
187 }
188 if (!xattr) {
189 pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
190 &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
191 xattr->val);
192 return -ENOMEM;
193 }
194 ci->i_xattrs.names_size += name_len;
195 ci->i_xattrs.vals_size += val_len;
196 if (val)
197 xattr->val = val;
198 else
199 xattr->val = "";
200
201 xattr->val_len = val_len;
202 xattr->dirty = dirty;
203 xattr->should_free_val = (val && should_free_val);
204
205 if (new) {
206 rb_link_node(&xattr->node, parent, p);
207 rb_insert_color(&xattr->node, &ci->i_xattrs.index);
208 dout("__set_xattr_val p=%p\n", p);
209 }
210
211 dout("__set_xattr_val added %llx.%llx xattr %p %s=%.*s\n",
212 ceph_vinop(&ci->vfs_inode), xattr, name, val_len, val);
213
214 return 0;
215}
216
217static struct ceph_inode_xattr *__get_xattr(struct ceph_inode_info *ci,
218 const char *name)
219{
220 struct rb_node **p;
221 struct rb_node *parent = NULL;
222 struct ceph_inode_xattr *xattr = NULL;
223 int c;
224
225 p = &ci->i_xattrs.index.rb_node;
226 while (*p) {
227 parent = *p;
228 xattr = rb_entry(parent, struct ceph_inode_xattr, node);
229 c = strncmp(name, xattr->name, xattr->name_len);
230 if (c < 0)
231 p = &(*p)->rb_left;
232 else if (c > 0)
233 p = &(*p)->rb_right;
234 else {
235 dout("__get_xattr %s: found %.*s\n", name,
236 xattr->val_len, xattr->val);
237 return xattr;
238 }
239 }
240
241 dout("__get_xattr %s: not found\n", name);
242
243 return NULL;
244}
245
246static void __free_xattr(struct ceph_inode_xattr *xattr)
247{
248 BUG_ON(!xattr);
249
250 if (xattr->should_free_name)
251 kfree((void *)xattr->name);
252 if (xattr->should_free_val)
253 kfree((void *)xattr->val);
254
255 kfree(xattr);
256}
257
258static int __remove_xattr(struct ceph_inode_info *ci,
259 struct ceph_inode_xattr *xattr)
260{
261 if (!xattr)
262 return -EOPNOTSUPP;
263
264 rb_erase(&xattr->node, &ci->i_xattrs.index);
265
266 if (xattr->should_free_name)
267 kfree((void *)xattr->name);
268 if (xattr->should_free_val)
269 kfree((void *)xattr->val);
270
271 ci->i_xattrs.names_size -= xattr->name_len;
272 ci->i_xattrs.vals_size -= xattr->val_len;
273 ci->i_xattrs.count--;
274 kfree(xattr);
275
276 return 0;
277}
278
279static int __remove_xattr_by_name(struct ceph_inode_info *ci,
280 const char *name)
281{
 283 struct ceph_inode_xattr *xattr;
 284 int err;
 285
287 xattr = __get_xattr(ci, name);
288 err = __remove_xattr(ci, xattr);
289 return err;
290}
291
292static char *__copy_xattr_names(struct ceph_inode_info *ci,
293 char *dest)
294{
295 struct rb_node *p;
296 struct ceph_inode_xattr *xattr = NULL;
297
298 p = rb_first(&ci->i_xattrs.index);
299 dout("__copy_xattr_names count=%d\n", ci->i_xattrs.count);
300
301 while (p) {
302 xattr = rb_entry(p, struct ceph_inode_xattr, node);
303 memcpy(dest, xattr->name, xattr->name_len);
304 dest[xattr->name_len] = '\0';
305
306 dout("dest=%s %p (%s) (%d/%d)\n", dest, xattr, xattr->name,
307 xattr->name_len, ci->i_xattrs.names_size);
308
309 dest += xattr->name_len + 1;
310 p = rb_next(p);
311 }
312
313 return dest;
314}
315
316void __ceph_destroy_xattrs(struct ceph_inode_info *ci)
317{
318 struct rb_node *p, *tmp;
319 struct ceph_inode_xattr *xattr = NULL;
320
321 p = rb_first(&ci->i_xattrs.index);
322
323 dout("__ceph_destroy_xattrs p=%p\n", p);
324
325 while (p) {
326 xattr = rb_entry(p, struct ceph_inode_xattr, node);
327 tmp = p;
328 p = rb_next(tmp);
329 dout("__ceph_destroy_xattrs next p=%p (%.*s)\n", p,
330 xattr->name_len, xattr->name);
331 rb_erase(tmp, &ci->i_xattrs.index);
332
333 __free_xattr(xattr);
334 }
335
336 ci->i_xattrs.names_size = 0;
337 ci->i_xattrs.vals_size = 0;
338 ci->i_xattrs.index_version = 0;
339 ci->i_xattrs.count = 0;
340 ci->i_xattrs.index = RB_ROOT;
341}
342
343static int __build_xattrs(struct inode *inode)
344{
345 u32 namelen;
346 u32 numattr = 0;
347 void *p, *end;
348 u32 len;
349 const char *name, *val;
350 struct ceph_inode_info *ci = ceph_inode(inode);
351 int xattr_version;
352 struct ceph_inode_xattr **xattrs = NULL;
353 int err = 0;
354 int i;
355
356 dout("__build_xattrs() len=%d\n",
357 ci->i_xattrs.blob ? (int)ci->i_xattrs.blob->vec.iov_len : 0);
358
359 if (ci->i_xattrs.index_version >= ci->i_xattrs.version)
360 return 0; /* already built */
361
362 __ceph_destroy_xattrs(ci);
363
364start:
365 /* update the internal xattr rb tree */
366 if (ci->i_xattrs.blob && ci->i_xattrs.blob->vec.iov_len > 4) {
367 p = ci->i_xattrs.blob->vec.iov_base;
368 end = p + ci->i_xattrs.blob->vec.iov_len;
369 ceph_decode_32_safe(&p, end, numattr, bad);
370 xattr_version = ci->i_xattrs.version;
371 spin_unlock(&inode->i_lock);
372
373 xattrs = kcalloc(numattr, sizeof(struct ceph_inode_xattr *),
374 GFP_NOFS);
375 err = -ENOMEM;
376 if (!xattrs)
377 goto bad_lock;
379 for (i = 0; i < numattr; i++) {
380 xattrs[i] = kmalloc(sizeof(struct ceph_inode_xattr),
381 GFP_NOFS);
382 if (!xattrs[i])
383 goto bad_lock;
384 }
385
386 spin_lock(&inode->i_lock);
387 if (ci->i_xattrs.version != xattr_version) {
388 /* lost a race, retry */
389 for (i = 0; i < numattr; i++)
390 kfree(xattrs[i]);
391 kfree(xattrs);
392 goto start;
393 }
394 err = -EIO;
395 while (numattr--) {
396 ceph_decode_32_safe(&p, end, len, bad);
397 namelen = len;
398 name = p;
399 p += len;
400 ceph_decode_32_safe(&p, end, len, bad);
401 val = p;
402 p += len;
403
404 err = __set_xattr(ci, name, namelen, val, len,
405 0, 0, 0, &xattrs[numattr]);
406
407 if (err < 0)
408 goto bad;
409 }
410 kfree(xattrs);
411 }
412 ci->i_xattrs.index_version = ci->i_xattrs.version;
413 ci->i_xattrs.dirty = false;
414
415 return err;
416bad_lock:
417 spin_lock(&inode->i_lock);
418bad:
419 if (xattrs) {
420 for (i = 0; i < numattr; i++)
421 kfree(xattrs[i]);
422 kfree(xattrs);
423 }
424 ci->i_xattrs.names_size = 0;
425 return err;
426}
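
__build_xattrs() shows a common kernel pattern: record a version under the lock, drop the lock for GFP_NOFS allocations, then retake it and retry if the version moved. A stripped-down sketch of that shape (names are illustrative, not from this patch):

    /* allocate a buffer that is still valid against *version;
     * returns with *lock held on success (sketch only) */
    static void *alloc_against_version(spinlock_t *lock, u64 *version,
                                       size_t size)
    {
            void *buf;
            u64 ver;
    retry:
            spin_lock(lock);
            ver = *version;
            spin_unlock(lock);

            buf = kmalloc(size, GFP_NOFS);  /* may sleep: lock dropped */
            if (!buf)
                    return NULL;

            spin_lock(lock);
            if (*version != ver) {          /* raced with an update */
                    spin_unlock(lock);
                    kfree(buf);
                    goto retry;
            }
            return buf;                     /* still locked; caller unlocks */
    }
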
427
428static int __get_required_blob_size(struct ceph_inode_info *ci, int name_size,
429 int val_size)
430{
431 /*
432 * 4 bytes for the xattr count, plus, per xattr, 4 bytes for the
433 * name length and 4 bytes for the value length
434 */
435 int size = 4 + ci->i_xattrs.count*(4 + 4) +
436 ci->i_xattrs.names_size +
437 ci->i_xattrs.vals_size;
438 dout("__get_required_blob_size c=%d names.size=%d vals.size=%d\n",
439 ci->i_xattrs.count, ci->i_xattrs.names_size,
440 ci->i_xattrs.vals_size);
441
442 if (name_size)
443 size += 4 + 4 + name_size + val_size;
444
445 return size;
446}
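
A quick worked example of the computation above, assuming two existing xattrs whose names total 10 bytes and values total 20 bytes:

    /*
     *   size = 4 + 2*(4 + 4) + 10 + 20 = 50 bytes
     *
     * reserving room for one more xattr (name_size=6, val_size=8):
     *
     *   size = 50 + 4 + 4 + 6 + 8 = 72 bytes
     */
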
447
448/*
449 * If there are dirty xattrs, reencode xattrs into the prealloc_blob
450 * and swap into place.
451 */
452void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
453{
454 struct rb_node *p;
455 struct ceph_inode_xattr *xattr = NULL;
456 void *dest;
457
458 dout("__build_xattrs_blob %p\n", &ci->vfs_inode);
459 if (ci->i_xattrs.dirty) {
460 int need = __get_required_blob_size(ci, 0, 0);
461
462 BUG_ON(need > ci->i_xattrs.prealloc_blob->alloc_len);
463
464 p = rb_first(&ci->i_xattrs.index);
465 dest = ci->i_xattrs.prealloc_blob->vec.iov_base;
466
467 ceph_encode_32(&dest, ci->i_xattrs.count);
468 while (p) {
469 xattr = rb_entry(p, struct ceph_inode_xattr, node);
470
471 ceph_encode_32(&dest, xattr->name_len);
472 memcpy(dest, xattr->name, xattr->name_len);
473 dest += xattr->name_len;
474 ceph_encode_32(&dest, xattr->val_len);
475 memcpy(dest, xattr->val, xattr->val_len);
476 dest += xattr->val_len;
477
478 p = rb_next(p);
479 }
480
481 /* adjust buffer len; it may be larger than we need */
482 ci->i_xattrs.prealloc_blob->vec.iov_len =
483 dest - ci->i_xattrs.prealloc_blob->vec.iov_base;
484
485 if (ci->i_xattrs.blob)
486 ceph_buffer_put(ci->i_xattrs.blob);
487 ci->i_xattrs.blob = ci->i_xattrs.prealloc_blob;
488 ci->i_xattrs.prealloc_blob = NULL;
489 ci->i_xattrs.dirty = false;
490 }
491}
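
The blob written here is a little-endian, length-prefixed stream: a u32 count, then for each xattr a u32 name length, the name bytes, a u32 value length, and the value bytes; __build_xattrs() decodes the same layout. A hedged plain-C walker (no bounds checking, little-endian host assumed):

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void walk_xattr_blob(const uint8_t *p)
    {
            uint32_t count, nlen, vlen;

            memcpy(&count, p, 4); p += 4;           /* u32 xattr count */
            while (count--) {
                    memcpy(&nlen, p, 4); p += 4;    /* u32 name length */
                    printf("%.*s=", (int)nlen, (const char *)p);
                    p += nlen;
                    memcpy(&vlen, p, 4); p += 4;    /* u32 value length */
                    printf("%.*s\n", (int)vlen, (const char *)p);
                    p += vlen;
            }
    }
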
492
493ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
494 size_t size)
495{
496 struct inode *inode = dentry->d_inode;
497 struct ceph_inode_info *ci = ceph_inode(inode);
498 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
499 int err;
500 struct ceph_inode_xattr *xattr;
501 struct ceph_vxattr_cb *vxattr = NULL;
502
503 if (!ceph_is_valid_xattr(name))
504 return -ENODATA;
505
506 /* let's see if a virtual xattr was requested */
507 if (vxattrs)
508 vxattr = ceph_match_vxattr(vxattrs, name);
509
510 spin_lock(&inode->i_lock);
511 dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
512 ci->i_xattrs.version, ci->i_xattrs.index_version);
513
514 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
515 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
516 goto get_xattr;
517 } else {
518 spin_unlock(&inode->i_lock);
519 /* get xattrs from mds (if we don't already have them) */
520 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
521 if (err)
522 return err;
523 }
524
525 spin_lock(&inode->i_lock);
526
527 if (vxattr && vxattr->readonly) {
528 err = vxattr->getxattr_cb(ci, value, size);
529 goto out;
530 }
531
532 err = __build_xattrs(inode);
533 if (err < 0)
534 goto out;
535
536get_xattr:
537 err = -ENODATA; /* == ENOATTR */
538 xattr = __get_xattr(ci, name);
539 if (!xattr) {
540 if (vxattr)
541 err = vxattr->getxattr_cb(ci, value, size);
542 goto out;
543 }
544
545 err = -ERANGE;
546 if (size && size < xattr->val_len)
547 goto out;
548
549 err = xattr->val_len;
550 if (size == 0)
551 goto out;
552
553 memcpy(value, xattr->val, xattr->val_len);
554
555out:
556 spin_unlock(&inode->i_lock);
557 return err;
558}
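
The size == 0 and -ERANGE handling above implements the standard two-call sizing protocol of getxattr(2): probe with a zero-length buffer, allocate, then fetch. A hedged userspace sketch:

    #include <stdlib.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    /* fetch an xattr value, sizing the buffer with a probe call */
    static char *read_xattr(const char *path, const char *name,
                            ssize_t *lenp)
    {
            ssize_t len = getxattr(path, name, NULL, 0);    /* probe */
            char *val;

            if (len < 0)
                    return NULL;
            val = malloc(len);
            if (val && getxattr(path, name, val, len) != len) {
                    free(val);              /* value changed or error */
                    val = NULL;
            }
            if (val)
                    *lenp = len;
            return val;
    }
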
559
560ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
561{
562 struct inode *inode = dentry->d_inode;
563 struct ceph_inode_info *ci = ceph_inode(inode);
564 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
565 u32 vir_namelen = 0;
566 u32 namelen;
567 int err;
568 u32 len;
569 int i;
570
571 spin_lock(&inode->i_lock);
572 dout("listxattr %p ver=%lld index_ver=%lld\n", inode,
573 ci->i_xattrs.version, ci->i_xattrs.index_version);
574
575 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
576 (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
577 goto list_xattr;
578 } else {
579 spin_unlock(&inode->i_lock);
580 err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR);
581 if (err)
582 return err;
583 }
584
585 spin_lock(&inode->i_lock);
586
587 err = __build_xattrs(inode);
588 if (err < 0)
589 goto out;
590
591list_xattr:
592 vir_namelen = 0;
593 /* include virtual dir xattrs */
594 if (vxattrs)
595 for (i = 0; vxattrs[i].name; i++)
596 vir_namelen += strlen(vxattrs[i].name) + 1;
597 /* add 1 byte per name for its null terminator */
598 namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count;
599 err = -ERANGE;
600 if (size && namelen > size)
601 goto out;
602
603 err = namelen;
604 if (size == 0)
605 goto out;
606
607 names = __copy_xattr_names(ci, names);
608
609 /* virtual xattr names, too */
610 if (vxattrs)
611 for (i = 0; vxattrs[i].name; i++) {
612 len = sprintf(names, "%s", vxattrs[i].name);
613 names += len + 1;
614 }
615
616out:
617 spin_unlock(&inode->i_lock);
618 return err;
619}
620
621static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
622 const char *value, size_t size, int flags)
623{
624 struct ceph_client *client = ceph_client(dentry->d_sb);
625 struct inode *inode = dentry->d_inode;
626 struct ceph_inode_info *ci = ceph_inode(inode);
627 struct inode *parent_inode = dentry->d_parent->d_inode;
628 struct ceph_mds_request *req;
629 struct ceph_mds_client *mdsc = &client->mdsc;
630 int err;
631 int i, nr_pages;
632 struct page **pages = NULL;
633 void *kaddr;
634
635 /* copy value into some pages */
636 nr_pages = calc_pages_for(0, size);
637 if (nr_pages) {
638 pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS);
639 if (!pages)
640 return -ENOMEM;
641 err = -ENOMEM;
642 for (i = 0; i < nr_pages; i++) {
643 pages[i] = alloc_page(GFP_NOFS);
644 if (!pages[i]) {
645 nr_pages = i;
646 goto out;
647 }
648 kaddr = kmap(pages[i]);
649 memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
650 min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
651 }
652 }
653
654 dout("setxattr value=%.*s\n", (int)size, value);
655
656 /* do request */
657 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR,
658 USE_AUTH_MDS);
659 if (IS_ERR(req)) {
660 err = PTR_ERR(req);
661 goto out;
662 }
663 req->r_inode = igrab(inode);
664 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
665 req->r_num_caps = 1;
666 req->r_args.setxattr.flags = cpu_to_le32(flags);
667 req->r_path2 = kstrdup(name, GFP_NOFS);
668
669 req->r_pages = pages;
670 req->r_num_pages = nr_pages;
671 req->r_data_len = size;
672
673 dout("xattr.ver (before): %lld\n", ci->i_xattrs.version);
674 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
675 ceph_mdsc_put_request(req);
676 dout("xattr.ver (after): %lld\n", ci->i_xattrs.version);
677
678out:
679 if (pages) {
680 for (i = 0; i < nr_pages; i++)
681 __free_page(pages[i]);
682 kfree(pages);
683 }
684 return err;
685}
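
Note that the copy loop above calls kmap() on each page without a matching kunmap(), which leaks kmap slots on highmem configurations. A corrected sketch of the loop (same logic, with the mapping released):

            for (i = 0; i < nr_pages; i++) {
                    pages[i] = alloc_page(GFP_NOFS);
                    if (!pages[i]) {
                            nr_pages = i;
                            goto out;
                    }
                    kaddr = kmap(pages[i]);
                    memcpy(kaddr, value + i*PAGE_CACHE_SIZE,
                           min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE));
                    kunmap(pages[i]);       /* pair every kmap() */
            }
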
686
687int ceph_setxattr(struct dentry *dentry, const char *name,
688 const void *value, size_t size, int flags)
689{
690 struct inode *inode = dentry->d_inode;
691 struct ceph_inode_info *ci = ceph_inode(inode);
692 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
693 int err;
694 int name_len = strlen(name);
695 int val_len = size;
696 char *newname = NULL;
697 char *newval = NULL;
698 struct ceph_inode_xattr *xattr = NULL;
699 int issued;
700 int required_blob_size;
701
702 if (ceph_snap(inode) != CEPH_NOSNAP)
703 return -EROFS;
704
705 if (!ceph_is_valid_xattr(name))
706 return -EOPNOTSUPP;
707
708 if (vxattrs) {
709 struct ceph_vxattr_cb *vxattr =
710 ceph_match_vxattr(vxattrs, name);
711 if (vxattr && vxattr->readonly)
712 return -EOPNOTSUPP;
713 }
714
715 /* preallocate memory for xattr name, value, index node */
716 err = -ENOMEM;
717 newname = kmalloc(name_len + 1, GFP_NOFS);
718 if (!newname)
719 goto out;
720 memcpy(newname, name, name_len + 1);
721
722 if (val_len) {
723 newval = kmalloc(val_len + 1, GFP_NOFS);
724 if (!newval)
725 goto out;
726 memcpy(newval, value, val_len);
727 newval[val_len] = '\0';
728 }
729
730 xattr = kmalloc(sizeof(struct ceph_inode_xattr), GFP_NOFS);
731 if (!xattr)
732 goto out;
733
734 spin_lock(&inode->i_lock);
735retry:
736 issued = __ceph_caps_issued(ci, NULL);
737 if (!(issued & CEPH_CAP_XATTR_EXCL))
738 goto do_sync;
739 __build_xattrs(inode);
740
741 required_blob_size = __get_required_blob_size(ci, name_len, val_len);
742
743 if (!ci->i_xattrs.prealloc_blob ||
744 required_blob_size > ci->i_xattrs.prealloc_blob->alloc_len) {
745 struct ceph_buffer *blob = NULL;
746
747 spin_unlock(&inode->i_lock);
748 dout(" preallocating new blob size=%d\n", required_blob_size);
749 blob = ceph_buffer_new(required_blob_size, GFP_NOFS);
750 if (!blob)
751 goto out;
752 spin_lock(&inode->i_lock);
753 if (ci->i_xattrs.prealloc_blob)
754 ceph_buffer_put(ci->i_xattrs.prealloc_blob);
755 ci->i_xattrs.prealloc_blob = blob;
756 goto retry;
757 }
758
759 dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued));
760 err = __set_xattr(ci, newname, name_len, newval,
761 val_len, 1, 1, 1, &xattr);
762 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
763 ci->i_xattrs.dirty = true;
764 inode->i_ctime = CURRENT_TIME;
765 spin_unlock(&inode->i_lock);
766
767 return err;
768
769do_sync:
770 spin_unlock(&inode->i_lock);
771 err = ceph_sync_setxattr(dentry, name, value, size, flags);
772out:
773 kfree(newname);
774 kfree(newval);
775 kfree(xattr);
776 return err;
777}
778
779static int ceph_send_removexattr(struct dentry *dentry, const char *name)
780{
781 struct ceph_client *client = ceph_client(dentry->d_sb);
782 struct ceph_mds_client *mdsc = &client->mdsc;
783 struct inode *inode = dentry->d_inode;
784 struct inode *parent_inode = dentry->d_parent->d_inode;
785 struct ceph_mds_request *req;
786 int err;
787
788 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RMXATTR,
789 USE_AUTH_MDS);
790 if (IS_ERR(req))
791 return PTR_ERR(req);
792 req->r_inode = igrab(inode);
793 req->r_inode_drop = CEPH_CAP_XATTR_SHARED;
794 req->r_num_caps = 1;
795 req->r_path2 = kstrdup(name, GFP_NOFS);
796
797 err = ceph_mdsc_do_request(mdsc, parent_inode, req);
798 ceph_mdsc_put_request(req);
799 return err;
800}
801
802int ceph_removexattr(struct dentry *dentry, const char *name)
803{
804 struct inode *inode = dentry->d_inode;
805 struct ceph_inode_info *ci = ceph_inode(inode);
806 struct ceph_vxattr_cb *vxattrs = ceph_inode_vxattrs(inode);
807 int issued;
808 int err;
809
810 if (ceph_snap(inode) != CEPH_NOSNAP)
811 return -EROFS;
812
813 if (!ceph_is_valid_xattr(name))
814 return -EOPNOTSUPP;
815
816 if (vxattrs) {
817 struct ceph_vxattr_cb *vxattr =
818 ceph_match_vxattr(vxattrs, name);
819 if (vxattr && vxattr->readonly)
820 return -EOPNOTSUPP;
821 }
822
823 spin_lock(&inode->i_lock);
824 __build_xattrs(inode);
825 issued = __ceph_caps_issued(ci, NULL);
826 dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued));
827
828 if (!(issued & CEPH_CAP_XATTR_EXCL))
829 goto do_sync;
830
831 err = __remove_xattr_by_name(ceph_inode(inode), name);
832 __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL);
833 ci->i_xattrs.dirty = true;
834 inode->i_ctime = CURRENT_TIME;
835
836 spin_unlock(&inode->i_lock);
837
838 return err;
839do_sync:
840 spin_unlock(&inode->i_lock);
841 err = ceph_send_removexattr(dentry, name);
842 return err;
843}
844
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 8c6a03627176..5183bc2a1916 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -312,6 +312,7 @@ cifs_alloc_inode(struct super_block *sb)
312 cifs_inode->clientCanCacheRead = false; 312 cifs_inode->clientCanCacheRead = false;
313 cifs_inode->clientCanCacheAll = false; 313 cifs_inode->clientCanCacheAll = false;
314 cifs_inode->delete_pending = false; 314 cifs_inode->delete_pending = false;
315 cifs_inode->invalid_mapping = false;
315 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ 316 cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
316 cifs_inode->server_eof = 0; 317 cifs_inode->server_eof = 0;
317 318
@@ -638,7 +639,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int origin)
638 setting the revalidate time to zero */ 639 setting the revalidate time to zero */
639 CIFS_I(file->f_path.dentry->d_inode)->time = 0; 640 CIFS_I(file->f_path.dentry->d_inode)->time = 0;
640 641
641 retval = cifs_revalidate(file->f_path.dentry); 642 retval = cifs_revalidate_file(file);
642 if (retval < 0) 643 if (retval < 0)
643 return (loff_t)retval; 644 return (loff_t)retval;
644 } 645 }
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 78c1b86d55f6..7aa57ecdc437 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -61,7 +61,8 @@ extern int cifs_mkdir(struct inode *, struct dentry *, int);
61extern int cifs_rmdir(struct inode *, struct dentry *); 61extern int cifs_rmdir(struct inode *, struct dentry *);
62extern int cifs_rename(struct inode *, struct dentry *, struct inode *, 62extern int cifs_rename(struct inode *, struct dentry *, struct inode *,
63 struct dentry *); 63 struct dentry *);
64extern int cifs_revalidate(struct dentry *); 64extern int cifs_revalidate_file(struct file *filp);
65extern int cifs_revalidate_dentry(struct dentry *);
65extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *); 66extern int cifs_getattr(struct vfsmount *, struct dentry *, struct kstat *);
66extern int cifs_setattr(struct dentry *, struct iattr *); 67extern int cifs_setattr(struct dentry *, struct iattr *);
67 68
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index a1c817eb291a..63c89d1d70b5 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -389,6 +389,7 @@ struct cifsInodeInfo {
389 bool clientCanCacheRead:1; /* read oplock */ 389 bool clientCanCacheRead:1; /* read oplock */
390 bool clientCanCacheAll:1; /* read and writebehind oplock */ 390 bool clientCanCacheAll:1; /* read and writebehind oplock */
391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */ 391 bool delete_pending:1; /* DELETE_ON_CLOSE is set */
392 bool invalid_mapping:1; /* pagecache is invalid */
392 u64 server_eof; /* current file size on server */ 393 u64 server_eof; /* current file size on server */
393 u64 uniqueid; /* server inode number */ 394 u64 uniqueid; /* server inode number */
394 struct inode vfs_inode; 395 struct inode vfs_inode;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 88e2bc44ac58..39e47f46dea5 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -104,10 +104,12 @@ extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr);
104extern struct inode *cifs_iget(struct super_block *sb, 104extern struct inode *cifs_iget(struct super_block *sb,
105 struct cifs_fattr *fattr); 105 struct cifs_fattr *fattr);
106 106
107extern int cifs_get_file_info(struct file *filp);
107extern int cifs_get_inode_info(struct inode **pinode, 108extern int cifs_get_inode_info(struct inode **pinode,
108 const unsigned char *search_path, 109 const unsigned char *search_path,
109 FILE_ALL_INFO *pfile_info, 110 FILE_ALL_INFO *pfile_info,
110 struct super_block *sb, int xid, const __u16 *pfid); 111 struct super_block *sb, int xid, const __u16 *pfid);
112extern int cifs_get_file_info_unix(struct file *filp);
111extern int cifs_get_inode_info_unix(struct inode **pinode, 113extern int cifs_get_inode_info_unix(struct inode **pinode,
112 const unsigned char *search_path, 114 const unsigned char *search_path,
113 struct super_block *sb, int xid); 115 struct super_block *sb, int xid);
@@ -142,6 +144,8 @@ extern int CIFSFindNext(const int xid, struct cifsTconInfo *tcon,
142extern int CIFSFindClose(const int, struct cifsTconInfo *tcon, 144extern int CIFSFindClose(const int, struct cifsTconInfo *tcon,
143 const __u16 search_handle); 145 const __u16 search_handle);
144 146
147extern int CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
148 u16 netfid, FILE_ALL_INFO *pFindData);
145extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 149extern int CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
146 const unsigned char *searchName, 150 const unsigned char *searchName,
147 FILE_ALL_INFO *findData, 151 FILE_ALL_INFO *findData,
@@ -152,6 +156,8 @@ extern int SMBQueryInformation(const int xid, struct cifsTconInfo *tcon,
152 FILE_ALL_INFO *findData, 156 FILE_ALL_INFO *findData,
153 const struct nls_table *nls_codepage, int remap); 157 const struct nls_table *nls_codepage, int remap);
154 158
159extern int CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
160 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData);
155extern int CIFSSMBUnixQPathInfo(const int xid, 161extern int CIFSSMBUnixQPathInfo(const int xid,
156 struct cifsTconInfo *tcon, 162 struct cifsTconInfo *tcon,
157 const unsigned char *searchName, 163 const unsigned char *searchName,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 611835899844..7cc7f83e9314 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -500,7 +500,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
500 } else if (pSMBr->hdr.WordCount == 13) { 500 } else if (pSMBr->hdr.WordCount == 13) {
501 cERROR(1, ("mount failed, cifs module not built " 501 cERROR(1, ("mount failed, cifs module not built "
502 "with CIFS_WEAK_PW_HASH support")); 502 "with CIFS_WEAK_PW_HASH support"));
503 rc = -EOPNOTSUPP; 503 rc = -EOPNOTSUPP;
504#endif /* WEAK_PW_HASH */ 504#endif /* WEAK_PW_HASH */
505 goto neg_err_exit; 505 goto neg_err_exit;
506 } else if (pSMBr->hdr.WordCount != 17) { 506 } else if (pSMBr->hdr.WordCount != 17) {
@@ -3230,8 +3230,72 @@ QInfRetry:
3230 return rc; 3230 return rc;
3231} 3231}
3232 3232
3233int
3234CIFSSMBQFileInfo(const int xid, struct cifsTconInfo *tcon,
3235 u16 netfid, FILE_ALL_INFO *pFindData)
3236{
3237 struct smb_t2_qfi_req *pSMB = NULL;
3238 struct smb_t2_qfi_rsp *pSMBr = NULL;
3239 int rc = 0;
3240 int bytes_returned;
3241 __u16 params, byte_count;
3233 3242
3243QFileInfoRetry:
3244 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3245 (void **) &pSMBr);
3246 if (rc)
3247 return rc;
3234 3248
3249 params = 2 /* level */ + 2 /* fid */;
3250 pSMB->t2.TotalDataCount = 0;
3251 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3252 /* BB find exact max data count below from sess structure BB */
3253 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3254 pSMB->t2.MaxSetupCount = 0;
3255 pSMB->t2.Reserved = 0;
3256 pSMB->t2.Flags = 0;
3257 pSMB->t2.Timeout = 0;
3258 pSMB->t2.Reserved2 = 0;
3259 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3260 Fid) - 4);
3261 pSMB->t2.DataCount = 0;
3262 pSMB->t2.DataOffset = 0;
3263 pSMB->t2.SetupCount = 1;
3264 pSMB->t2.Reserved3 = 0;
3265 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3266 byte_count = params + 1 /* pad */ ;
3267 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3268 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3269 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_ALL_INFO);
3270 pSMB->Pad = 0;
3271 pSMB->Fid = netfid;
3272 pSMB->hdr.smb_buf_length += byte_count;
3273
3274 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3275 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3276 if (rc) {
3277 cFYI(1, ("Send error in QFileInfo = %d", rc));
3278 } else { /* decode response */
3279 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3280
3281 if (rc) /* BB add auto retry on EOPNOTSUPP? */
3282 rc = -EIO;
3283 else if (pSMBr->ByteCount < 40)
3284 rc = -EIO; /* bad smb */
3285 else if (pFindData) {
3286 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3287 memcpy((char *) pFindData,
3288 (char *) &pSMBr->hdr.Protocol +
3289 data_offset, sizeof(FILE_ALL_INFO));
3290 } else
3291 rc = -ENOMEM;
3292 }
3293 cifs_buf_release(pSMB);
3294 if (rc == -EAGAIN)
3295 goto QFileInfoRetry;
3296
3297 return rc;
3298}
3235 3299
3236int 3300int
3237CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon, 3301CIFSSMBQPathInfo(const int xid, struct cifsTconInfo *tcon,
@@ -3335,6 +3399,75 @@ QPathInfoRetry:
3335} 3399}
3336 3400
3337int 3401int
3402CIFSSMBUnixQFileInfo(const int xid, struct cifsTconInfo *tcon,
3403 u16 netfid, FILE_UNIX_BASIC_INFO *pFindData)
3404{
3405 struct smb_t2_qfi_req *pSMB = NULL;
3406 struct smb_t2_qfi_rsp *pSMBr = NULL;
3407 int rc = 0;
3408 int bytes_returned;
3409 __u16 params, byte_count;
3410
3411UnixQFileInfoRetry:
3412 rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB,
3413 (void **) &pSMBr);
3414 if (rc)
3415 return rc;
3416
3417 params = 2 /* level */ + 2 /* fid */;
3418 pSMB->t2.TotalDataCount = 0;
3419 pSMB->t2.MaxParameterCount = cpu_to_le16(4);
3420 /* BB find exact max data count below from sess structure BB */
3421 pSMB->t2.MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
3422 pSMB->t2.MaxSetupCount = 0;
3423 pSMB->t2.Reserved = 0;
3424 pSMB->t2.Flags = 0;
3425 pSMB->t2.Timeout = 0;
3426 pSMB->t2.Reserved2 = 0;
3427 pSMB->t2.ParameterOffset = cpu_to_le16(offsetof(struct smb_t2_qfi_req,
3428 Fid) - 4);
3429 pSMB->t2.DataCount = 0;
3430 pSMB->t2.DataOffset = 0;
3431 pSMB->t2.SetupCount = 1;
3432 pSMB->t2.Reserved3 = 0;
3433 pSMB->t2.SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION);
3434 byte_count = params + 1 /* pad */ ;
3435 pSMB->t2.TotalParameterCount = cpu_to_le16(params);
3436 pSMB->t2.ParameterCount = pSMB->t2.TotalParameterCount;
3437 pSMB->InformationLevel = cpu_to_le16(SMB_QUERY_FILE_UNIX_BASIC);
3438 pSMB->Pad = 0;
3439 pSMB->Fid = netfid;
3440 pSMB->hdr.smb_buf_length += byte_count;
3441
3442 rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB,
3443 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
3444 if (rc) {
3445 cFYI(1, ("Send error in UnixQFileInfo = %d", rc));
3446 } else { /* decode response */
3447 rc = validate_t2((struct smb_t2_rsp *)pSMBr);
3448
3449 if (rc || (pSMBr->ByteCount < sizeof(FILE_UNIX_BASIC_INFO))) {
3450 cERROR(1, ("Malformed FILE_UNIX_BASIC_INFO response.\n"
3451 "Unix Extensions can be disabled on mount "
3452 "by specifying the nosfu mount option."));
3453 rc = -EIO; /* bad smb */
3454 } else {
3455 __u16 data_offset = le16_to_cpu(pSMBr->t2.DataOffset);
3456 memcpy((char *) pFindData,
3457 (char *) &pSMBr->hdr.Protocol +
3458 data_offset,
3459 sizeof(FILE_UNIX_BASIC_INFO));
3460 }
3461 }
3462
3463 cifs_buf_release(pSMB);
3464 if (rc == -EAGAIN)
3465 goto UnixQFileInfoRetry;
3466
3467 return rc;
3468}
3469
3470int
3338CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon, 3471CIFSSMBUnixQPathInfo(const int xid, struct cifsTconInfo *tcon,
3339 const unsigned char *searchName, 3472 const unsigned char *searchName,
3340 FILE_UNIX_BASIC_INFO *pFindData, 3473 FILE_UNIX_BASIC_INFO *pFindData,
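
In both new QFileInfo calls the parameter block is only four bytes, and ParameterOffset is computed as offsetof(struct smb_t2_qfi_req, Fid) - 4 because SMB offsets are measured from the start of the SMB header, which sits after the 4-byte RFC 1001 length prefix counted in smb_buf_length. A hedged view of those four wire bytes (hypothetical struct, shown for illustration only):

    /* the TRANS2_QUERY_FILE_INFORMATION parameter area: fid first,
     * then the information level (hypothetical mirror of the wire) */
    struct qfi_params {
            __u16   fid;            /* netfid, already in wire order */
            __le16  info_level;     /* e.g. SMB_QUERY_FILE_ALL_INFO */
    } __attribute__((packed));
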
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 6ccf7262d1b7..e9f7ecc2714b 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -739,7 +739,7 @@ cifs_d_revalidate(struct dentry *direntry, struct nameidata *nd)
739 int isValid = 1; 739 int isValid = 1;
740 740
741 if (direntry->d_inode) { 741 if (direntry->d_inode) {
742 if (cifs_revalidate(direntry)) 742 if (cifs_revalidate_dentry(direntry))
743 return 0; 743 return 0;
744 } else { 744 } else {
745 cFYI(1, ("neg dentry 0x%p name = %s", 745 cFYI(1, ("neg dentry 0x%p name = %s",
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 3d8f8a96f5a3..ca2ba7a0193c 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -219,8 +219,8 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
219 cFYI(1, ("inode unchanged on server")); 219 cFYI(1, ("inode unchanged on server"));
220 } else { 220 } else {
221 if (file->f_path.dentry->d_inode->i_mapping) { 221 if (file->f_path.dentry->d_inode->i_mapping) {
222 /* BB no need to lock inode until after invalidate 222 /* BB no need to lock inode until after invalidate
223 since namei code should already have it locked? */ 223 since namei code should already have it locked? */
224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping); 224 rc = filemap_write_and_wait(file->f_path.dentry->d_inode->i_mapping);
225 if (rc != 0) 225 if (rc != 0)
226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc; 226 CIFS_I(file->f_path.dentry->d_inode)->write_behind_rc = rc;
@@ -1890,11 +1890,10 @@ static ssize_t cifs_read(struct file *file, char *read_data, size_t read_size,
1890 1890
1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1891int cifs_file_mmap(struct file *file, struct vm_area_struct *vma)
1892{ 1892{
1893 struct dentry *dentry = file->f_path.dentry;
1894 int rc, xid; 1893 int rc, xid;
1895 1894
1896 xid = GetXid(); 1895 xid = GetXid();
1897 rc = cifs_revalidate(dentry); 1896 rc = cifs_revalidate_file(file);
1898 if (rc) { 1897 if (rc) {
1899 cFYI(1, ("Validation prior to mmap failed, error=%d", rc)); 1898 cFYI(1, ("Validation prior to mmap failed, error=%d", rc));
1900 FreeXid(xid); 1899 FreeXid(xid);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 8bdbc818164c..723daaccbd0e 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -77,6 +77,41 @@ static void cifs_set_ops(struct inode *inode, const bool is_dfs_referral)
77 } 77 }
78} 78}
79 79
80/* check inode attributes against fattr. If they don't match, tag the
81 * inode for cache invalidation
82 */
83static void
84cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
85{
86 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
87
88 cFYI(1, ("%s: revalidating inode %llu", __func__, cifs_i->uniqueid));
89
90 if (inode->i_state & I_NEW) {
91 cFYI(1, ("%s: inode %llu is new", __func__, cifs_i->uniqueid));
92 return;
93 }
94
95 /* don't bother with revalidation if we have an oplock */
96 if (cifs_i->clientCanCacheRead) {
97 cFYI(1, ("%s: inode %llu is oplocked", __func__,
98 cifs_i->uniqueid));
99 return;
100 }
101
102 /* revalidate if mtime or size have changed */
103 if (timespec_equal(&inode->i_mtime, &fattr->cf_mtime) &&
104 cifs_i->server_eof == fattr->cf_eof) {
105 cFYI(1, ("%s: inode %llu is unchanged", __func__,
106 cifs_i->uniqueid));
107 return;
108 }
109
110 cFYI(1, ("%s: invalidating inode %llu mapping", __func__,
111 cifs_i->uniqueid));
112 cifs_i->invalid_mapping = true;
113}
114
80/* populate an inode with info from a cifs_fattr struct */ 115/* populate an inode with info from a cifs_fattr struct */
81void 116void
82cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) 117cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
@@ -85,6 +120,8 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr)
85 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); 120 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
86 unsigned long oldtime = cifs_i->time; 121 unsigned long oldtime = cifs_i->time;
87 122
123 cifs_revalidate_cache(inode, fattr);
124
88 inode->i_atime = fattr->cf_atime; 125 inode->i_atime = fattr->cf_atime;
89 inode->i_mtime = fattr->cf_mtime; 126 inode->i_mtime = fattr->cf_mtime;
90 inode->i_ctime = fattr->cf_ctime; 127 inode->i_ctime = fattr->cf_ctime;
@@ -231,6 +268,31 @@ cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb)
231 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL; 268 fattr->cf_flags |= CIFS_FATTR_DFS_REFERRAL;
232} 269}
233 270
271int cifs_get_file_info_unix(struct file *filp)
272{
273 int rc;
274 int xid;
275 FILE_UNIX_BASIC_INFO find_data;
276 struct cifs_fattr fattr;
277 struct inode *inode = filp->f_path.dentry->d_inode;
278 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
279 struct cifsTconInfo *tcon = cifs_sb->tcon;
280 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
281
282 xid = GetXid();
283 rc = CIFSSMBUnixQFileInfo(xid, tcon, cfile->netfid, &find_data);
284 if (!rc) {
285 cifs_unix_basic_to_fattr(&fattr, &find_data, cifs_sb);
286 } else if (rc == -EREMOTE) {
287 cifs_create_dfs_fattr(&fattr, inode->i_sb);
288 rc = 0;
289 }
290
291 cifs_fattr_to_inode(inode, &fattr);
292 FreeXid(xid);
293 return rc;
294}
295
234int cifs_get_inode_info_unix(struct inode **pinode, 296int cifs_get_inode_info_unix(struct inode **pinode,
235 const unsigned char *full_path, 297 const unsigned char *full_path,
236 struct super_block *sb, int xid) 298 struct super_block *sb, int xid)
@@ -432,6 +494,47 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
432 fattr->cf_gid = cifs_sb->mnt_gid; 494 fattr->cf_gid = cifs_sb->mnt_gid;
433} 495}
434 496
497int cifs_get_file_info(struct file *filp)
498{
499 int rc;
500 int xid;
501 FILE_ALL_INFO find_data;
502 struct cifs_fattr fattr;
503 struct inode *inode = filp->f_path.dentry->d_inode;
504 struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
505 struct cifsTconInfo *tcon = cifs_sb->tcon;
506 struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data;
507
508 xid = GetXid();
509 rc = CIFSSMBQFileInfo(xid, tcon, cfile->netfid, &find_data);
510 if (rc == -EOPNOTSUPP || rc == -EINVAL) {
511 /*
512 * FIXME: legacy server -- fall back to path-based call?
513 * for now, just skip revalidating and mark inode for
514 * immediate reval.
515 */
516 rc = 0;
517 CIFS_I(inode)->time = 0;
518 goto cgfi_exit;
519 } else if (rc == -EREMOTE) {
520 cifs_create_dfs_fattr(&fattr, inode->i_sb);
521 rc = 0;
522 } else if (rc)
523 goto cgfi_exit;
524
525 /*
526 * don't bother with SFU junk here -- just mark inode as needing
527 * revalidation.
528 */
529 cifs_all_info_to_fattr(&fattr, &find_data, cifs_sb, false);
530 fattr.cf_uniqueid = CIFS_I(inode)->uniqueid;
531 fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
532 cifs_fattr_to_inode(inode, &fattr);
533cgfi_exit:
534 FreeXid(xid);
535 return rc;
536}
537
435int cifs_get_inode_info(struct inode **pinode, 538int cifs_get_inode_info(struct inode **pinode,
436 const unsigned char *full_path, FILE_ALL_INFO *pfindData, 539 const unsigned char *full_path, FILE_ALL_INFO *pfindData,
437 struct super_block *sb, int xid, const __u16 *pfid) 540 struct super_block *sb, int xid, const __u16 *pfid)
@@ -1389,135 +1492,103 @@ cifs_rename_exit:
1389 return rc; 1492 return rc;
1390} 1493}
1391 1494
1392int cifs_revalidate(struct dentry *direntry) 1495static bool
1496cifs_inode_needs_reval(struct inode *inode)
1393{ 1497{
1394 int xid; 1498 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1395 int rc = 0, wbrc = 0;
1396 char *full_path;
1397 struct cifs_sb_info *cifs_sb;
1398 struct cifsInodeInfo *cifsInode;
1399 loff_t local_size;
1400 struct timespec local_mtime;
1401 bool invalidate_inode = false;
1402 1499
1403 if (direntry->d_inode == NULL) 1500 if (cifs_i->clientCanCacheRead)
1404 return -ENOENT; 1501 return false;
1405 1502
1406 cifsInode = CIFS_I(direntry->d_inode); 1503 if (!lookupCacheEnabled)
1504 return true;
1407 1505
1408 if (cifsInode == NULL) 1506 if (cifs_i->time == 0)
1409 return -ENOENT; 1507 return true;
1410 1508
1411 /* no sense revalidating inode info on file that no one can write */ 1509 /* FIXME: the actimeo should be tunable */
1412 if (CIFS_I(direntry->d_inode)->clientCanCacheRead) 1510 if (time_after_eq(jiffies, cifs_i->time + HZ))
1413 return rc; 1511 return true;
1512
1513 return false;
1514}
1515
1516/* check invalid_mapping flag and zap the cache if it's set */
1517static void
1518cifs_invalidate_mapping(struct inode *inode)
1519{
1520 int rc;
1521 struct cifsInodeInfo *cifs_i = CIFS_I(inode);
1522
1523 cifs_i->invalid_mapping = false;
1524
1525 /* write back any cached data */
1526 if (inode->i_mapping && inode->i_mapping->nrpages != 0) {
1527 rc = filemap_write_and_wait(inode->i_mapping);
1528 if (rc)
1529 cifs_i->write_behind_rc = rc;
1530 }
1531 invalidate_remote_inode(inode);
1532}
1533
1534int cifs_revalidate_file(struct file *filp)
1535{
1536 int rc = 0;
1537 struct inode *inode = filp->f_path.dentry->d_inode;
1538
1539 if (!cifs_inode_needs_reval(inode))
1540 goto check_inval;
1541
1542 if (CIFS_SB(inode->i_sb)->tcon->unix_ext)
1543 rc = cifs_get_file_info_unix(filp);
1544 else
1545 rc = cifs_get_file_info(filp);
1546
1547check_inval:
1548 if (CIFS_I(inode)->invalid_mapping)
1549 cifs_invalidate_mapping(inode);
1550
1551 return rc;
1552}
1553
1554/* revalidate a dentry's inode attributes */
1555int cifs_revalidate_dentry(struct dentry *dentry)
1556{
1557 int xid;
1558 int rc = 0;
1559 char *full_path = NULL;
1560 struct inode *inode = dentry->d_inode;
1561 struct super_block *sb = dentry->d_sb;
1562
1563 if (inode == NULL)
1564 return -ENOENT;
1414 1565
1415 xid = GetXid(); 1566 xid = GetXid();
1416 1567
1417 cifs_sb = CIFS_SB(direntry->d_sb); 1568 if (!cifs_inode_needs_reval(inode))
1569 goto check_inval;
1418 1570
1419 /* can not safely grab the rename sem here if rename calls revalidate 1571 /* can not safely grab the rename sem here if rename calls revalidate
1420 since that would deadlock */ 1572 since that would deadlock */
1421 full_path = build_path_from_dentry(direntry); 1573 full_path = build_path_from_dentry(dentry);
1422 if (full_path == NULL) { 1574 if (full_path == NULL) {
1423 rc = -ENOMEM; 1575 rc = -ENOMEM;
1424 FreeXid(xid); 1576 goto check_inval;
1425 return rc;
1426 }
1427 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1428 "jiffies %ld", full_path, direntry->d_inode,
1429 direntry->d_inode->i_count.counter, direntry,
1430 direntry->d_time, jiffies));
1431
1432 if (cifsInode->time == 0) {
1433 /* was set to zero previously to force revalidate */
1434 } else if (time_before(jiffies, cifsInode->time + HZ) &&
1435 lookupCacheEnabled) {
1436 if ((S_ISREG(direntry->d_inode->i_mode) == 0) ||
1437 (direntry->d_inode->i_nlink == 1)) {
1438 kfree(full_path);
1439 FreeXid(xid);
1440 return rc;
1441 } else {
1442 cFYI(1, ("Have to revalidate file due to hardlinks"));
1443 }
1444 }
1445
1446 /* save mtime and size */
1447 local_mtime = direntry->d_inode->i_mtime;
1448 local_size = direntry->d_inode->i_size;
1449
1450 if (cifs_sb->tcon->unix_ext) {
1451 rc = cifs_get_inode_info_unix(&direntry->d_inode, full_path,
1452 direntry->d_sb, xid);
1453 if (rc) {
1454 cFYI(1, ("error on getting revalidate info %d", rc));
1455/* if (rc != -ENOENT)
1456 rc = 0; */ /* BB should we cache info on
1457 certain errors? */
1458 }
1459 } else {
1460 rc = cifs_get_inode_info(&direntry->d_inode, full_path, NULL,
1461 direntry->d_sb, xid, NULL);
1462 if (rc) {
1463 cFYI(1, ("error on getting revalidate info %d", rc));
1464/* if (rc != -ENOENT)
1465 rc = 0; */ /* BB should we cache info on
1466 certain errors? */
1467 }
1468 } 1577 }
1469 /* should we remap certain errors, access denied?, to zero */
1470 1578
1471 /* if not oplocked, we invalidate inode pages if mtime or file size 1579 cFYI(1, ("Revalidate: %s inode 0x%p count %d dentry: 0x%p d_time %ld "
1472 had changed on server */ 1580 "jiffies %ld", full_path, inode, inode->i_count.counter,
1581 dentry, dentry->d_time, jiffies));
1473 1582
1474 if (timespec_equal(&local_mtime, &direntry->d_inode->i_mtime) && 1583 if (CIFS_SB(sb)->tcon->unix_ext)
1475 (local_size == direntry->d_inode->i_size)) { 1584 rc = cifs_get_inode_info_unix(&inode, full_path, sb, xid);
1476 cFYI(1, ("cifs_revalidate - inode unchanged")); 1585 else
1477 } else { 1586 rc = cifs_get_inode_info(&inode, full_path, NULL, sb,
1478 /* file may have changed on server */ 1587 xid, NULL);
1479 if (cifsInode->clientCanCacheRead) {
1480 /* no need to invalidate inode pages since we were the
1481 only ones who could have modified the file and the
1482 server copy is staler than ours */
1483 } else {
1484 invalidate_inode = true;
1485 }
1486 }
1487 1588
1488 /* can not grab this sem since kernel filesys locking documentation 1589check_inval:
1489 indicates i_mutex may be taken by the kernel on lookup and rename 1590 if (CIFS_I(inode)->invalid_mapping)
1490 which could deadlock if we grab the i_mutex here as well */ 1591 cifs_invalidate_mapping(inode);
1491/* mutex_lock(&direntry->d_inode->i_mutex);*/
1492 /* need to write out dirty pages here */
1493 if (direntry->d_inode->i_mapping) {
1494 /* do we need to lock inode until after invalidate completes
1495 below? */
1496 wbrc = filemap_fdatawrite(direntry->d_inode->i_mapping);
1497 if (wbrc)
1498 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1499 }
1500 if (invalidate_inode) {
1501 /* shrink_dcache not necessary now that cifs dentry ops
1502 are exported for negative dentries */
1503/* if (S_ISDIR(direntry->d_inode->i_mode))
1504 shrink_dcache_parent(direntry); */
1505 if (S_ISREG(direntry->d_inode->i_mode)) {
1506 if (direntry->d_inode->i_mapping) {
1507 wbrc = filemap_fdatawait(direntry->d_inode->i_mapping);
1508 if (wbrc)
1509 CIFS_I(direntry->d_inode)->write_behind_rc = wbrc;
1510 }
1511 /* may eventually have to do this for open files too */
1512 if (list_empty(&(cifsInode->openFileList))) {
1513 /* changed on server - flush read ahead pages */
1514 cFYI(1, ("Invalidating read ahead data on "
1515 "closed file"));
1516 invalidate_remote_inode(direntry->d_inode);
1517 }
1518 }
1519 }
1520/* mutex_unlock(&direntry->d_inode->i_mutex); */
1521 1592
1522 kfree(full_path); 1593 kfree(full_path);
1523 FreeXid(xid); 1594 FreeXid(xid);
@@ -1527,7 +1598,7 @@ int cifs_revalidate(struct dentry *direntry)
1527int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, 1598int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
1528 struct kstat *stat) 1599 struct kstat *stat)
1529{ 1600{
1530 int err = cifs_revalidate(dentry); 1601 int err = cifs_revalidate_dentry(dentry);
1531 if (!err) { 1602 if (!err) {
1532 generic_fillattr(dentry->d_inode, stat); 1603 generic_fillattr(dentry->d_inode, stat);
1533 stat->blksize = CIFS_MAX_MSGSIZE; 1604 stat->blksize = CIFS_MAX_MSGSIZE;
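
cifs_inode_needs_reval() reduces the old ad-hoc checks to four rules: never revalidate under a read oplock, always revalidate when lookupcache is off or cifs_i->time is zero, and otherwise revalidate once the attributes are at least one second old. The jiffies test uses time_after_eq() so that counter wraparound is handled; a minimal sketch:

    /* wrap-safe staleness check, as used above (sketch) */
    static bool attrs_stale(unsigned long stamp)
    {
            /* true once at least HZ ticks (one second) have passed;
             * a plain ">=" on raw jiffies would misfire near the wrap */
            return time_after_eq(jiffies, stamp + HZ);
    }
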
diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c
index e22de8397b74..d32ee9412cb9 100644
--- a/fs/jffs2/readinode.c
+++ b/fs/jffs2/readinode.c
@@ -567,7 +567,7 @@ static void jffs2_free_tmp_dnode_info_list(struct rb_root *list)
567 else BUG(); 567 else BUG();
568 } 568 }
569 } 569 }
570 list->rb_node = NULL; 570 *list = RB_ROOT;
571} 571}
572 572
573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd) 573static void jffs2_free_full_dirent_list(struct jffs2_full_dirent *fd)
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index db30c0b398b5..a2b8b4df125d 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -782,6 +782,7 @@ struct svc_version nfs4_callback_version1 = {
782 .vs_proc = nfs4_callback_procedures1, 782 .vs_proc = nfs4_callback_procedures1,
783 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE, 783 .vs_xdrsize = NFS4_CALLBACK_XDRSIZE,
784 .vs_dispatch = NULL, 784 .vs_dispatch = NULL,
785 .vs_hidden = 1,
785}; 786};
786 787
787struct svc_version nfs4_callback_version4 = { 788struct svc_version nfs4_callback_version4 = {
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 944b627ec6e1..69e7b8140122 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -71,4 +71,10 @@ static inline int nfs_inode_return_delegation(struct inode *inode)
71} 71}
72#endif 72#endif
73 73
74static inline int nfs_have_delegated_attributes(struct inode *inode)
75{
76 return nfs_have_delegation(inode, FMODE_READ) &&
77 !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED);
78}
79
74#endif 80#endif
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a1f6b4438fb1..c6f2750648f4 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1789,7 +1789,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
1789 cache = nfs_access_search_rbtree(inode, cred); 1789 cache = nfs_access_search_rbtree(inode, cred);
1790 if (cache == NULL) 1790 if (cache == NULL)
1791 goto out; 1791 goto out;
1792 if (!nfs_have_delegation(inode, FMODE_READ) && 1792 if (!nfs_have_delegated_attributes(inode) &&
1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo)) 1793 !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
1794 goto out_stale; 1794 goto out_stale;
1795 res->jiffies = cache->jiffies; 1795 res->jiffies = cache->jiffies;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 657201acda84..e358df75a6ad 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -729,7 +729,7 @@ int nfs_attribute_timeout(struct inode *inode)
729{ 729{
730 struct nfs_inode *nfsi = NFS_I(inode); 730 struct nfs_inode *nfsi = NFS_I(inode);
731 731
732 if (nfs_have_delegation(inode, FMODE_READ)) 732 if (nfs_have_delegated_attributes(inode))
733 return 0; 733 return 0;
734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); 734 return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
735} 735}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index eda74c42d552..f9254fb0c9d0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -5107,6 +5107,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp,
5107 res = kzalloc(sizeof(*res), GFP_KERNEL); 5107 res = kzalloc(sizeof(*res), GFP_KERNEL);
5108 if (!args || !res) { 5108 if (!args || !res) {
5109 kfree(args); 5109 kfree(args);
5110 kfree(res);
5110 nfs_put_client(clp); 5111 nfs_put_client(clp);
5111 return -ENOMEM; 5112 return -ENOMEM;
5112 } 5113 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index a12c45b65dd4..29d9d36cd5f4 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -112,12 +112,10 @@ void nfs_unlock_request(struct nfs_page *req)
112 */ 112 */
113int nfs_set_page_tag_locked(struct nfs_page *req) 113int nfs_set_page_tag_locked(struct nfs_page *req)
114{ 114{
115 struct nfs_inode *nfsi = NFS_I(req->wb_context->path.dentry->d_inode);
116
117 if (!nfs_lock_request_dontget(req)) 115 if (!nfs_lock_request_dontget(req))
118 return 0; 116 return 0;
119 if (req->wb_page != NULL) 117 if (req->wb_page != NULL)
120 radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 118 radix_tree_tag_set(&NFS_I(req->wb_context->path.dentry->d_inode)->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
121 return 1; 119 return 1;
122} 120}
123 121
@@ -126,10 +124,10 @@ int nfs_set_page_tag_locked(struct nfs_page *req)
126 */ 124 */
127void nfs_clear_page_tag_locked(struct nfs_page *req) 125void nfs_clear_page_tag_locked(struct nfs_page *req)
128{ 126{
129 struct inode *inode = req->wb_context->path.dentry->d_inode;
130 struct nfs_inode *nfsi = NFS_I(inode);
131
132 if (req->wb_page != NULL) { 127 if (req->wb_page != NULL) {
128 struct inode *inode = req->wb_context->path.dentry->d_inode;
129 struct nfs_inode *nfsi = NFS_I(inode);
130
133 spin_lock(&inode->i_lock); 131 spin_lock(&inode->i_lock);
134 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); 132 radix_tree_tag_clear(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED);
135 nfs_unlock_request(req); 133 nfs_unlock_request(req);
@@ -142,16 +140,22 @@ void nfs_clear_page_tag_locked(struct nfs_page *req)
142 * nfs_clear_request - Free up all resources allocated to the request 140 * nfs_clear_request - Free up all resources allocated to the request
143 * @req: 141 * @req:
144 * 142 *
145 * Release page resources associated with a write request after it 143 * Release page and open context resources associated with a read/write
146 * has completed. 144 * request after it has completed.
147 */ 145 */
148void nfs_clear_request(struct nfs_page *req) 146void nfs_clear_request(struct nfs_page *req)
149{ 147{
150 struct page *page = req->wb_page; 148 struct page *page = req->wb_page;
149 struct nfs_open_context *ctx = req->wb_context;
150
151 if (page != NULL) { 151 if (page != NULL) {
152 page_cache_release(page); 152 page_cache_release(page);
153 req->wb_page = NULL; 153 req->wb_page = NULL;
154 } 154 }
155 if (ctx != NULL) {
156 put_nfs_open_context(ctx);
157 req->wb_context = NULL;
158 }
155} 159}
156 160
157 161
@@ -165,9 +169,8 @@ static void nfs_free_request(struct kref *kref)
165{ 169{
166 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); 170 struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
167 171
168 /* Release struct file or cached credential */ 172 /* Release struct file and open context */
169 nfs_clear_request(req); 173 nfs_clear_request(req);
170 put_nfs_open_context(req->wb_context);
171 nfs_page_free(req); 174 nfs_page_free(req);
172} 175}
173 176
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index f1afee4eea77..6baf9a393466 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2214,7 +2214,7 @@ static int nfs_get_sb(struct file_system_type *fs_type,
2214 } else { 2214 } else {
2215 error = nfs_bdi_register(server); 2215 error = nfs_bdi_register(server);
2216 if (error) 2216 if (error)
2217 goto error_splat_super; 2217 goto error_splat_bdi;
2218 } 2218 }
2219 2219
2220 if (!s->s_root) { 2220 if (!s->s_root) {
@@ -2256,6 +2256,9 @@ out_err_nosb:
2256error_splat_root: 2256error_splat_root:
2257 dput(mntroot); 2257 dput(mntroot);
2258error_splat_super: 2258error_splat_super:
2259 if (server && !s->s_root)
2260 bdi_unregister(&server->backing_dev_info);
2261error_splat_bdi:
2259 deactivate_locked_super(s); 2262 deactivate_locked_super(s);
2260 goto out; 2263 goto out;
2261} 2264}
@@ -2326,7 +2329,7 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags,
2326 } else { 2329 } else {
2327 error = nfs_bdi_register(server); 2330 error = nfs_bdi_register(server);
2328 if (error) 2331 if (error)
2329 goto error_splat_super; 2332 goto error_splat_bdi;
2330 } 2333 }
2331 2334
2332 if (!s->s_root) { 2335 if (!s->s_root) {
@@ -2363,6 +2366,9 @@ out_err_noserver:
2363 return error; 2366 return error;
2364 2367
2365error_splat_super: 2368error_splat_super:
2369 if (server && !s->s_root)
2370 bdi_unregister(&server->backing_dev_info);
2371error_splat_bdi:
2366 deactivate_locked_super(s); 2372 deactivate_locked_super(s);
2367 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error); 2373 dprintk("<-- nfs_xdev_get_sb() = %d [splat]\n", error);
2368 return error; 2374 return error;
@@ -2578,7 +2584,7 @@ static int nfs4_remote_get_sb(struct file_system_type *fs_type,
2578 } else { 2584 } else {
2579 error = nfs_bdi_register(server); 2585 error = nfs_bdi_register(server);
2580 if (error) 2586 if (error)
2581 goto error_splat_super; 2587 goto error_splat_bdi;
2582 } 2588 }
2583 2589
2584 if (!s->s_root) { 2590 if (!s->s_root) {
@@ -2616,6 +2622,9 @@ out_free:
2616error_splat_root: 2622error_splat_root:
2617 dput(mntroot); 2623 dput(mntroot);
2618error_splat_super: 2624error_splat_super:
2625 if (server && !s->s_root)
2626 bdi_unregister(&server->backing_dev_info);
2627error_splat_bdi:
2619 deactivate_locked_super(s); 2628 deactivate_locked_super(s);
2620 goto out; 2629 goto out;
2621} 2630}
@@ -2811,7 +2820,7 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags,
2811 } else { 2820 } else {
2812 error = nfs_bdi_register(server); 2821 error = nfs_bdi_register(server);
2813 if (error) 2822 if (error)
2814 goto error_splat_super; 2823 goto error_splat_bdi;
2815 } 2824 }
2816 2825
2817 if (!s->s_root) { 2826 if (!s->s_root) {
@@ -2847,6 +2856,9 @@ out_err_noserver:
2847 return error; 2856 return error;
2848 2857
2849error_splat_super: 2858error_splat_super:
2859 if (server && !s->s_root)
2860 bdi_unregister(&server->backing_dev_info);
2861error_splat_bdi:
2850 deactivate_locked_super(s); 2862 deactivate_locked_super(s);
2851 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error); 2863 dprintk("<-- nfs4_xdev_get_sb() = %d [splat]\n", error);
2852 return error; 2864 return error;
@@ -2893,7 +2905,7 @@ static int nfs4_remote_referral_get_sb(struct file_system_type *fs_type,
2893 } else { 2905 } else {
2894 error = nfs_bdi_register(server); 2906 error = nfs_bdi_register(server);
2895 if (error) 2907 if (error)
2896 goto error_splat_super; 2908 goto error_splat_bdi;
2897 } 2909 }
2898 2910
2899 if (!s->s_root) { 2911 if (!s->s_root) {
@@ -2929,6 +2941,9 @@ out_err_noserver:
2929 return error; 2941 return error;
2930 2942
2931error_splat_super: 2943error_splat_super:
2944 if (server && !s->s_root)
2945 bdi_unregister(&server->backing_dev_info);
2946error_splat_bdi:
2932 deactivate_locked_super(s); 2947 deactivate_locked_super(s);
2933 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error); 2948 dprintk("<-- nfs4_referral_get_sb() = %d [splat]\n", error);
2934 return error; 2949 return error;
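
Each nfs*_get_sb() error path gains an error_splat_bdi label so that a bdi registered by this call is unregistered before deactivate_locked_super(); the !s->s_root guard skips the unregister when an already-active superblock owns the bdi. The general ladder-unwind shape (register_a/register_b are placeholders, not NFS functions):

    static int setup(void)
    {
            int error;

            error = register_a();           /* hypothetical step 1 */
            if (error)
                    goto out;
            error = register_b();           /* hypothetical step 2 */
            if (error)
                    goto err_a;             /* undo only completed steps */
            return 0;
    err_a:
            unregister_a();
    out:
            return error;
    }
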
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 1cf39dfaee7a..0de1db6cddbf 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -31,6 +31,7 @@
31#include <linux/vfs.h> 31#include <linux/vfs.h>
32#include <linux/moduleparam.h> 32#include <linux/moduleparam.h>
33#include <linux/smp_lock.h> 33#include <linux/smp_lock.h>
34#include <linux/bitmap.h>
34 35
35#include "sysctl.h" 36#include "sysctl.h"
36#include "logfile.h" 37#include "logfile.h"
@@ -2458,7 +2459,6 @@ static void ntfs_put_super(struct super_block *sb)
2458static s64 get_nr_free_clusters(ntfs_volume *vol) 2459static s64 get_nr_free_clusters(ntfs_volume *vol)
2459{ 2460{
2460 s64 nr_free = vol->nr_clusters; 2461 s64 nr_free = vol->nr_clusters;
2461 u32 *kaddr;
2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping; 2462 struct address_space *mapping = vol->lcnbmp_ino->i_mapping;
2463 struct page *page; 2463 struct page *page;
2464 pgoff_t index, max_index; 2464 pgoff_t index, max_index;
@@ -2477,7 +2477,8 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.", 2477 ntfs_debug("Reading $Bitmap, max_index = 0x%lx, max_size = 0x%lx.",
2478 max_index, PAGE_CACHE_SIZE / 4); 2478 max_index, PAGE_CACHE_SIZE / 4);
2479 for (index = 0; index < max_index; index++) { 2479 for (index = 0; index < max_index; index++) {
2480 unsigned int i; 2480 unsigned long *kaddr;
2481
2481 /* 2482 /*
2482 * Read the page from page cache, getting it from backing store 2483 * Read the page from page cache, getting it from backing store
2483 * if necessary, and increment the use count. 2484 * if necessary, and increment the use count.
@@ -2490,16 +2491,16 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2490 nr_free -= PAGE_CACHE_SIZE * 8; 2491 nr_free -= PAGE_CACHE_SIZE * 8;
2491 continue; 2492 continue;
2492 } 2493 }
2493 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2494 kaddr = kmap_atomic(page, KM_USER0);
2494 /* 2495 /*
2495 * For each 4 bytes, subtract the number of set bits. If this 2496 * Subtract the number of set bits. If this
2496 * is the last page and it is partial we don't really care as 2497 * is the last page and it is partial we don't really care as
2497 * it just means we do a little extra work but it won't affect 2498 * it just means we do a little extra work but it won't affect
2498 * the result as all out of range bytes are set to zero by 2499 * the result as all out of range bytes are set to zero by
2499 * ntfs_readpage(). 2500 * ntfs_readpage().
2500 */ 2501 */
2501 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2502 nr_free -= bitmap_weight(kaddr,
2502 nr_free -= (s64)hweight32(kaddr[i]); 2503 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2503 kunmap_atomic(kaddr, KM_USER0); 2504 kunmap_atomic(kaddr, KM_USER0);
2504 page_cache_release(page); 2505 page_cache_release(page);
2505 } 2506 }
@@ -2538,7 +2539,6 @@ static s64 get_nr_free_clusters(ntfs_volume *vol)
2538static unsigned long __get_nr_free_mft_records(ntfs_volume *vol, 2539static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2539 s64 nr_free, const pgoff_t max_index) 2540 s64 nr_free, const pgoff_t max_index)
2540{ 2541{
2541 u32 *kaddr;
2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping; 2542 struct address_space *mapping = vol->mftbmp_ino->i_mapping;
2543 struct page *page; 2543 struct page *page;
2544 pgoff_t index; 2544 pgoff_t index;
@@ -2548,7 +2548,8 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = " 2548 ntfs_debug("Reading $MFT/$BITMAP, max_index = 0x%lx, max_size = "
2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4); 2549 "0x%lx.", max_index, PAGE_CACHE_SIZE / 4);
2550 for (index = 0; index < max_index; index++) { 2550 for (index = 0; index < max_index; index++) {
2551 unsigned int i; 2551 unsigned long *kaddr;
2552
2552 /* 2553 /*
2553 * Read the page from page cache, getting it from backing store 2554 * Read the page from page cache, getting it from backing store
2554 * if necessary, and increment the use count. 2555 * if necessary, and increment the use count.
@@ -2561,16 +2562,16 @@ static unsigned long __get_nr_free_mft_records(ntfs_volume *vol,
2561 nr_free -= PAGE_CACHE_SIZE * 8; 2562 nr_free -= PAGE_CACHE_SIZE * 8;
2562 continue; 2563 continue;
2563 } 2564 }
2564 kaddr = (u32*)kmap_atomic(page, KM_USER0); 2565 kaddr = kmap_atomic(page, KM_USER0);
2565 /* 2566 /*
2566 * For each 4 bytes, subtract the number of set bits. If this 2567 * Subtract the number of set bits. If this
2567 * is the last page and it is partial we don't really care as 2568 * is the last page and it is partial we don't really care as
2568 * it just means we do a little extra work but it won't affect 2569 * it just means we do a little extra work but it won't affect
2569 * the result as all out of range bytes are set to zero by 2570 * the result as all out of range bytes are set to zero by
2570 * ntfs_readpage(). 2571 * ntfs_readpage().
2571 */ 2572 */
2572 for (i = 0; i < PAGE_CACHE_SIZE / 4; i++) 2573 nr_free -= bitmap_weight(kaddr,
2573 nr_free -= (s64)hweight32(kaddr[i]); 2574 PAGE_CACHE_SIZE * BITS_PER_BYTE);
2574 kunmap_atomic(kaddr, KM_USER0); 2575 kunmap_atomic(kaddr, KM_USER0);
2575 page_cache_release(page); 2576 page_cache_release(page);
2576 } 2577 }
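
bitmap_weight() counts the set bits over an arbitrary number of bits, so the two conversions above collapse the per-u32 hweight32() loops into one call spanning the whole page (PAGE_CACHE_SIZE * BITS_PER_BYTE bits). A sketch of the equivalence:

    /* count set (allocated) bits in one bitmap page (sketch) */
    static unsigned long page_set_bits(const unsigned long *kaddr)
    {
            /* replaces the old per-u32 loop:
             *     for (i = 0; i < PAGE_CACHE_SIZE / 4; i++)
             *             nr_free -= (s64)hweight32(kaddr[i]);     */
            return bitmap_weight(kaddr, PAGE_CACHE_SIZE * BITS_PER_BYTE);
    }
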
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 9083357f9e44..99628508cb11 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -932,6 +932,9 @@ xfs_aops_discard_page(
932 if (!xfs_is_delayed_page(page, IOMAP_DELAY)) 932 if (!xfs_is_delayed_page(page, IOMAP_DELAY))
933 goto out_invalidate; 933 goto out_invalidate;
934 934
935 if (XFS_FORCED_SHUTDOWN(ip->i_mount))
936 goto out_invalidate;
937
935 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 938 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
936 "page discard on page %p, inode 0x%llx, offset %llu.", 939 "page discard on page %p, inode 0x%llx, offset %llu.",
937 page, ip->i_ino, offset); 940 page, ip->i_ino, offset);
@@ -964,8 +967,10 @@ xfs_aops_discard_page(
964 967
965 if (error) { 968 if (error) {
966 /* something screwed, just bail */ 969 /* something screwed, just bail */
967 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 970 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
968 "page discard failed delalloc mapping lookup."); 971 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
972 "page discard failed delalloc mapping lookup.");
973 }
969 break; 974 break;
970 } 975 }
971 if (!nimaps) { 976 if (!nimaps) {
@@ -991,8 +996,10 @@ xfs_aops_discard_page(
991 ASSERT(!flist.xbf_count && !flist.xbf_first); 996 ASSERT(!flist.xbf_count && !flist.xbf_first);
992 if (error) { 997 if (error) {
993 /* something screwed, just bail */ 998 /* something screwed, just bail */
994 xfs_fs_cmn_err(CE_ALERT, ip->i_mount, 999 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
1000 xfs_fs_cmn_err(CE_ALERT, ip->i_mount,
995 "page discard unable to remove delalloc mapping."); 1001 "page discard unable to remove delalloc mapping.");
1002 }
996 break; 1003 break;
997 } 1004 }
998next_buffer: 1005next_buffer:
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 6f76ba85f193..bd111b7e1daa 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -168,75 +168,6 @@ test_page_region(
168} 168}
169 169
170/* 170/*
171 * Mapping of multi-page buffers into contiguous virtual space
172 */
173
174typedef struct a_list {
175 void *vm_addr;
176 struct a_list *next;
177} a_list_t;
178
179static a_list_t *as_free_head;
180static int as_list_len;
181static DEFINE_SPINLOCK(as_lock);
182
183/*
184 * Try to batch vunmaps because they are costly.
185 */
186STATIC void
187free_address(
188 void *addr)
189{
190 a_list_t *aentry;
191
192#ifdef CONFIG_XEN
193 /*
194 * Xen needs to be able to make sure it can get an exclusive
195 * RO mapping of pages it wants to turn into a pagetable. If
196 * a newly allocated page is also still being vmap()ed by xfs,
197 * it will cause pagetable construction to fail. This is a
198 * quick workaround to always eagerly unmap pages so that Xen
199 * is happy.
200 */
201 vunmap(addr);
202 return;
203#endif
204
205 aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT);
206 if (likely(aentry)) {
207 spin_lock(&as_lock);
208 aentry->next = as_free_head;
209 aentry->vm_addr = addr;
210 as_free_head = aentry;
211 as_list_len++;
212 spin_unlock(&as_lock);
213 } else {
214 vunmap(addr);
215 }
216}
217
218STATIC void
219purge_addresses(void)
220{
221 a_list_t *aentry, *old;
222
223 if (as_free_head == NULL)
224 return;
225
226 spin_lock(&as_lock);
227 aentry = as_free_head;
228 as_free_head = NULL;
229 as_list_len = 0;
230 spin_unlock(&as_lock);
231
232 while ((old = aentry) != NULL) {
233 vunmap(aentry->vm_addr);
234 aentry = aentry->next;
235 kfree(old);
236 }
237}
238
239/*
240 * Internal xfs_buf_t object manipulation 171 * Internal xfs_buf_t object manipulation
241 */ 172 */
242 173
@@ -337,7 +268,8 @@ xfs_buf_free(
337 uint i; 268 uint i;
338 269
339 if (xfs_buf_is_vmapped(bp)) 270 if (xfs_buf_is_vmapped(bp))
340 free_address(bp->b_addr - bp->b_offset); 271 vm_unmap_ram(bp->b_addr - bp->b_offset,
272 bp->b_page_count);
341 273
342 for (i = 0; i < bp->b_page_count; i++) { 274 for (i = 0; i < bp->b_page_count; i++) {
343 struct page *page = bp->b_pages[i]; 275 struct page *page = bp->b_pages[i];
@@ -457,10 +389,8 @@ _xfs_buf_map_pages(
457 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; 389 bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset;
458 bp->b_flags |= XBF_MAPPED; 390 bp->b_flags |= XBF_MAPPED;
459 } else if (flags & XBF_MAPPED) { 391 } else if (flags & XBF_MAPPED) {
460 if (as_list_len > 64) 392 bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count,
461 purge_addresses(); 393 -1, PAGE_KERNEL);
462 bp->b_addr = vmap(bp->b_pages, bp->b_page_count,
463 VM_MAP, PAGE_KERNEL);
464 if (unlikely(bp->b_addr == NULL)) 394 if (unlikely(bp->b_addr == NULL))
465 return -ENOMEM; 395 return -ENOMEM;
466 bp->b_addr += bp->b_offset; 396 bp->b_addr += bp->b_offset;
@@ -1955,9 +1885,6 @@ xfsbufd(
1955 xfs_buf_iostrategy(bp); 1885 xfs_buf_iostrategy(bp);
1956 count++; 1886 count++;
1957 } 1887 }
1958
1959 if (as_list_len > 0)
1960 purge_addresses();
1961 if (count) 1888 if (count)
1962 blk_run_address_space(target->bt_mapping); 1889 blk_run_address_space(target->bt_mapping);
1963 1890
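
vm_map_ram()/vm_unmap_ram() replace the hand-rolled vunmap batching removed above: the vmalloc core now defers and batches the TLB flushes itself, so xfsbufd no longer needs purge_addresses(). Minimal usage, mirroring the change (sketch only):

    /* map a buffer's pages into one contiguous kernel range (sketch) */
    static void *map_pages(struct page **pages, unsigned int count)
    {
            /* node = -1: no NUMA preference */
            return vm_map_ram(pages, count, -1, PAGE_KERNEL);
    }

    static void unmap_pages(void *addr, unsigned int count)
    {
            vm_unmap_ram(addr, count);      /* TLB flush may be deferred */
    }
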