aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ceph
diff options
context:
space:
mode:
authorYan, Zheng <zyan@redhat.com>2017-12-14 02:11:09 -0500
committerIlya Dryomov <idryomov@gmail.com>2019-03-05 12:55:16 -0500
commit75c9627efb7288e1725e9903ea275cc6b5992f17 (patch)
tree065ca7fbb792935029bc5c94875ead92cca81b9e /fs/ceph
parent81c5a1487e52a316e5e7d79e9911376648a79e85 (diff)
ceph: map snapid to anonymous bdev ID
ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID to the snapid directly for snapshot inodes. This is not good because userspace utilities may consider a device ID of 0 as invalid, and a snapid may conflict with another device's ID. This patch introduces a "snapids to anonymous bdev IDs" map. We create a new mapping when we see a snapid for the first time. We trim an unused mapping after it has been idle for 5 minutes. Link: http://tracker.ceph.com/issues/22353 Signed-off-by: "Yan, Zheng" <zyan@redhat.com> Acked-by: Jeff Layton <jlayton@redhat.com> Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Diffstat (limited to 'fs/ceph')
-rw-r--r--fs/ceph/inode.c37
-rw-r--r--fs/ceph/mds_client.c8
-rw-r--r--fs/ceph/mds_client.h13
-rw-r--r--fs/ceph/snap.c156
-rw-r--r--fs/ceph/super.h13
5 files changed, 210 insertions, 17 deletions
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c080b133c08b..7f82ceff510a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode)
548 */ 548 */
549 if (ci->i_snap_realm) { 549 if (ci->i_snap_realm) {
550 struct ceph_mds_client *mdsc = 550 struct ceph_mds_client *mdsc =
551 ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; 551 ceph_inode_to_client(inode)->mdsc;
552 struct ceph_snap_realm *realm = ci->i_snap_realm; 552 if (ceph_snap(inode) == CEPH_NOSNAP) {
553 553 struct ceph_snap_realm *realm = ci->i_snap_realm;
554 dout(" dropping residual ref to snap realm %p\n", realm); 554 dout(" dropping residual ref to snap realm %p\n",
555 spin_lock(&realm->inodes_with_caps_lock); 555 realm);
556 list_del_init(&ci->i_snap_realm_item); 556 spin_lock(&realm->inodes_with_caps_lock);
557 ci->i_snap_realm = NULL; 557 list_del_init(&ci->i_snap_realm_item);
558 if (realm->ino == ci->i_vino.ino) 558 ci->i_snap_realm = NULL;
559 realm->inode = NULL; 559 if (realm->ino == ci->i_vino.ino)
560 spin_unlock(&realm->inodes_with_caps_lock); 560 realm->inode = NULL;
561 ceph_put_snap_realm(mdsc, realm); 561 spin_unlock(&realm->inodes_with_caps_lock);
562 ceph_put_snap_realm(mdsc, realm);
563 } else {
564 ceph_put_snapid_map(mdsc, ci->i_snapid_map);
565 ci->i_snap_realm = NULL;
566 }
562 } 567 }
563 568
564 kfree(ci->i_symlink); 569 kfree(ci->i_symlink);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
776 pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data, 781 pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
777 iinfo->pool_ns_len); 782 iinfo->pool_ns_len);
778 783
784 if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
785 ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
786
779 spin_lock(&ci->i_ceph_lock); 787 spin_lock(&ci->i_ceph_lock);
780 788
781 /* 789 /*
@@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
2260 if (!err) { 2268 if (!err) {
2261 generic_fillattr(inode, stat); 2269 generic_fillattr(inode, stat);
2262 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); 2270 stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
2263 if (ceph_snap(inode) != CEPH_NOSNAP) 2271 if (ceph_snap(inode) == CEPH_NOSNAP)
2264 stat->dev = ceph_snap(inode); 2272 stat->dev = inode->i_sb->s_dev;
2265 else 2273 else
2266 stat->dev = 0; 2274 stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
2275
2267 if (S_ISDIR(inode->i_mode)) { 2276 if (S_ISDIR(inode->i_mode)) {
2268 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb), 2277 if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
2269 RBYTES)) 2278 RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index cce4e4b9ea57..f2f57775d2d5 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3791,6 +3791,8 @@ static void delayed_work(struct work_struct *work)
3791 dout("mdsc delayed_work\n"); 3791 dout("mdsc delayed_work\n");
3792 ceph_check_delayed_caps(mdsc); 3792 ceph_check_delayed_caps(mdsc);
3793 3793
3794 ceph_trim_snapid_map(mdsc);
3795
3794 mutex_lock(&mdsc->mutex); 3796 mutex_lock(&mdsc->mutex);
3795 renew_interval = mdsc->mdsmap->m_session_timeout >> 2; 3797 renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
3796 renew_caps = time_after_eq(jiffies, HZ*renew_interval + 3798 renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -3893,6 +3895,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3893 ceph_caps_init(mdsc); 3895 ceph_caps_init(mdsc);
3894 ceph_adjust_min_caps(mdsc, fsc->min_caps); 3896 ceph_adjust_min_caps(mdsc, fsc->min_caps);
3895 3897
3898 spin_lock_init(&mdsc->snapid_map_lock);
3899 mdsc->snapid_map_tree = RB_ROOT;
3900 INIT_LIST_HEAD(&mdsc->snapid_map_lru);
3901
3896 init_rwsem(&mdsc->pool_perm_rwsem); 3902 init_rwsem(&mdsc->pool_perm_rwsem);
3897 mdsc->pool_perm_tree = RB_ROOT; 3903 mdsc->pool_perm_tree = RB_ROOT;
3898 3904
@@ -4086,6 +4092,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
4086 WARN_ON(!list_empty(&mdsc->cap_delay_list)); 4092 WARN_ON(!list_empty(&mdsc->cap_delay_list));
4087 mutex_unlock(&mdsc->mutex); 4093 mutex_unlock(&mdsc->mutex);
4088 4094
4095 ceph_cleanup_snapid_map(mdsc);
4096
4089 ceph_cleanup_empty_realms(mdsc); 4097 ceph_cleanup_empty_realms(mdsc);
4090 4098
4091 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */ 4099 cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4f962642fee4..d3a5c4046316 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -313,6 +313,15 @@ struct ceph_pool_perm {
313 char pool_ns[]; 313 char pool_ns[];
314}; 314};
315 315
/*
 * One snapid -> anonymous device number mapping.  Entries are kept in
 * mdsc->snapid_map_tree and, while unreferenced, on mdsc->snapid_map_lru;
 * both are protected by mdsc->snapid_map_lock.
 */
316struct ceph_snapid_map {
317 struct rb_node node; /* link in mdsc->snapid_map_tree, keyed by snap */
318 struct list_head lru; /* on mdsc->snapid_map_lru while ref is zero */
319 atomic_t ref; /* number of users holding this mapping */
320 u64 snap; /* snapshot id this entry maps */
321 dev_t dev; /* device number obtained from get_anon_bdev() */
322 unsigned long last_used; /* jiffies when ref last dropped to zero */
323};
324
316/* 325/*
317 * mds client state 326 * mds client state
318 */ 327 */
@@ -390,6 +399,10 @@ struct ceph_mds_client {
390 struct list_head dentry_lru; 399 struct list_head dentry_lru;
391 int num_dentry; 400 int num_dentry;
392 401
402 spinlock_t snapid_map_lock;
403 struct rb_root snapid_map_tree;
404 struct list_head snapid_map_lru;
405
393 struct rw_semaphore pool_perm_rwsem; 406 struct rw_semaphore pool_perm_rwsem;
394 struct rb_root pool_perm_tree; 407 struct rb_root pool_perm_tree;
395 408
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index dfc25ceeffed..89aa37fa0f84 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
3 3
4#include <linux/sort.h> 4#include <linux/sort.h>
5#include <linux/slab.h> 5#include <linux/slab.h>
6
7#include "super.h" 6#include "super.h"
8#include "mds_client.h" 7#include "mds_client.h"
9
10#include <linux/ceph/decode.h> 8#include <linux/ceph/decode.h>
11 9
10/* unused map expires after 5 minutes */
11#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
12
12/* 13/*
13 * Snapshots in ceph are driven in large part by cooperation from the 14 * Snapshots in ceph are driven in large part by cooperation from the
14 * client. In contrast to local file systems or file servers that 15 * client. In contrast to local file systems or file servers that
@@ -989,3 +990,154 @@ out:
989 up_write(&mdsc->snap_rwsem); 990 up_write(&mdsc->snap_rwsem);
990 return; 991 return;
991} 992}
993
994struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
995 u64 snap)
996{
997 struct ceph_snapid_map *sm, *exist;
998 struct rb_node **p, *parent;
999 int ret;
1000
1001 exist = NULL;
1002 spin_lock(&mdsc->snapid_map_lock);
1003 p = &mdsc->snapid_map_tree.rb_node;
1004 while (*p) {
1005 exist = rb_entry(*p, struct ceph_snapid_map, node);
1006 if (snap > exist->snap) {
1007 p = &(*p)->rb_left;
1008 } else if (snap < exist->snap) {
1009 p = &(*p)->rb_right;
1010 } else {
1011 if (atomic_inc_return(&exist->ref) == 1)
1012 list_del_init(&exist->lru);
1013 break;
1014 }
1015 exist = NULL;
1016 }
1017 spin_unlock(&mdsc->snapid_map_lock);
1018 if (exist) {
1019 dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
1020 return exist;
1021 }
1022
1023 sm = kmalloc(sizeof(*sm), GFP_NOFS);
1024 if (!sm)
1025 return NULL;
1026
1027 ret = get_anon_bdev(&sm->dev);
1028 if (ret < 0) {
1029 kfree(sm);
1030 return NULL;
1031 }
1032
1033 INIT_LIST_HEAD(&sm->lru);
1034 atomic_set(&sm->ref, 1);
1035 sm->snap = snap;
1036
1037 exist = NULL;
1038 parent = NULL;
1039 p = &mdsc->snapid_map_tree.rb_node;
1040 spin_lock(&mdsc->snapid_map_lock);
1041 while (*p) {
1042 parent = *p;
1043 exist = rb_entry(*p, struct ceph_snapid_map, node);
1044 if (snap > exist->snap)
1045 p = &(*p)->rb_left;
1046 else if (snap < exist->snap)
1047 p = &(*p)->rb_right;
1048 else
1049 break;
1050 exist = NULL;
1051 }
1052 if (exist) {
1053 if (atomic_inc_return(&exist->ref) == 1)
1054 list_del_init(&exist->lru);
1055 } else {
1056 rb_link_node(&sm->node, parent, p);
1057 rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
1058 }
1059 spin_unlock(&mdsc->snapid_map_lock);
1060 if (exist) {
1061 free_anon_bdev(sm->dev);
1062 kfree(sm);
1063 dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
1064 return exist;
1065 }
1066
1067 dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
1068 return sm;
1069}
1070
1071void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
1072 struct ceph_snapid_map *sm)
1073{
1074 if (!sm)
1075 return;
1076 if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
1077 if (!RB_EMPTY_NODE(&sm->node)) {
1078 sm->last_used = jiffies;
1079 list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
1080 spin_unlock(&mdsc->snapid_map_lock);
1081 } else {
1082 /* already cleaned up by
1083 * ceph_cleanup_snapid_map() */
1084 spin_unlock(&mdsc->snapid_map_lock);
1085 kfree(sm);
1086 }
1087 }
1088}
1089
1090void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
1091{
1092 struct ceph_snapid_map *sm;
1093 unsigned long now;
1094 LIST_HEAD(to_free);
1095
1096 spin_lock(&mdsc->snapid_map_lock);
1097 now = jiffies;
1098
1099 while (!list_empty(&mdsc->snapid_map_lru)) {
1100 sm = list_first_entry(&mdsc->snapid_map_lru,
1101 struct ceph_snapid_map, lru);
1102 if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
1103 break;
1104
1105 rb_erase(&sm->node, &mdsc->snapid_map_tree);
1106 list_move(&sm->lru, &to_free);
1107 }
1108 spin_unlock(&mdsc->snapid_map_lock);
1109
1110 while (!list_empty(&to_free)) {
1111 sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1112 list_del(&sm->lru);
1113 dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
1114 free_anon_bdev(sm->dev);
1115 kfree(sm);
1116 }
1117}
1118
1119void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
1120{
1121 struct ceph_snapid_map *sm;
1122 struct rb_node *p;
1123 LIST_HEAD(to_free);
1124
1125 spin_lock(&mdsc->snapid_map_lock);
1126 while ((p = rb_first(&mdsc->snapid_map_tree))) {
1127 sm = rb_entry(p, struct ceph_snapid_map, node);
1128 rb_erase(p, &mdsc->snapid_map_tree);
1129 RB_CLEAR_NODE(p);
1130 list_move(&sm->lru, &to_free);
1131 }
1132 spin_unlock(&mdsc->snapid_map_lock);
1133
1134 while (!list_empty(&to_free)) {
1135 sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
1136 list_del(&sm->lru);
1137 free_anon_bdev(sm->dev);
1138 if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
1139 pr_err("snapid map %llx -> %x still in use\n",
1140 sm->snap, sm->dev);
1141 }
1142 }
1143}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..5b15ae91024a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -370,7 +370,10 @@ struct ceph_inode_info {
370 struct list_head i_unsafe_iops; /* uncommitted mds inode ops */ 370 struct list_head i_unsafe_iops; /* uncommitted mds inode ops */
371 spinlock_t i_unsafe_lock; 371 spinlock_t i_unsafe_lock;
372 372
373 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */ 373 union {
374 struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
375 struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
376 };
374 int i_snap_realm_counter; /* snap realm (if caps) */ 377 int i_snap_realm_counter; /* snap realm (if caps) */
375 struct list_head i_snap_realm_item; 378 struct list_head i_snap_realm_item;
376 struct list_head i_snap_flush_item; 379 struct list_head i_snap_flush_item;
@@ -837,6 +840,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
837 struct ceph_cap_snap *capsnap); 840 struct ceph_cap_snap *capsnap);
838extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); 841extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
839 842
/*
 * snapid -> anonymous device number map (implemented in snap.c).
 */
/* find or create the mapping for @snap and take a reference; NULL on failure */
843extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
844 u64 snap);
/* drop a reference; @sm may be NULL */
845extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
846 struct ceph_snapid_map *sm);
/* free unreferenced mappings whose idle time reached the timeout */
847extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
/* detach every mapping from the tree and release its device number */
848extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
849
850
840/* 851/*
841 * a cap_snap is "pending" if it is still awaiting an in-progress 852 * a cap_snap is "pending" if it is still awaiting an in-progress
842 * sync write (that may/may not still update size, mtime, etc.). 853 * sync write (that may/may not still update size, mtime, etc.).