author     Yan, Zheng <zyan@redhat.com>          2017-12-14 02:11:09 -0500
committer  Ilya Dryomov <idryomov@gmail.com>     2019-03-05 12:55:16 -0500
commit     75c9627efb7288e1725e9903ea275cc6b5992f17 (patch)
tree       065ca7fbb792935029bc5c94875ead92cca81b9e /fs/ceph
parent     81c5a1487e52a316e5e7d79e9911376648a79e85 (diff)
ceph: map snapid to anonymous bdev ID
ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID
directly to the snapid for snapshot inodes. This is not good because
userspace utilities may consider a device ID of 0 invalid, and a snapid
may conflict with another device's ID.
This patch introduces a "snapid to anonymous bdev ID" map. We create a
new mapping when we see a snapid for the first time, and trim a mapping
after it has been idle for 5 minutes.
Link: http://tracker.ceph.com/issues/22353
Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
Acked-by: Jeff Layton <jlayton@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
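
To make the motivation above concrete, here is a minimal userspace sketch
(not part of the patch; the mount point and snapshot name are hypothetical)
that stats a file through its head path and through a snapshot path and
compares st_dev. With this change the snapshot inode reports its own
non-zero anonymous device number rather than 0 or a raw snapid:

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
	struct stat head, snap;

	/* hypothetical CephFS mount and snapshot name */
	if (stat("/mnt/cephfs/file", &head) != 0 ||
	    stat("/mnt/cephfs/.snap/snap1/file", &snap) != 0)
		return 1;

	/* the snapshot inode now carries a dev_t from get_anon_bdev() */
	printf("head st_dev=%#llx snap st_dev=%#llx\n",
	       (unsigned long long)head.st_dev,
	       (unsigned long long)snap.st_dev);
	return 0;
}

Because each snapid gets its own anonymous dev_t, utilities that compare
st_dev (for example du -x or find -xdev) treat the snapshot tree as a
separate device instead of seeing an invalid zero ID or one that collides
with a real device.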
Diffstat (limited to 'fs/ceph')
-rw-r--r--   fs/ceph/inode.c      |  37
-rw-r--r--   fs/ceph/mds_client.c |   8
-rw-r--r--   fs/ceph/mds_client.h |  13
-rw-r--r--   fs/ceph/snap.c       | 156
-rw-r--r--   fs/ceph/super.h      |  13
5 files changed, 210 insertions(+), 17 deletions(-)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c080b133c08b..7f82ceff510a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -548,17 +548,22 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
-		struct ceph_snap_realm *realm = ci->i_snap_realm;
-
-		dout(" dropping residual ref to snap realm %p\n", realm);
-		spin_lock(&realm->inodes_with_caps_lock);
-		list_del_init(&ci->i_snap_realm_item);
-		ci->i_snap_realm = NULL;
-		if (realm->ino == ci->i_vino.ino)
-			realm->inode = NULL;
-		spin_unlock(&realm->inodes_with_caps_lock);
-		ceph_put_snap_realm(mdsc, realm);
+					ceph_inode_to_client(inode)->mdsc;
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			struct ceph_snap_realm *realm = ci->i_snap_realm;
+			dout(" dropping residual ref to snap realm %p\n",
+			     realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_del_init(&ci->i_snap_realm_item);
+			ci->i_snap_realm = NULL;
+			if (realm->ino == ci->i_vino.ino)
+				realm->inode = NULL;
+			spin_unlock(&realm->inodes_with_caps_lock);
+			ceph_put_snap_realm(mdsc, realm);
+		} else {
+			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+			ci->i_snap_realm = NULL;
+		}
 	}
 
 	kfree(ci->i_symlink);
@@ -776,6 +781,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
 						     iinfo->pool_ns_len);
 
+	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
 	spin_lock(&ci->i_ceph_lock);
 
 	/*
@@ -2260,10 +2268,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
-		if (ceph_snap(inode) != CEPH_NOSNAP)
-			stat->dev = ceph_snap(inode);
+		if (ceph_snap(inode) == CEPH_NOSNAP)
+			stat->dev = inode->i_sb->s_dev;
 		else
-			stat->dev = 0;
+			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
 		if (S_ISDIR(inode->i_mode)) {
 			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
 						RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index cce4e4b9ea57..f2f57775d2d5 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3791,6 +3791,8 @@ static void delayed_work(struct work_struct *work)
 	dout("mdsc delayed_work\n");
 	ceph_check_delayed_caps(mdsc);
 
+	ceph_trim_snapid_map(mdsc);
+
 	mutex_lock(&mdsc->mutex);
 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -3893,6 +3895,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	ceph_caps_init(mdsc);
 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
+	spin_lock_init(&mdsc->snapid_map_lock);
+	mdsc->snapid_map_tree = RB_ROOT;
+	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
+
 	init_rwsem(&mdsc->pool_perm_rwsem);
 	mdsc->pool_perm_tree = RB_ROOT;
 
@@ -4086,6 +4092,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
 	mutex_unlock(&mdsc->mutex);
 
+	ceph_cleanup_snapid_map(mdsc);
+
 	ceph_cleanup_empty_realms(mdsc);
 
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 4f962642fee4..d3a5c4046316 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -313,6 +313,15 @@ struct ceph_pool_perm {
 	char pool_ns[];
 };
 
+struct ceph_snapid_map {
+	struct rb_node node;
+	struct list_head lru;
+	atomic_t ref;
+	u64 snap;
+	dev_t dev;
+	unsigned long last_used;
+};
+
 /*
  * mds client state
 */
@@ -390,6 +399,10 @@ struct ceph_mds_client {
 	struct list_head dentry_lru;
 	int num_dentry;
 
+	spinlock_t snapid_map_lock;
+	struct rb_root snapid_map_tree;
+	struct list_head snapid_map_lru;
+
 	struct rw_semaphore pool_perm_rwsem;
 	struct rb_root pool_perm_tree;
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index dfc25ceeffed..89aa37fa0f84 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -3,12 +3,13 @@
 
 #include <linux/sort.h>
 #include <linux/slab.h>
-
 #include "super.h"
 #include "mds_client.h"
-
 #include <linux/ceph/decode.h>
 
+/* unused map expires after 5 minutes */
+#define CEPH_SNAPID_MAP_TIMEOUT (5 * 60 * HZ)
+
 /*
  * Snapshots in ceph are driven in large part by cooperation from the
  * client.  In contrast to local file systems or file servers that
@@ -989,3 +990,154 @@ out:
 	up_write(&mdsc->snap_rwsem);
 	return;
 }
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+					    u64 snap)
+{
+	struct ceph_snapid_map *sm, *exist;
+	struct rb_node **p, *parent;
+	int ret;
+
+	exist = NULL;
+	spin_lock(&mdsc->snapid_map_lock);
+	p = &mdsc->snapid_map_tree.rb_node;
+	while (*p) {
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap) {
+			p = &(*p)->rb_left;
+		} else if (snap < exist->snap) {
+			p = &(*p)->rb_right;
+		} else {
+			if (atomic_inc_return(&exist->ref) == 1)
+				list_del_init(&exist->lru);
+			break;
+		}
+		exist = NULL;
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+
+	sm = kmalloc(sizeof(*sm), GFP_NOFS);
+	if (!sm)
+		return NULL;
+
+	ret = get_anon_bdev(&sm->dev);
+	if (ret < 0) {
+		kfree(sm);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&sm->lru);
+	atomic_set(&sm->ref, 1);
+	sm->snap = snap;
+
+	exist = NULL;
+	parent = NULL;
+	p = &mdsc->snapid_map_tree.rb_node;
+	spin_lock(&mdsc->snapid_map_lock);
+	while (*p) {
+		parent = *p;
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap)
+			p = &(*p)->rb_left;
+		else if (snap < exist->snap)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	if (exist) {
+		if (atomic_inc_return(&exist->ref) == 1)
+			list_del_init(&exist->lru);
+	} else {
+		rb_link_node(&sm->node, parent, p);
+		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+
+	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+	return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+			 struct ceph_snapid_map *sm)
+{
+	if (!sm)
+		return;
+	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+		if (!RB_EMPTY_NODE(&sm->node)) {
+			sm->last_used = jiffies;
+			list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+			spin_unlock(&mdsc->snapid_map_lock);
+		} else {
+			/* already cleaned up by
+			 * ceph_cleanup_snapid_map() */
+			spin_unlock(&mdsc->snapid_map_lock);
+			kfree(sm);
+		}
+	}
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	unsigned long now;
+	LIST_HEAD(to_free);
+
+	spin_lock(&mdsc->snapid_map_lock);
+	now = jiffies;
+
+	while (!list_empty(&mdsc->snapid_map_lru)) {
+		sm = list_first_entry(&mdsc->snapid_map_lru,
+				      struct ceph_snapid_map, lru);
+		if (time_after(sm->last_used + CEPH_SNAPID_MAP_TIMEOUT, now))
+			break;
+
+		rb_erase(&sm->node, &mdsc->snapid_map_tree);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+	}
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	struct rb_node *p;
+	LIST_HEAD(to_free);
+
+	spin_lock(&mdsc->snapid_map_lock);
+	while ((p = rb_first(&mdsc->snapid_map_tree))) {
+		sm = rb_entry(p, struct ceph_snapid_map, node);
+		rb_erase(p, &mdsc->snapid_map_tree);
+		RB_CLEAR_NODE(p);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		free_anon_bdev(sm->dev);
+		if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
+			pr_err("snapid map %llx -> %x still in use\n",
+			       sm->snap, sm->dev);
+		}
+	}
+}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index dfb64a5211b6..5b15ae91024a 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -370,7 +370,10 @@ struct ceph_inode_info {
 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
-	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	union {
+		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+	};
 	int i_snap_realm_counter; /* snap realm (if caps) */
 	struct list_head i_snap_realm_item;
 	struct list_head i_snap_flush_item;
@@ -837,6 +840,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 				  struct ceph_cap_snap *capsnap);
 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
 
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+						   u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+				struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
 /*
  * a cap_snap is "pending" if it is still awaiting an in-progress
  * sync write (that may/may not still update size, mtime, etc.).