diff options
| author | Eric Dumazet <edumazet@google.com> | 2017-09-19 19:27:06 -0400 |
|---|---|---|
| committer | David S. Miller <davem@davemloft.net> | 2017-09-19 19:32:23 -0400 |
| commit | a90c9347e90ed1e9323d71402ed18023bc910cd8 (patch) | |
| tree | 2b5eeeed2992686abdeb1121705aada9a5c44aff /net/ipv6/addrlabel.c | |
| parent | d464e84eed02993d40ad55fdc19f4523e4deee5b (diff) | |
ipv6: addrlabel: per netns list
Having a global list of labels does not scale to thousands of
netns in the cloud era. This causes quadratic behavior on
netns creation and deletion.
It is time to have a per netns list of ~10 labels.
Tested:
$ time perf record (for f in `seq 1 3000` ; do ip netns add tast$f; done)
[ perf record: Woken up 1 times to write data ]
[ perf record: Captured and wrote 3.637 MB perf.data (~158898 samples) ]
real 0m20.837s # instead of 0m24.227s
user 0m0.328s
sys 0m20.338s # instead of 0m23.753s
16.17% ip [kernel.kallsyms] [k] netlink_broadcast_filtered
12.30% ip [kernel.kallsyms] [k] netlink_has_listeners
6.76% ip [kernel.kallsyms] [k] _raw_spin_lock_irqsave
5.78% ip [kernel.kallsyms] [k] memset_erms
5.77% ip [kernel.kallsyms] [k] kobject_uevent_env
5.18% ip [kernel.kallsyms] [k] refcount_sub_and_test
4.96% ip [kernel.kallsyms] [k] _raw_read_lock
3.82% ip [kernel.kallsyms] [k] refcount_inc_not_zero
3.33% ip [kernel.kallsyms] [k] _raw_spin_unlock_irqrestore
2.11% ip [kernel.kallsyms] [k] unmap_page_range
1.77% ip [kernel.kallsyms] [k] __wake_up
1.69% ip [kernel.kallsyms] [k] strlen
1.17% ip [kernel.kallsyms] [k] __wake_up_common
1.09% ip [kernel.kallsyms] [k] insert_header
1.04% ip [kernel.kallsyms] [k] page_remove_rmap
1.01% ip [kernel.kallsyms] [k] consume_skb
0.98% ip [kernel.kallsyms] [k] netlink_trim
0.51% ip [kernel.kallsyms] [k] kernfs_link_sibling
0.51% ip [kernel.kallsyms] [k] filemap_map_pages
0.46% ip [kernel.kallsyms] [k] memcpy_erms
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net/ipv6/addrlabel.c')
| -rw-r--r-- | net/ipv6/addrlabel.c | 81 |
1 files changed, 30 insertions, 51 deletions
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index b055bc79f56d..c6311d7108f6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c | |||
| @@ -30,7 +30,6 @@ | |||
| 30 | * Policy Table | 30 | * Policy Table |
| 31 | */ | 31 | */ |
| 32 | struct ip6addrlbl_entry { | 32 | struct ip6addrlbl_entry { |
| 33 | possible_net_t lbl_net; | ||
| 34 | struct in6_addr prefix; | 33 | struct in6_addr prefix; |
| 35 | int prefixlen; | 34 | int prefixlen; |
| 36 | int ifindex; | 35 | int ifindex; |
| @@ -41,19 +40,6 @@ struct ip6addrlbl_entry { | |||
| 41 | struct rcu_head rcu; | 40 | struct rcu_head rcu; |
| 42 | }; | 41 | }; |
| 43 | 42 | ||
| 44 | static struct ip6addrlbl_table | ||
| 45 | { | ||
| 46 | struct hlist_head head; | ||
| 47 | spinlock_t lock; | ||
| 48 | u32 seq; | ||
| 49 | } ip6addrlbl_table; | ||
| 50 | |||
| 51 | static inline | ||
| 52 | struct net *ip6addrlbl_net(const struct ip6addrlbl_entry *lbl) | ||
| 53 | { | ||
| 54 | return read_pnet(&lbl->lbl_net); | ||
| 55 | } | ||
| 56 | |||
| 57 | /* | 43 | /* |
| 58 | * Default policy table (RFC6724 + extensions) | 44 | * Default policy table (RFC6724 + extensions) |
| 59 | * | 45 | * |
| @@ -148,13 +134,10 @@ static inline void ip6addrlbl_put(struct ip6addrlbl_entry *p) | |||
| 148 | } | 134 | } |
| 149 | 135 | ||
| 150 | /* Find label */ | 136 | /* Find label */ |
| 151 | static bool __ip6addrlbl_match(struct net *net, | 137 | static bool __ip6addrlbl_match(const struct ip6addrlbl_entry *p, |
| 152 | const struct ip6addrlbl_entry *p, | ||
| 153 | const struct in6_addr *addr, | 138 | const struct in6_addr *addr, |
| 154 | int addrtype, int ifindex) | 139 | int addrtype, int ifindex) |
| 155 | { | 140 | { |
| 156 | if (!net_eq(ip6addrlbl_net(p), net)) | ||
| 157 | return false; | ||
| 158 | if (p->ifindex && p->ifindex != ifindex) | 141 | if (p->ifindex && p->ifindex != ifindex) |
| 159 | return false; | 142 | return false; |
| 160 | if (p->addrtype && p->addrtype != addrtype) | 143 | if (p->addrtype && p->addrtype != addrtype) |
| @@ -169,8 +152,9 @@ static struct ip6addrlbl_entry *__ipv6_addr_label(struct net *net, | |||
| 169 | int type, int ifindex) | 152 | int type, int ifindex) |
| 170 | { | 153 | { |
| 171 | struct ip6addrlbl_entry *p; | 154 | struct ip6addrlbl_entry *p; |
| 172 | hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { | 155 | |
| 173 | if (__ip6addrlbl_match(net, p, addr, type, ifindex)) | 156 | hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { |
| 157 | if (__ip6addrlbl_match(p, addr, type, ifindex)) | ||
| 174 | return p; | 158 | return p; |
| 175 | } | 159 | } |
| 176 | return NULL; | 160 | return NULL; |
| @@ -196,8 +180,7 @@ u32 ipv6_addr_label(struct net *net, | |||
| 196 | } | 180 | } |
| 197 | 181 | ||
| 198 | /* allocate one entry */ | 182 | /* allocate one entry */ |
| 199 | static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, | 183 | static struct ip6addrlbl_entry *ip6addrlbl_alloc(const struct in6_addr *prefix, |
| 200 | const struct in6_addr *prefix, | ||
| 201 | int prefixlen, int ifindex, | 184 | int prefixlen, int ifindex, |
| 202 | u32 label) | 185 | u32 label) |
| 203 | { | 186 | { |
| @@ -236,24 +219,23 @@ static struct ip6addrlbl_entry *ip6addrlbl_alloc(struct net *net, | |||
| 236 | newp->addrtype = addrtype; | 219 | newp->addrtype = addrtype; |
| 237 | newp->label = label; | 220 | newp->label = label; |
| 238 | INIT_HLIST_NODE(&newp->list); | 221 | INIT_HLIST_NODE(&newp->list); |
| 239 | write_pnet(&newp->lbl_net, net); | ||
| 240 | refcount_set(&newp->refcnt, 1); | 222 | refcount_set(&newp->refcnt, 1); |
| 241 | return newp; | 223 | return newp; |
| 242 | } | 224 | } |
| 243 | 225 | ||
| 244 | /* add a label */ | 226 | /* add a label */ |
| 245 | static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) | 227 | static int __ip6addrlbl_add(struct net *net, struct ip6addrlbl_entry *newp, |
| 228 | int replace) | ||
| 246 | { | 229 | { |
| 247 | struct hlist_node *n; | ||
| 248 | struct ip6addrlbl_entry *last = NULL, *p = NULL; | 230 | struct ip6addrlbl_entry *last = NULL, *p = NULL; |
| 231 | struct hlist_node *n; | ||
| 249 | int ret = 0; | 232 | int ret = 0; |
| 250 | 233 | ||
| 251 | ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, | 234 | ADDRLABEL(KERN_DEBUG "%s(newp=%p, replace=%d)\n", __func__, newp, |
| 252 | replace); | 235 | replace); |
| 253 | 236 | ||
| 254 | hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { | 237 | hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { |
| 255 | if (p->prefixlen == newp->prefixlen && | 238 | if (p->prefixlen == newp->prefixlen && |
| 256 | net_eq(ip6addrlbl_net(p), ip6addrlbl_net(newp)) && | ||
| 257 | p->ifindex == newp->ifindex && | 239 | p->ifindex == newp->ifindex && |
| 258 | ipv6_addr_equal(&p->prefix, &newp->prefix)) { | 240 | ipv6_addr_equal(&p->prefix, &newp->prefix)) { |
| 259 | if (!replace) { | 241 | if (!replace) { |
| @@ -273,10 +255,10 @@ static int __ip6addrlbl_add(struct ip6addrlbl_entry *newp, int replace) | |||
| 273 | if (last) | 255 | if (last) |
| 274 | hlist_add_behind_rcu(&newp->list, &last->list); | 256 | hlist_add_behind_rcu(&newp->list, &last->list); |
| 275 | else | 257 | else |
| 276 | hlist_add_head_rcu(&newp->list, &ip6addrlbl_table.head); | 258 | hlist_add_head_rcu(&newp->list, &net->ipv6.ip6addrlbl_table.head); |
| 277 | out: | 259 | out: |
| 278 | if (!ret) | 260 | if (!ret) |
| 279 | ip6addrlbl_table.seq++; | 261 | net->ipv6.ip6addrlbl_table.seq++; |
| 280 | return ret; | 262 | return ret; |
| 281 | } | 263 | } |
| 282 | 264 | ||
| @@ -292,12 +274,12 @@ static int ip6addrlbl_add(struct net *net, | |||
| 292 | __func__, prefix, prefixlen, ifindex, (unsigned int)label, | 274 | __func__, prefix, prefixlen, ifindex, (unsigned int)label, |
| 293 | replace); | 275 | replace); |
| 294 | 276 | ||
| 295 | newp = ip6addrlbl_alloc(net, prefix, prefixlen, ifindex, label); | 277 | newp = ip6addrlbl_alloc(prefix, prefixlen, ifindex, label); |
| 296 | if (IS_ERR(newp)) | 278 | if (IS_ERR(newp)) |
| 297 | return PTR_ERR(newp); | 279 | return PTR_ERR(newp); |
| 298 | spin_lock(&ip6addrlbl_table.lock); | 280 | spin_lock(&net->ipv6.ip6addrlbl_table.lock); |
| 299 | ret = __ip6addrlbl_add(newp, replace); | 281 | ret = __ip6addrlbl_add(net, newp, replace); |
| 300 | spin_unlock(&ip6addrlbl_table.lock); | 282 | spin_unlock(&net->ipv6.ip6addrlbl_table.lock); |
| 301 | if (ret) | 283 | if (ret) |
| 302 | ip6addrlbl_free(newp); | 284 | ip6addrlbl_free(newp); |
| 303 | return ret; | 285 | return ret; |
| @@ -315,9 +297,8 @@ static int __ip6addrlbl_del(struct net *net, | |||
| 315 | ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", | 297 | ADDRLABEL(KERN_DEBUG "%s(prefix=%pI6, prefixlen=%d, ifindex=%d)\n", |
| 316 | __func__, prefix, prefixlen, ifindex); | 298 | __func__, prefix, prefixlen, ifindex); |
| 317 | 299 | ||
| 318 | hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { | 300 | hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { |
| 319 | if (p->prefixlen == prefixlen && | 301 | if (p->prefixlen == prefixlen && |
| 320 | net_eq(ip6addrlbl_net(p), net) && | ||
| 321 | p->ifindex == ifindex && | 302 | p->ifindex == ifindex && |
| 322 | ipv6_addr_equal(&p->prefix, prefix)) { | 303 | ipv6_addr_equal(&p->prefix, prefix)) { |
| 323 | hlist_del_rcu(&p->list); | 304 | hlist_del_rcu(&p->list); |
| @@ -340,9 +321,9 @@ static int ip6addrlbl_del(struct net *net, | |||
| 340 | __func__, prefix, prefixlen, ifindex); | 321 | __func__, prefix, prefixlen, ifindex); |
| 341 | 322 | ||
| 342 | ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); | 323 | ipv6_addr_prefix(&prefix_buf, prefix, prefixlen); |
| 343 | spin_lock(&ip6addrlbl_table.lock); | 324 | spin_lock(&net->ipv6.ip6addrlbl_table.lock); |
| 344 | ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); | 325 | ret = __ip6addrlbl_del(net, &prefix_buf, prefixlen, ifindex); |
| 345 | spin_unlock(&ip6addrlbl_table.lock); | 326 | spin_unlock(&net->ipv6.ip6addrlbl_table.lock); |
| 346 | return ret; | 327 | return ret; |
| 347 | } | 328 | } |
| 348 | 329 | ||
| @@ -354,6 +335,9 @@ static int __net_init ip6addrlbl_net_init(struct net *net) | |||
| 354 | 335 | ||
| 355 | ADDRLABEL(KERN_DEBUG "%s\n", __func__); | 336 | ADDRLABEL(KERN_DEBUG "%s\n", __func__); |
| 356 | 337 | ||
| 338 | spin_lock_init(&net->ipv6.ip6addrlbl_table.lock); | ||
| 339 | INIT_HLIST_HEAD(&net->ipv6.ip6addrlbl_table.head); | ||
| 340 | |||
| 357 | for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { | 341 | for (i = 0; i < ARRAY_SIZE(ip6addrlbl_init_table); i++) { |
| 358 | int ret = ip6addrlbl_add(net, | 342 | int ret = ip6addrlbl_add(net, |
| 359 | ip6addrlbl_init_table[i].prefix, | 343 | ip6addrlbl_init_table[i].prefix, |
| @@ -373,14 +357,12 @@ static void __net_exit ip6addrlbl_net_exit(struct net *net) | |||
| 373 | struct hlist_node *n; | 357 | struct hlist_node *n; |
| 374 | 358 | ||
| 375 | /* Remove all labels belonging to the exiting net */ | 359 | /* Remove all labels belonging to the exiting net */ |
| 376 | spin_lock(&ip6addrlbl_table.lock); | 360 | spin_lock(&net->ipv6.ip6addrlbl_table.lock); |
| 377 | hlist_for_each_entry_safe(p, n, &ip6addrlbl_table.head, list) { | 361 | hlist_for_each_entry_safe(p, n, &net->ipv6.ip6addrlbl_table.head, list) { |
| 378 | if (net_eq(ip6addrlbl_net(p), net)) { | 362 | hlist_del_rcu(&p->list); |
| 379 | hlist_del_rcu(&p->list); | 363 | ip6addrlbl_put(p); |
| 380 | ip6addrlbl_put(p); | ||
| 381 | } | ||
| 382 | } | 364 | } |
| 383 | spin_unlock(&ip6addrlbl_table.lock); | 365 | spin_unlock(&net->ipv6.ip6addrlbl_table.lock); |
| 384 | } | 366 | } |
| 385 | 367 | ||
| 386 | static struct pernet_operations ipv6_addr_label_ops = { | 368 | static struct pernet_operations ipv6_addr_label_ops = { |
| @@ -390,8 +372,6 @@ static struct pernet_operations ipv6_addr_label_ops = { | |||
| 390 | 372 | ||
| 391 | int __init ipv6_addr_label_init(void) | 373 | int __init ipv6_addr_label_init(void) |
| 392 | { | 374 | { |
| 393 | spin_lock_init(&ip6addrlbl_table.lock); | ||
| 394 | |||
| 395 | return register_pernet_subsys(&ipv6_addr_label_ops); | 375 | return register_pernet_subsys(&ipv6_addr_label_ops); |
| 396 | } | 376 | } |
| 397 | 377 | ||
| @@ -510,11 +490,10 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) | |||
| 510 | int err; | 490 | int err; |
| 511 | 491 | ||
| 512 | rcu_read_lock(); | 492 | rcu_read_lock(); |
| 513 | hlist_for_each_entry_rcu(p, &ip6addrlbl_table.head, list) { | 493 | hlist_for_each_entry_rcu(p, &net->ipv6.ip6addrlbl_table.head, list) { |
| 514 | if (idx >= s_idx && | 494 | if (idx >= s_idx) { |
| 515 | net_eq(ip6addrlbl_net(p), net)) { | ||
| 516 | err = ip6addrlbl_fill(skb, p, | 495 | err = ip6addrlbl_fill(skb, p, |
| 517 | ip6addrlbl_table.seq, | 496 | net->ipv6.ip6addrlbl_table.seq, |
| 518 | NETLINK_CB(cb->skb).portid, | 497 | NETLINK_CB(cb->skb).portid, |
| 519 | cb->nlh->nlmsg_seq, | 498 | cb->nlh->nlmsg_seq, |
| 520 | RTM_NEWADDRLABEL, | 499 | RTM_NEWADDRLABEL, |
| @@ -571,7 +550,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh, | |||
| 571 | p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); | 550 | p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index); |
| 572 | if (p && !ip6addrlbl_hold(p)) | 551 | if (p && !ip6addrlbl_hold(p)) |
| 573 | p = NULL; | 552 | p = NULL; |
| 574 | lseq = ip6addrlbl_table.seq; | 553 | lseq = net->ipv6.ip6addrlbl_table.seq; |
| 575 | rcu_read_unlock(); | 554 | rcu_read_unlock(); |
| 576 | 555 | ||
| 577 | if (!p) { | 556 | if (!p) { |
