author    David S. Miller <davem@davemloft.net>  2015-01-03 14:33:03 -0500
committer David S. Miller <davem@davemloft.net>  2015-01-03 14:33:03 -0500
commit    7beceebf5b9d14e333ab6025a6feccdc8e765225 (patch)
tree      8c1d2761c3959356151eed7bb677df633d64c0dd /net
parent    dd9553988879a3ff71a86323b88409e7631c4e5d (diff)
parent    21e4902aea80ef35afc00ee8d2abdea4f519b7f7 (diff)
Merge branch 'rhashtable-next'
Thomas Graf says:

====================
rhashtable: Per bucket locks & deferred table resizing

Prepares for and introduces per bucket spinlocks and deferred table
resizing. This allows for parallel table mutations in different hash
buckets from atomic context. The resizing occurs in the background
in a separate worker thread while lookups, inserts, and removals can
continue.

Also modified the chain linked list to be terminated with a special
nulls marker to allow entries to move between multiple lists.

Last but not least, reintroduces lockless netlink_lookup() with
deferred Netlink socket destruction to avoid the side effect of
increased netlink_release() runtime.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
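The contract behind the lockless lookup this series reintroduces is that a reader must take its reference while still inside the RCU read-side critical section; the writer side then defers the final free past a grace period. A minimal sketch of that contract, with illustrative names (my_obj, my_obj_get) rather than the netlink-specific ones in the diff below:

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/kref.h>

struct my_obj {
        struct rhash_head node; /* rhashtable linkage */
        struct kref ref;        /* lifetime is refcounted */
        u32 key;
};

/* Take the reference *before* rcu_read_unlock(); otherwise a
 * concurrent remover could free the object in between. */
static struct my_obj *my_obj_get(struct rhashtable *ht, const u32 *key)
{
        struct my_obj *obj;

        rcu_read_lock();
        obj = rhashtable_lookup(ht, key);
        if (obj)
                kref_get(&obj->ref);
        rcu_read_unlock();

        return obj;
}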
Diffstat (limited to 'net')
-rw-r--r--  net/netfilter/nft_hash.c  92
-rw-r--r--  net/netlink/af_netlink.c  64
-rw-r--r--  net/netlink/af_netlink.h   1
-rw-r--r--  net/netlink/diag.c         4
4 files changed, 79 insertions, 82 deletions
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 1e316ce4cb5d..75887d7d2c6a 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -33,7 +33,7 @@ static bool nft_hash_lookup(const struct nft_set *set,
                            const struct nft_data *key,
                            struct nft_data *data)
 {
-       const struct rhashtable *priv = nft_set_priv(set);
+       struct rhashtable *priv = nft_set_priv(set);
        const struct nft_hash_elem *he;
 
        he = rhashtable_lookup(priv, key);
@@ -83,46 +83,53 @@ static void nft_hash_remove(const struct nft_set *set,
                            const struct nft_set_elem *elem)
 {
        struct rhashtable *priv = nft_set_priv(set);
-       struct rhash_head *he, __rcu **pprev;
 
-       pprev = elem->cookie;
-       he = rht_dereference((*pprev), priv);
+       rhashtable_remove(priv, elem->cookie);
+       synchronize_rcu();
+       kfree(elem->cookie);
+}
 
-       rhashtable_remove_pprev(priv, he, pprev);
+struct nft_compare_arg {
+       const struct nft_set *set;
+       struct nft_set_elem *elem;
+};
 
-       synchronize_rcu();
-       kfree(he);
+static bool nft_hash_compare(void *ptr, void *arg)
+{
+       struct nft_hash_elem *he = ptr;
+       struct nft_compare_arg *x = arg;
+
+       if (!nft_data_cmp(&he->key, &x->elem->key, x->set->klen)) {
+               x->elem->cookie = he;
+               x->elem->flags = 0;
+               if (x->set->flags & NFT_SET_MAP)
+                       nft_data_copy(&x->elem->data, he->data);
+
+               return true;
+       }
+
+       return false;
 }
 
 static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
 {
-       const struct rhashtable *priv = nft_set_priv(set);
-       const struct bucket_table *tbl = rht_dereference_rcu(priv->tbl, priv);
-       struct rhash_head __rcu * const *pprev;
-       struct nft_hash_elem *he;
-       u32 h;
-
-       h = rhashtable_hashfn(priv, &elem->key, set->klen);
-       pprev = &tbl->buckets[h];
-       rht_for_each_entry_rcu(he, tbl->buckets[h], node) {
-               if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
-                       pprev = &he->node.next;
-                       continue;
-               }
+       struct rhashtable *priv = nft_set_priv(set);
+       struct nft_compare_arg arg = {
+               .set = set,
+               .elem = elem,
+       };
 
-               elem->cookie = (void *)pprev;
-               elem->flags = 0;
-               if (set->flags & NFT_SET_MAP)
-                       nft_data_copy(&elem->data, he->data);
+       if (rhashtable_lookup_compare(priv, &elem->key,
+                                     &nft_hash_compare, &arg))
                return 0;
-       }
+
        return -ENOENT;
 }
 
 static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
                          struct nft_set_iter *iter)
 {
-       const struct rhashtable *priv = nft_set_priv(set);
+       struct rhashtable *priv = nft_set_priv(set);
        const struct bucket_table *tbl;
        const struct nft_hash_elem *he;
        struct nft_set_elem elem;
@@ -130,7 +137,9 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
 
        tbl = rht_dereference_rcu(priv->tbl, priv);
        for (i = 0; i < tbl->size; i++) {
-               rht_for_each_entry_rcu(he, tbl->buckets[i], node) {
+               struct rhash_head *pos;
+
+               rht_for_each_entry_rcu(he, pos, tbl, i, node) {
                        if (iter->count < iter->skip)
                                goto cont;
 
@@ -153,13 +162,6 @@ static unsigned int nft_hash_privsize(const struct nlattr * const nla[])
        return sizeof(struct rhashtable);
 }
 
-#ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nfnl_lock_is_held(void *parent)
-{
-       return lockdep_nfnl_is_held(NFNL_SUBSYS_NFTABLES);
-}
-#endif
-
 static int nft_hash_init(const struct nft_set *set,
                         const struct nft_set_desc *desc,
                         const struct nlattr * const tb[])
@@ -173,9 +175,6 @@ static int nft_hash_init(const struct nft_set *set,
                .hashfn = jhash,
                .grow_decision = rht_grow_above_75,
                .shrink_decision = rht_shrink_below_30,
-#ifdef CONFIG_PROVE_LOCKING
-               .mutex_is_held = lockdep_nfnl_lock_is_held,
-#endif
        };
 
        return rhashtable_init(priv, &params);
@@ -183,18 +182,23 @@ static int nft_hash_init(const struct nft_set *set,
 
 static void nft_hash_destroy(const struct nft_set *set)
 {
-       const struct rhashtable *priv = nft_set_priv(set);
-       const struct bucket_table *tbl = priv->tbl;
-       struct nft_hash_elem *he, *next;
+       struct rhashtable *priv = nft_set_priv(set);
+       const struct bucket_table *tbl;
+       struct nft_hash_elem *he;
+       struct rhash_head *pos, *next;
        unsigned int i;
 
+       /* Stop an eventual async resizing */
+       priv->being_destroyed = true;
+       mutex_lock(&priv->mutex);
+
+       tbl = rht_dereference(priv->tbl, priv);
        for (i = 0; i < tbl->size; i++) {
-               for (he = rht_entry(tbl->buckets[i], struct nft_hash_elem, node);
-                    he != NULL; he = next) {
-                       next = rht_entry(he->node.next, struct nft_hash_elem, node);
+               rht_for_each_entry_safe(he, pos, next, tbl, i, node)
                        nft_hash_elem_destroy(set, he);
-               }
        }
+       mutex_unlock(&priv->mutex);
+
        rhashtable_destroy(priv);
 }
 
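nft_hash_get() above now delegates bucket walking to rhashtable_lookup_compare(), which hashes the supplied key and runs a caller-provided predicate over each candidate in the bucket chain, so callers no longer compute hashes themselves. A hedged sketch of that API as it stands after this series; my_entry, my_key_match, and my_lookup are illustrative names, and the caller is assumed to hold rcu_read_lock():

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>

struct my_entry {
        struct rhash_head node;
        u32 key;
};

/* Predicate convention follows nft_hash_compare()/netlink_compare():
 * return true to stop the bucket walk and report a match. */
static bool my_key_match(void *ptr, void *arg)
{
        const struct my_entry *e = ptr;
        const u32 *key = arg;

        return e->key == *key;
}

/* The key pointer is hashed by the table's configured hash function;
 * the callback then filters candidates that landed in the bucket. */
static struct my_entry *my_lookup(struct rhashtable *ht, u32 key)
{
        return rhashtable_lookup_compare(ht, &key, my_key_match, &key);
}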
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index 84ea76ca3f1f..298e1df7132a 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -97,12 +97,12 @@ static int netlink_dump(struct sock *sk);
 static void netlink_skb_destructor(struct sk_buff *skb);
 
 /* nl_table locking explained:
- * Lookup and traversal are protected with nl_sk_hash_lock or nl_table_lock
- * combined with an RCU read-side lock. Insertion and removal are protected
- * with nl_sk_hash_lock while using RCU list modification primitives and may
- * run in parallel to nl_table_lock protected lookups. Destruction of the
- * Netlink socket may only occur *after* nl_table_lock has been acquired
- * either during or after the socket has been removed from the list.
+ * Lookup and traversal are protected with an RCU read-side lock. Insertion
+ * and removal are protected with nl_sk_hash_lock while using RCU list
+ * modification primitives and may run in parallel to RCU protected lookups.
+ * Destruction of the Netlink socket may only occur *after* nl_table_lock has
+ * been acquired * either during or after the socket has been removed from
+ * the list and after an RCU grace period.
  */
 DEFINE_RWLOCK(nl_table_lock);
 EXPORT_SYMBOL_GPL(nl_table_lock);
@@ -114,15 +114,6 @@ static atomic_t nl_table_users = ATOMIC_INIT(0);
 DEFINE_MUTEX(nl_sk_hash_lock);
 EXPORT_SYMBOL_GPL(nl_sk_hash_lock);
 
-#ifdef CONFIG_PROVE_LOCKING
-static int lockdep_nl_sk_hash_is_held(void *parent)
-{
-       if (debug_locks)
-               return lockdep_is_held(&nl_sk_hash_lock) || lockdep_is_held(&nl_table_lock);
-       return 1;
-}
-#endif
-
 static ATOMIC_NOTIFIER_HEAD(netlink_chain);
 
 static DEFINE_SPINLOCK(netlink_tap_lock);
@@ -1002,11 +993,8 @@ static struct sock *__netlink_lookup(struct netlink_table *table, u32 portid,
                .net = net,
                .portid = portid,
        };
-       u32 hash;
-
-       hash = rhashtable_hashfn(&table->hash, &portid, sizeof(portid));
 
-       return rhashtable_lookup_compare(&table->hash, hash,
+       return rhashtable_lookup_compare(&table->hash, &portid,
                                         &netlink_compare, &arg);
 }
 
@@ -1015,13 +1003,11 @@ static struct sock *netlink_lookup(struct net *net, int protocol, u32 portid)
        struct netlink_table *table = &nl_table[protocol];
        struct sock *sk;
 
-       read_lock(&nl_table_lock);
        rcu_read_lock();
        sk = __netlink_lookup(table, portid, net);
        if (sk)
                sock_hold(sk);
        rcu_read_unlock();
-       read_unlock(&nl_table_lock);
 
        return sk;
 }
@@ -1066,7 +1052,8 @@ static int netlink_insert(struct sock *sk, struct net *net, u32 portid)
                goto err;
 
        err = -ENOMEM;
-       if (BITS_PER_LONG > 32 && unlikely(table->hash.nelems >= UINT_MAX))
+       if (BITS_PER_LONG > 32 &&
+           unlikely(atomic_read(&table->hash.nelems) >= UINT_MAX))
                goto err;
 
        nlk_sk(sk)->portid = portid;
@@ -1194,6 +1181,13 @@ out_module:
        goto out;
 }
 
+static void deferred_put_nlk_sk(struct rcu_head *head)
+{
+       struct netlink_sock *nlk = container_of(head, struct netlink_sock, rcu);
+
+       sock_put(&nlk->sk);
+}
+
 static int netlink_release(struct socket *sock)
 {
        struct sock *sk = sock->sk;
@@ -1259,7 +1253,7 @@ static int netlink_release(struct socket *sock)
        local_bh_disable();
        sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
        local_bh_enable();
-       sock_put(sk);
+       call_rcu(&nlk->rcu, deferred_put_nlk_sk);
        return 0;
 }
 
@@ -1274,7 +1268,6 @@ static int netlink_autobind(struct socket *sock)
 
 retry:
        cond_resched();
-       netlink_table_grab();
        rcu_read_lock();
        if (__netlink_lookup(table, portid, net)) {
                /* Bind collision, search negative portid values. */
@@ -1282,11 +1275,9 @@ retry:
                if (rover > -4097)
                        rover = -4097;
                rcu_read_unlock();
-               netlink_table_ungrab();
                goto retry;
        }
        rcu_read_unlock();
-       netlink_table_ungrab();
 
        err = netlink_insert(sk, net, portid);
        if (err == -EADDRINUSE)
@@ -2901,7 +2892,9 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
                const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 
                for (j = 0; j < tbl->size; j++) {
-                       rht_for_each_entry_rcu(nlk, tbl->buckets[j], node) {
+                       struct rhash_head *node;
+
+                       rht_for_each_entry_rcu(nlk, node, tbl, j, node) {
                                s = (struct sock *)nlk;
 
                                if (sock_net(s) != seq_file_net(seq))
@@ -2919,9 +2912,8 @@ static struct sock *netlink_seq_socket_idx(struct seq_file *seq, loff_t pos)
 }
 
 static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
-       __acquires(nl_table_lock) __acquires(RCU)
+       __acquires(RCU)
 {
-       read_lock(&nl_table_lock);
        rcu_read_lock();
        return *pos ? netlink_seq_socket_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 }
@@ -2929,6 +2921,8 @@ static void *netlink_seq_start(struct seq_file *seq, loff_t *pos)
 static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
        struct rhashtable *ht;
+       const struct bucket_table *tbl;
+       struct rhash_head *node;
        struct netlink_sock *nlk;
        struct nl_seq_iter *iter;
        struct net *net;
@@ -2945,17 +2939,17 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 
        i = iter->link;
        ht = &nl_table[i].hash;
-       rht_for_each_entry(nlk, nlk->node.next, ht, node)
+       tbl = rht_dereference_rcu(ht->tbl, ht);
+       rht_for_each_entry_rcu_continue(nlk, node, nlk->node.next, tbl, iter->hash_idx, node)
                if (net_eq(sock_net((struct sock *)nlk), net))
                        return nlk;
 
        j = iter->hash_idx + 1;
 
        do {
-               const struct bucket_table *tbl = rht_dereference_rcu(ht->tbl, ht);
 
                for (; j < tbl->size; j++) {
-                       rht_for_each_entry(nlk, tbl->buckets[j], ht, node) {
+                       rht_for_each_entry_rcu(nlk, node, tbl, j, node) {
                                if (net_eq(sock_net((struct sock *)nlk), net)) {
                                        iter->link = i;
                                        iter->hash_idx = j;
@@ -2971,10 +2965,9 @@ static void *netlink_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void netlink_seq_stop(struct seq_file *seq, void *v)
-       __releases(RCU) __releases(nl_table_lock)
+       __releases(RCU)
 {
        rcu_read_unlock();
-       read_unlock(&nl_table_lock);
 }
 
 
@@ -3121,9 +3114,6 @@ static int __init netlink_proto_init(void)
                .max_shift = 16, /* 64K */
                .grow_decision = rht_grow_above_75,
                .shrink_decision = rht_shrink_below_30,
-#ifdef CONFIG_PROVE_LOCKING
-               .mutex_is_held = lockdep_nl_sk_hash_is_held,
-#endif
        };
 
        if (err != 0)
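deferred_put_nlk_sk() above is an instance of the standard RCU deferred-destruction idiom: embed a struct rcu_head in the object and let call_rcu() run the final release only after every concurrent lockless reader has left its read-side section. A generic sketch of the same pattern; my_obj and its helpers are illustrative names, not kernel APIs:

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct my_obj {
        struct rhash_head node; /* hash table linkage */
        struct rcu_head rcu;    /* reclamation is deferred through this */
};

static void my_obj_free_rcu(struct rcu_head *head)
{
        /* Runs only after a grace period: no lockless reader can still
         * hold a pointer it obtained from the table. */
        kfree(container_of(head, struct my_obj, rcu));
}

static void my_obj_release(struct rhashtable *ht, struct my_obj *obj)
{
        rhashtable_remove(ht, &obj->node);      /* unlink now... */
        call_rcu(&obj->rcu, my_obj_free_rcu);   /* ...free later */
}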
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index f123a88496f8..fd96fa76202a 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -50,6 +50,7 @@ struct netlink_sock {
 #endif /* CONFIG_NETLINK_MMAP */
 
        struct rhash_head node;
+       struct rcu_head rcu;
 };
 
 static inline struct netlink_sock *nlk_sk(struct sock *sk)
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index de8c74a3c061..fcca36d81a62 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -113,7 +113,9 @@ static int __netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
        req = nlmsg_data(cb->nlh);
 
        for (i = 0; i < htbl->size; i++) {
-               rht_for_each_entry(nlsk, htbl->buckets[i], ht, node) {
+               struct rhash_head *pos;
+
+               rht_for_each_entry(nlsk, pos, htbl, i, node) {
                        sk = (struct sock *)nlsk;
 
                        if (!net_eq(sock_net(sk), net))
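This hunk shows the iterator convention used throughout the series: rht_for_each_entry() and rht_for_each_entry_rcu() now take an explicit struct rhash_head cursor plus the bucket table and bucket index instead of a raw bucket head pointer. A hedged sketch of a full-table walk under RCU, mirroring nft_hash_walk() and netlink_seq_socket_idx() above; my_entry and my_walk are illustrative names:

#include <linux/rhashtable.h>
#include <linux/rcupdate.h>
#include <linux/printk.h>

struct my_entry {
        struct rhash_head node;
        u32 key;
};

static void my_walk(struct rhashtable *ht)
{
        const struct bucket_table *tbl;
        struct my_entry *e;
        unsigned int i;

        rcu_read_lock();
        tbl = rht_dereference_rcu(ht->tbl, ht); /* pin the current table */
        for (i = 0; i < tbl->size; i++) {
                struct rhash_head *pos;

                rht_for_each_entry_rcu(e, pos, tbl, i, node)
                        pr_info("entry with key %u\n", e->key);
        }
        rcu_read_unlock();
}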