aboutsummaryrefslogtreecommitdiffstats
path: root/net
diff options
context:
space:
mode:
authorMiklos Szeredi <mszeredi@suse.cz>2007-07-11 17:22:39 -0400
committerDavid S. Miller <davem@davemloft.net>2007-07-11 17:22:39 -0400
commit1fd05ba5a2f2aa8e7b9b52ef55df850e2e7d54c9 (patch)
tree3403194403ab25f1f7a360a002adf63be2b4e9b0 /net
parent99d24edeb6abc6ca3a0d0fbdb83c664c04403c8c (diff)
[AF_UNIX]: Rewrite garbage collector, fixes race.
Throw out the old mark & sweep garbage collector and put in a refcounting cycle detecting one. The old one had a race with recvmsg, that resulted in false positives and hence data loss. The old algorithm operated on all unix sockets in the system, so any additional locking would have meant performance problems for all users of these. The new algorithm instead only operates on "in flight" sockets, which are very rare, and the additional locking for these doesn't negatively impact the vast majority of users. In fact it's probable, that there weren't *any* heavy senders of sockets over sockets, otherwise the above race would have been discovered long ago. The patch works OK with the app that exposed the race with the old code. The garbage collection has also been verified to work in a few simple cases. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Signed-off-by: David S. Miller <davem@davemloft.net>
Diffstat (limited to 'net')
-rw-r--r--net/unix/af_unix.c6
-rw-r--r--net/unix/garbage.c325
2 files changed, 188 insertions, 143 deletions
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 3654b647ac79..65ebccc0a698 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -592,7 +592,8 @@ static struct sock * unix_create1(struct socket *sock)
592 u->dentry = NULL; 592 u->dentry = NULL;
593 u->mnt = NULL; 593 u->mnt = NULL;
594 spin_lock_init(&u->lock); 594 spin_lock_init(&u->lock);
595 atomic_set(&u->inflight, sock ? 0 : -1); 595 atomic_set(&u->inflight, 0);
596 INIT_LIST_HEAD(&u->link);
596 mutex_init(&u->readlock); /* single task reading lock */ 597 mutex_init(&u->readlock); /* single task reading lock */
597 init_waitqueue_head(&u->peer_wait); 598 init_waitqueue_head(&u->peer_wait);
598 unix_insert_socket(unix_sockets_unbound, sk); 599 unix_insert_socket(unix_sockets_unbound, sk);
@@ -1134,9 +1135,6 @@ restart:
1134 /* take ten and and send info to listening sock */ 1135 /* take ten and and send info to listening sock */
1135 spin_lock(&other->sk_receive_queue.lock); 1136 spin_lock(&other->sk_receive_queue.lock);
1136 __skb_queue_tail(&other->sk_receive_queue, skb); 1137 __skb_queue_tail(&other->sk_receive_queue, skb);
1137 /* Undo artificially decreased inflight after embrion
1138 * is installed to listening socket. */
1139 atomic_inc(&newu->inflight);
1140 spin_unlock(&other->sk_receive_queue.lock); 1138 spin_unlock(&other->sk_receive_queue.lock);
1141 unix_state_unlock(other); 1139 unix_state_unlock(other);
1142 other->sk_data_ready(other, 0); 1140 other->sk_data_ready(other, 0);
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index f20b7ea7c555..406b6433e467 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -62,6 +62,10 @@
62 * AV 1 Mar 1999 62 * AV 1 Mar 1999
63 * Damn. Added missing check for ->dead in listen queues scanning. 63 * Damn. Added missing check for ->dead in listen queues scanning.
64 * 64 *
65 * Miklos Szeredi 25 Jun 2007
66 * Reimplement with a cycle collecting algorithm. This should
67 * solve several problems with the previous code, like being racy
68 * wrt receive and holding up unrelated socket operations.
65 */ 69 */
66 70
67#include <linux/kernel.h> 71#include <linux/kernel.h>
@@ -84,10 +88,9 @@
84 88
85/* Internal data structures and random procedures: */ 89/* Internal data structures and random procedures: */
86 90
87#define GC_HEAD ((struct sock *)(-1)) 91static LIST_HEAD(gc_inflight_list);
88#define GC_ORPHAN ((struct sock *)(-3)) 92static LIST_HEAD(gc_candidates);
89 93static DEFINE_SPINLOCK(unix_gc_lock);
90static struct sock *gc_current = GC_HEAD; /* stack of objects to mark */
91 94
92atomic_t unix_tot_inflight = ATOMIC_INIT(0); 95atomic_t unix_tot_inflight = ATOMIC_INIT(0);
93 96
@@ -122,8 +125,16 @@ void unix_inflight(struct file *fp)
122{ 125{
123 struct sock *s = unix_get_socket(fp); 126 struct sock *s = unix_get_socket(fp);
124 if(s) { 127 if(s) {
125 atomic_inc(&unix_sk(s)->inflight); 128 struct unix_sock *u = unix_sk(s);
129 spin_lock(&unix_gc_lock);
130 if (atomic_inc_return(&u->inflight) == 1) {
131 BUG_ON(!list_empty(&u->link));
132 list_add_tail(&u->link, &gc_inflight_list);
133 } else {
134 BUG_ON(list_empty(&u->link));
135 }
126 atomic_inc(&unix_tot_inflight); 136 atomic_inc(&unix_tot_inflight);
137 spin_unlock(&unix_gc_lock);
127 } 138 }
128} 139}
129 140
@@ -131,182 +142,218 @@ void unix_notinflight(struct file *fp)
131{ 142{
132 struct sock *s = unix_get_socket(fp); 143 struct sock *s = unix_get_socket(fp);
133 if(s) { 144 if(s) {
134 atomic_dec(&unix_sk(s)->inflight); 145 struct unix_sock *u = unix_sk(s);
146 spin_lock(&unix_gc_lock);
147 BUG_ON(list_empty(&u->link));
148 if (atomic_dec_and_test(&u->inflight))
149 list_del_init(&u->link);
135 atomic_dec(&unix_tot_inflight); 150 atomic_dec(&unix_tot_inflight);
151 spin_unlock(&unix_gc_lock);
136 } 152 }
137} 153}
138 154
155static inline struct sk_buff *sock_queue_head(struct sock *sk)
156{
157 return (struct sk_buff *) &sk->sk_receive_queue;
158}
139 159
140/* 160#define receive_queue_for_each_skb(sk, next, skb) \
141 * Garbage Collector Support Functions 161 for (skb = sock_queue_head(sk)->next, next = skb->next; \
142 */ 162 skb != sock_queue_head(sk); skb = next, next = skb->next)
143 163
144static inline struct sock *pop_stack(void) 164static void scan_inflight(struct sock *x, void (*func)(struct sock *),
165 struct sk_buff_head *hitlist)
145{ 166{
146 struct sock *p = gc_current; 167 struct sk_buff *skb;
147 gc_current = unix_sk(p)->gc_tree; 168 struct sk_buff *next;
148 return p; 169
170 spin_lock(&x->sk_receive_queue.lock);
171 receive_queue_for_each_skb(x, next, skb) {
172 /*
173 * Do we have file descriptors ?
174 */
175 if (UNIXCB(skb).fp) {
176 bool hit = false;
177 /*
178 * Process the descriptors of this socket
179 */
180 int nfd = UNIXCB(skb).fp->count;
181 struct file **fp = UNIXCB(skb).fp->fp;
182 while (nfd--) {
183 /*
184 * Get the socket the fd matches
185 * if it indeed does so
186 */
187 struct sock *sk = unix_get_socket(*fp++);
188 if(sk) {
189 hit = true;
190 func(sk);
191 }
192 }
193 if (hit && hitlist != NULL) {
194 __skb_unlink(skb, &x->sk_receive_queue);
195 __skb_queue_tail(hitlist, skb);
196 }
197 }
198 }
199 spin_unlock(&x->sk_receive_queue.lock);
149} 200}
150 201
151static inline int empty_stack(void) 202static void scan_children(struct sock *x, void (*func)(struct sock *),
203 struct sk_buff_head *hitlist)
152{ 204{
153 return gc_current == GC_HEAD; 205 if (x->sk_state != TCP_LISTEN)
206 scan_inflight(x, func, hitlist);
207 else {
208 struct sk_buff *skb;
209 struct sk_buff *next;
210 struct unix_sock *u;
211 LIST_HEAD(embryos);
212
213 /*
214 * For a listening socket collect the queued embryos
215 * and perform a scan on them as well.
216 */
217 spin_lock(&x->sk_receive_queue.lock);
218 receive_queue_for_each_skb(x, next, skb) {
219 u = unix_sk(skb->sk);
220
221 /*
222 * An embryo cannot be in-flight, so it's safe
223 * to use the list link.
224 */
225 BUG_ON(!list_empty(&u->link));
226 list_add_tail(&u->link, &embryos);
227 }
228 spin_unlock(&x->sk_receive_queue.lock);
229
230 while (!list_empty(&embryos)) {
231 u = list_entry(embryos.next, struct unix_sock, link);
232 scan_inflight(&u->sk, func, hitlist);
233 list_del_init(&u->link);
234 }
235 }
154} 236}
155 237
156static void maybe_unmark_and_push(struct sock *x) 238static void dec_inflight(struct sock *sk)
157{ 239{
158 struct unix_sock *u = unix_sk(x); 240 atomic_dec(&unix_sk(sk)->inflight);
241}
159 242
160 if (u->gc_tree != GC_ORPHAN) 243static void inc_inflight(struct sock *sk)
161 return; 244{
162 sock_hold(x); 245 atomic_inc(&unix_sk(sk)->inflight);
163 u->gc_tree = gc_current;
164 gc_current = x;
165} 246}
166 247
248static void inc_inflight_move_tail(struct sock *sk)
249{
250 struct unix_sock *u = unix_sk(sk);
251
252 atomic_inc(&u->inflight);
253 /*
254 * If this is still a candidate, move it to the end of the
255 * list, so that it's checked even if it was already passed
256 * over
257 */
258 if (u->gc_candidate)
259 list_move_tail(&u->link, &gc_candidates);
260}
167 261
168/* The external entry point: unix_gc() */ 262/* The external entry point: unix_gc() */
169 263
170void unix_gc(void) 264void unix_gc(void)
171{ 265{
172 static DEFINE_MUTEX(unix_gc_sem); 266 static bool gc_in_progress = false;
173 int i;
174 struct sock *s;
175 struct sk_buff_head hitlist;
176 struct sk_buff *skb;
177 267
178 /* 268 struct unix_sock *u;
179 * Avoid a recursive GC. 269 struct unix_sock *next;
180 */ 270 struct sk_buff_head hitlist;
271 struct list_head cursor;
181 272
182 if (!mutex_trylock(&unix_gc_sem)) 273 spin_lock(&unix_gc_lock);
183 return;
184 274
185 spin_lock(&unix_table_lock); 275 /* Avoid a recursive GC. */
276 if (gc_in_progress)
277 goto out;
186 278
187 forall_unix_sockets(i, s) 279 gc_in_progress = true;
188 {
189 unix_sk(s)->gc_tree = GC_ORPHAN;
190 }
191 /* 280 /*
192 * Everything is now marked 281 * First, select candidates for garbage collection. Only
193 */ 282 * in-flight sockets are considered, and from those only ones
194 283 * which don't have any external reference.
195 /* Invariant to be maintained: 284 *
196 - everything unmarked is either: 285 * Holding unix_gc_lock will protect these candidates from
197 -- (a) on the stack, or 286 * being detached, and hence from gaining an external
198 -- (b) has all of its children unmarked 287 * reference. This also means, that since there are no
199 - everything on the stack is always unmarked 288 * possible receivers, the receive queues of these sockets are
200 - nothing is ever pushed onto the stack twice, because: 289 * static during the GC, even though the dequeue is done
201 -- nothing previously unmarked is ever pushed on the stack 290 * before the detach without atomicity guarantees.
202 */ 291 */
292 list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
293 int total_refs;
294 int inflight_refs;
295
296 total_refs = file_count(u->sk.sk_socket->file);
297 inflight_refs = atomic_read(&u->inflight);
298
299 BUG_ON(inflight_refs < 1);
300 BUG_ON(total_refs < inflight_refs);
301 if (total_refs == inflight_refs) {
302 list_move_tail(&u->link, &gc_candidates);
303 u->gc_candidate = 1;
304 }
305 }
203 306
204 /* 307 /*
205 * Push root set 308 * Now remove all internal in-flight reference to children of
309 * the candidates.
206 */ 310 */
207 311 list_for_each_entry(u, &gc_candidates, link)
208 forall_unix_sockets(i, s) 312 scan_children(&u->sk, dec_inflight, NULL);
209 {
210 int open_count = 0;
211
212 /*
213 * If all instances of the descriptor are not
214 * in flight we are in use.
215 *
216 * Special case: when socket s is embrion, it may be
217 * hashed but still not in queue of listening socket.
218 * In this case (see unix_create1()) we set artificial
219 * negative inflight counter to close race window.
220 * It is trick of course and dirty one.
221 */
222 if (s->sk_socket && s->sk_socket->file)
223 open_count = file_count(s->sk_socket->file);
224 if (open_count > atomic_read(&unix_sk(s)->inflight))
225 maybe_unmark_and_push(s);
226 }
227 313
228 /* 314 /*
229 * Mark phase 315 * Restore the references for children of all candidates,
316 * which have remaining references. Do this recursively, so
317 * only those remain, which form cyclic references.
318 *
319 * Use a "cursor" link, to make the list traversal safe, even
320 * though elements might be moved about.
230 */ 321 */
322 list_add(&cursor, &gc_candidates);
323 while (cursor.next != &gc_candidates) {
324 u = list_entry(cursor.next, struct unix_sock, link);
231 325
232 while (!empty_stack()) 326 /* Move cursor to after the current position. */
233 { 327 list_move(&cursor, &u->link);
234 struct sock *x = pop_stack();
235 struct sock *sk;
236
237 spin_lock(&x->sk_receive_queue.lock);
238 skb = skb_peek(&x->sk_receive_queue);
239
240 /*
241 * Loop through all but first born
242 */
243 328
244 while (skb && skb != (struct sk_buff *)&x->sk_receive_queue) { 329 if (atomic_read(&u->inflight) > 0) {
245 /* 330 list_move_tail(&u->link, &gc_inflight_list);
246 * Do we have file descriptors ? 331 u->gc_candidate = 0;
247 */ 332 scan_children(&u->sk, inc_inflight_move_tail, NULL);
248 if(UNIXCB(skb).fp)
249 {
250 /*
251 * Process the descriptors of this socket
252 */
253 int nfd=UNIXCB(skb).fp->count;
254 struct file **fp = UNIXCB(skb).fp->fp;
255 while(nfd--)
256 {
257 /*
258 * Get the socket the fd matches if
259 * it indeed does so
260 */
261 if((sk=unix_get_socket(*fp++))!=NULL)
262 {
263 maybe_unmark_and_push(sk);
264 }
265 }
266 }
267 /* We have to scan not-yet-accepted ones too */
268 if (x->sk_state == TCP_LISTEN)
269 maybe_unmark_and_push(skb->sk);
270 skb=skb->next;
271 } 333 }
272 spin_unlock(&x->sk_receive_queue.lock);
273 sock_put(x);
274 } 334 }
335 list_del(&cursor);
275 336
337 /*
338 * Now gc_candidates contains only garbage. Restore original
339 * inflight counters for these as well, and remove the skbuffs
340 * which are creating the cycle(s).
341 */
276 skb_queue_head_init(&hitlist); 342 skb_queue_head_init(&hitlist);
343 list_for_each_entry(u, &gc_candidates, link)
344 scan_children(&u->sk, inc_inflight, &hitlist);
277 345
278 forall_unix_sockets(i, s) 346 spin_unlock(&unix_gc_lock);
279 {
280 struct unix_sock *u = unix_sk(s);
281 347
282 if (u->gc_tree == GC_ORPHAN) { 348 /* Here we are. Hitlist is filled. Die. */
283 struct sk_buff *nextsk; 349 __skb_queue_purge(&hitlist);
284 350
285 spin_lock(&s->sk_receive_queue.lock); 351 spin_lock(&unix_gc_lock);
286 skb = skb_peek(&s->sk_receive_queue);
287 while (skb &&
288 skb != (struct sk_buff *)&s->sk_receive_queue) {
289 nextsk = skb->next;
290 /*
291 * Do we have file descriptors ?
292 */
293 if (UNIXCB(skb).fp) {
294 __skb_unlink(skb,
295 &s->sk_receive_queue);
296 __skb_queue_tail(&hitlist, skb);
297 }
298 skb = nextsk;
299 }
300 spin_unlock(&s->sk_receive_queue.lock);
301 }
302 u->gc_tree = GC_ORPHAN;
303 }
304 spin_unlock(&unix_table_lock);
305 352
306 /* 353 /* All candidates should have been detached by now. */
307 * Here we are. Hitlist is filled. Die. 354 BUG_ON(!list_empty(&gc_candidates));
308 */ 355 gc_in_progress = false;
309 356
310 __skb_queue_purge(&hitlist); 357 out:
311 mutex_unlock(&unix_gc_sem); 358 spin_unlock(&unix_gc_lock);
312} 359}